Cambridge SMT System
task.loadlm.hpp
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use these files except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Copyright 2012 - Gonzalo Iglesias, Adrià de Gispert, William Byrne

#ifndef LOADLMTASK_HPP
#define LOADLMTASK_HPP
#include <kenlmdetect.hpp>
#include <lm/config.hh>
#include <lm/enumerate_vocab.hh>
#ifdef WITH_NPLM
#include <lm/wrappers/nplm.hh>
#include <neuralLM.h> // v0.1
#endif
#include <idbridge.hpp>

namespace ucam {
namespace fsttools {
// The wrapped external language model implementations do not necessarily
// share the same constructor. This struct handles the general case;
// template specializations handle the exceptions.
template<class KenLMModelT>
struct KenLMModelHelper {
  std::string const file_;
  lm::ngram::Config &kenlm_config_;

  KenLMModelHelper ( std::string const &file
                     , lm::ngram::Config &kenlm_config )
    : file_ ( file )
    , kenlm_config_ ( kenlm_config )
  {}

  KenLMModelT *operator() () {
    return new KenLMModelT ( file_.c_str() , kenlm_config_ );
  }

};

#ifdef WITH_NPLM
// Specialization for NPLM:
template<>
struct KenLMModelHelper<lm::np::Model> {
  std::string const file_;
  lm::ngram::Config &kenlm_config_;

  class NplmVocabularyWrapper {
    nplm::vocabulary const &v_;
   public:
    NplmVocabularyWrapper ( nplm::vocabulary const &v ) : v_ ( v ) {};
    unsigned operator() ( std::string const &s ) {
      return v_.lookup_word ( s );
    }
  };

  KenLMModelHelper ( std::string const &file
                     , lm::ngram::Config &kenlm_config )
    : file_ ( file )
    , kenlm_config_ ( kenlm_config )
  {}

  lm::np::Model *operator() () {
    // \todo Deal with this kenLM+nplm mismatch in a more reasonable manner.
    // The current nplm wrapper does not accept the kenlm_config, so we take a
    // roundabout route: load the model once with nplm itself and push its
    // vocabulary through the enumerate_vocab callback before moving on.
    boost::scoped_ptr<nplm::neuralLM> p ( new nplm::neuralLM ( file_ ) );
    nplm::vocabulary const &v = p->get_vocabulary();
    std::vector<std::string> const &words = v.words();
    // Recover the enumerate_vocab callback installed by the caller; the exact
    // class name used below is an assumption (the EnumerateVocab extension
    // declared in idbridge.hpp).
    lm::HifstEnumerateVocab *hev = static_cast<lm::HifstEnumerateVocab *>
        ( kenlm_config_.enumerate_vocab );
    for ( unsigned k = 0; k < words.size(); ++k ) {
      hev->Add ( k, words[k] );
    }
    // The input and output vocabularies of an NPLM model need not coincide,
    // so enumerate the output vocabulary separately.
    nplm::vocabulary const &vo = p->get_output_vocabulary();
    std::vector<std::string> const &owords = vo.words();
    for ( unsigned k = 0; k < owords.size(); ++k ) {
      hev->AddOutput ( k, owords[k] );
    }
    // Finally, return a new model as usual.
    return new lm::np::Model ( file_ );
  }
};
#endif

lm::base::Model *loadKenLm ( std::string const &file
                             , lm::ngram::Config kenlm_config
                             , unsigned offset = 0 ) {
  using namespace lm::ngram;
#ifdef WITH_NPLM
  typedef lm::np::Model NplmModel;
#endif
  // Detect the kenlm binary type:
  int kenmt = ucam::util::detectkenlm ( file );
  switch ( kenmt ) {
  case PROBING:
    return KenLMModelHelper<ProbingModel> ( file, kenlm_config ) ();
  case REST_PROBING:
    return KenLMModelHelper<RestProbingModel> ( file, kenlm_config ) ();
  case TRIE:
    return KenLMModelHelper<TrieModel> ( file, kenlm_config ) ();
  case QUANT_TRIE:
    return KenLMModelHelper<QuantTrieModel> ( file, kenlm_config ) ();
  case ARRAY_TRIE:
    return KenLMModelHelper<ArrayTrieModel> ( file, kenlm_config ) ();
  case QUANT_ARRAY_TRIE:
    return KenLMModelHelper<QuantArrayTrieModel> ( file, kenlm_config ) ();
  case util::KENLM_NPLM:
#ifdef WITH_NPLM
    return KenLMModelHelper<NplmModel> ( file, kenlm_config ) ();
#endif
    LERROR ( "Unsupported format: KENLM_NPLM. Did you compile the NPLM library?" );
    exit ( EXIT_FAILURE );
  }
  return NULL;
}
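// Illustrative sketch, not part of the original header: whichever concrete
// model loadKenLm detected, the pointer it returns can be driven through
// kenlm's virtual interface (lm/virtual_interface.hh, pulled in by the model
// headers). The function name scoreSentence is chosen here for the example.
inline float scoreSentence ( lm::base::Model const &m
                             , std::vector<std::string> const &words ) {
  lm::base::Vocabulary const &vcb = m.BaseVocabulary();
  std::vector<char> in ( m.StateSize() ), out ( m.StateSize() );
  m.BeginSentenceWrite ( &in[0] );               // start from the <s> state
  float total = 0;
  for ( unsigned k = 0; k < words.size(); ++k ) {
    total += m.BaseScore ( &in[0], vcb.Index ( words[k] ), &out[0] );
    in.swap ( out );
  }
  total += m.BaseScore ( &in[0], vcb.EndSentence(), &out[0] ); // close with </s>
  return total;                                  // log10 probability
}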

/**
 * \brief Language model loader task: loads a language model and wraps it in a
 * data structure for use by downstream tasks.
 */
template <class Data>
class LoadLanguageModelTask: public ucam::util::TaskInterface<Data> {
  typedef lm::base::Model KenLMModelT;

 private:

  // True once a language model has been loaded.
  bool built_;

  // Language model file name, possibly containing a wildcard expanded with the
  // sentence index (member type assumed here: the wildcard-expanding pattern
  // address from ucam::util).
  ucam::util::IntegerPatternAddress lmfile_;

  // Previously loaded file, kept to avoid reloading the same model.
  std::string previous_;
  uint lmo_;

  // Language model data: model pointer, scale and word penalty.
  KenLMData kld_;

  // Index of this language model within the (possibly chained) loaders.
  uint index_;
  // Registry key under which the language model(s) are stored.
  std::string lmkey_;

  const ucam::util::RegistryPO& rg_;

  const std::string wordmapkey_;
  bool isintegermapped_;

 public:

  /**
   * \brief Public constructor. If the user wants to load several language
   * models (e.g. --lm.load=lm1,lm2,lm3,lm4 and --lm.scale=0.25,0.25,0.25),
   * the second and following instances of LoadLanguageModelTask are created
   * with the private constructor (see below), which carries an index to the
   * actual language model that must be loaded. For the public constructor the
   * index is 0.
   */
  LoadLanguageModelTask ( const ucam::util::RegistryPO& rg
                          , const std::string& lmload = HifstConstants::kLmLoad
                          , const std::string& lmscale =
                            HifstConstants::kLmFeatureweights // if rg.get(lmscale)=="", the scale will default to 1
                          , const std::string& lmwp =
                            HifstConstants::kLmWordPenalty // if rg.get(lmwp)=="", the word penalty will default to 0
                          , const std::string& wordmapkey = HifstConstants::kLmWordmap
                          , bool forceone = false
                        )
    : rg_ ( rg )
    , lmkey_ ( lmload )
    , previous_ ( "" )
    , built_ ( false )
    , index_ ( 0 )
    , isintegermapped_ ( !rg.exists ( wordmapkey )
                         || rg.get<std::string> ( wordmapkey ) == "" )
    , wordmapkey_ ( wordmapkey )
    , lmfile_ ( rg.getVectorString ( lmload , 0 ) )
  {
    LDEBUG ( "LM loader using parameters " << lmload << "/" << lmscale << "/" << lmwp
             << ", and key " << lmkey_ << ", index=" << index_ << ", wordmap=" << wordmapkey_ );
    FORCELINFO ( "Language model loader for " << lmfile_() );
    setLanguageModelScale ( lmscale );
    setLanguageModelWordPenalty ( lmwp );
    if ( rg_.getVectorString ( lmload ).size() > 1 ) {
      if ( !forceone ) {
        LINFO ( "Appending Language model..." );
        this->appendTask ( new LoadLanguageModelTask ( rg_, 1, lmload, lmscale, lmwp,
                                                       wordmapkey ) );
      } else {
        LWARN ( "Only one language model loaded for " << lmload <<
                ". Extra language models are being ignored" );
      }
    }
    LINFO ( "Finished constructor!" );
  };

  /**
   * \brief Method inherited from TaskInterface. Loads the language model and
   * stores it in the language model data structure.
   */
  bool run ( Data& d ) {
    LDEBUG ( "run!" );
    if ( lmfile_() == "" ) return false;
    // No need to build again if this file has already been loaded...
    if ( built_ && previous_ == lmfile_ ( d.sidx ) ) return false;
    close();
    FORCELINFO ( "loading LM=" << lmfile_ ( d.sidx ) );
    d.stats->setTimeStart ( "lm-load-" + ucam::util::toString<unsigned> ( index_ ) );
    lm::ngram::Config kenlm_config;
    // If the lm is not integer-mapped, then we will need a proper grammar
    // target wordmap. Make sure we have it.
    ucam::util::WordMapper *wm = NULL;
    if ( !isintegermapped_ ) {
      LINFO ( "Using wordmap " << wordmapkey_ );
      LINFO ( "There are " << d.wm.size() << " wordmaps" );
      USER_CHECK ( d.wm.find ( wordmapkey_ ) != d.wm.end()
                   , "Language model provided over words instead of integers. A target wordmap is required!" );
      wm = d.wm[wordmapkey_];
    }
    // Vocabulary enumerator that bridges grammar ids and kenlm ids while the
    // model loads. The type and constructor arguments used below are assumed
    // (the EnumerateVocab extension from idbridge.hpp, fed with this LM's id
    // bridge and the optional wordmap).
    lm::HifstEnumerateVocab hev ( kld_.idb, wm );
    kenlm_config.enumerate_vocab = &hev;
    kld_.model = loadKenLm ( lmfile_ ( d.sidx ).c_str(), kenlm_config, index_ );
    d.stats->setTimeEnd ( "lm-load-" + ucam::util::toString<unsigned> ( index_ ) );
    previous_ = lmfile_ ( d.sidx );
    built_ = true;
    if ( d.klm.find ( lmkey_ ) == d.klm.end() ) d.klm[lmkey_].resize ( index_ + 1 );
    else if ( d.klm[lmkey_].size() < index_ + 1 ) d.klm[lmkey_].resize ( index_ + 1 );
    d.klm[lmkey_][index_] = (const KenLMData*) &kld_;
    LDEBUG ( "LM " << lmfile_ ( d.sidx ) << " loaded, key=" << lmkey_ <<
             ", position=" << ucam::util::toString<unsigned> ( d.klm[lmkey_].size() - 1 ) <<
             ", total number of language models for this key is " << d.klm[lmkey_].size() );
    return false;
  };

  /// Frees language model resources. Returns true if a model was released,
  /// false otherwise.
  bool close() {
    if ( kld_.model != NULL ) {
      LINFO ( "Releasing language model resources..." );
      delete kld_.model;
      kld_.model = NULL;
      built_ = false;
      return true;
    }
    return false;
  }

  /// Destructor: releases any loaded language model.
  ~LoadLanguageModelTask ( ) {
    close();
  }

 private:

  /**
   * \brief Private constructor, used (via appendTask) to load the index-th
   * language model of a comma-separated list of language models. See the
   * public constructor above.
   */
  LoadLanguageModelTask ( const ucam::util::RegistryPO& rg ,
                          uint index ,
                          const std::string& lmload = HifstConstants::kLmLoad,
                          const std::string& lmscale = HifstConstants::kLmFeatureweights ,
                          const std::string& lmwp = HifstConstants::kLmWordPenalty,
                          const std::string& wordmapkey = HifstConstants::kLmWordmap
                        ) :
    rg_ ( rg ),
    lmkey_ ( lmload ),
    previous_ ( "" ),
    built_ ( false ),
    index_ ( index ),
    isintegermapped_ ( !rg.exists ( wordmapkey )
                       || rg.get<std::string> ( wordmapkey ) == "" ),
    wordmapkey_ ( wordmapkey ),
    lmfile_ ( rg.getVectorString ( lmload , index ) ) {
    LDEBUG ( "LM loader using parameters " << lmload << "/" << lmscale <<
             ", and key " << lmkey_ << ", index=" << index_ << ", wordmapkey=" <<
             wordmapkey_ );
    setLanguageModelScale ( lmscale );
    setLanguageModelWordPenalty ( lmwp );
    if ( rg.getVectorString ( lmload ).size() > index_ + 1 ) {
      LDEBUG ( "Appending Language model..." );
      this->appendTask ( new LoadLanguageModelTask ( rg, index_ + 1, lmload, lmscale,
                                                     lmwp, wordmapkey ) );
    }
    LDEBUG ( "." );
  };

  /**
   * \brief Reads the scale for the index_-th language model from the registry
   * (key lmscale). If the key is unset or set to "", the scale defaults to 1.0f.
   */
  void setLanguageModelScale ( const std::string& lmscale ) {
    kld_.lmscale = 1.0f;
    if ( !rg_.exists ( lmscale ) ) {
      FORCELINFO ( "Language model scale " << index_ << " defaulting to 1.0f" );
      return;
    }
    if ( rg_.get<std::string> ( lmscale ) == "" ) {
      FORCELINFO ( "Language model scale " << index_ << " defaulting to 1.0f" );
      return;
    }
    std::string aux = rg_.getVectorString ( lmscale, index_ );
    kld_.lmscale = ucam::util::toNumber<float> ( aux );
    FORCELINFO ( "Language model scale " << index_ << "=" << aux );
  }

  /**
   * \brief Reads the word penalty for the index_-th language model from the
   * registry (key lmwp). If the key is unset or set to "", the word penalty
   * defaults to 0.0f.
   */
  void setLanguageModelWordPenalty ( const std::string& lmwp ) {
    kld_.lmwp = 0.0f;
    if ( !rg_.exists ( lmwp ) ) {
      FORCELINFO ( "Language model word penalty " << index_ <<
                   " defaulting to 0.0f" );
      return;
    }
    if ( rg_.get<std::string> ( lmwp ) == "" ) {
      FORCELINFO ( "Language model word penalty " << index_ << " defaulting to 0.0f" );
      return;
    }
    std::string aux = rg_.getVectorString ( lmwp, index_ );
    kld_.lmwp = ucam::util::toNumber<float> ( aux );
    FORCELINFO ( "Language model word penalty " << index_ << "=" << aux );
  }
};
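
// Illustrative sketch, not part of the original header: how a downstream task
// can read back what LoadLanguageModelTask::run stored. The layout of the Data
// member d.klm (a map from the --lm.load key to a vector of KenLMData
// pointers) is assumed from the assignments in run() above; the function name
// listLoadedLms is chosen here for the example.
template<class Data>
void listLoadedLms ( Data& d, std::string const &key = HifstConstants::kLmLoad ) {
  if ( d.klm.find ( key ) == d.klm.end() ) return;   // no LM registered under this key
  std::vector<const KenLMData *> &lms = d.klm[key];
  for ( unsigned k = 0; k < lms.size(); ++k ) {
    FORCELINFO ( "LM " << k << ": scale=" << lms[k]->lmscale
                 << ", word penalty=" << lms[k]->lmwp
                 << ", order=" << (unsigned) lms[k]->model->Order() );
  }
}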

}
} // end namespaces
#endif
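
A short usage sketch follows (assumptions flagged inline): the include path registrypo.hpp, the option spellings --lm.load/--lm.scale taken from the constructor documentation above, and the names makeLmLoaderChain and DataT are illustrative placeholders, not part of the toolkit's documented API.

    #include <registrypo.hpp>      // ucam::util::RegistryPO (assumed header path)
    #include <task.loadlm.hpp>

    // With options such as
    //   --lm.load=tgt.lm.1.mmap,tgt.lm.2.mmap --lm.scale=0.5,0.3
    // the public constructor builds the loader for index 0 and appends a second
    // instance (private constructor, index 1) via appendTask, so running the
    // returned task chain loads both models, each with its own scale and word
    // penalty, into d.klm under the --lm.load key.
    template<class DataT>
    ucam::fsttools::LoadLanguageModelTask<DataT> *makeLmLoaderChain
        ( ucam::util::RegistryPO const &rg ) {
      return new ucam::fsttools::LoadLanguageModelTask<DataT> ( rg );
    }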