Cambridge SMT System
task.loadlm.hpp
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use these files except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Copyright 2012 - Gonzalo Iglesias, Adrià de Gispert, William Byrne

#ifndef LOADLMTASK_HPP
#define LOADLMTASK_HPP
#include <kenlmdetect.hpp>
#include <lm/config.hh>
#include <lm/enumerate_vocab.hh>
#ifdef WITH_NPLM
#include <lm/wrappers/nplm.hh>
#include <neuralLM.h> // v0.1
#endif
#include <idbridge.hpp>

namespace ucam {
namespace fsttools {
// The wrapped external language model implementations do not necessarily
// share the same constructor. This struct handles the general case;
// template specializations handle the exceptions.
template<class KenLMModelT>
struct KenLMModelHelper {
  std::string const file_;
  lm::ngram::Config &kenlm_config_;

  KenLMModelHelper ( std::string const &file
                     , lm::ngram::Config &kenlm_config )
    : file_ ( file )
    , kenlm_config_ ( kenlm_config )
  {}

  KenLMModelT *operator() () {
    return new KenLMModelT ( file_.c_str() , kenlm_config_ );
  }

};

#ifdef WITH_NPLM
// Specialization for NPLM:
template<>
struct KenLMModelHelper<lm::np::Model> {
  std::string const file_;
  lm::ngram::Config &kenlm_config_;

  class NplmVocabularyWrapper {
    nplm::vocabulary const &v_;
   public:
    NplmVocabularyWrapper ( nplm::vocabulary const &v ) : v_ ( v ) {};
    unsigned operator() ( std::string const &s ) {
      return v_.lookup_word ( s );
    }
  };

  KenLMModelHelper ( std::string const &file
                     , lm::ngram::Config &kenlm_config )
    : file_ ( file )
    , kenlm_config_ ( kenlm_config )
  {}

  lm::np::Model *operator() () {
    // \todo Deal with this kenLM+nplm mismatch in a more reasonable manner.
    // The current nplm wrapper does not accept the kenlm_config, so we take a
    // roundabout route: load the model once with nplm itself and push its
    // vocabulary through the enumerate_vocab callback before moving on.
    boost::scoped_ptr<nplm::neuralLM> p ( new nplm::neuralLM ( file_ ) );
    nplm::vocabulary const &v = p->get_vocabulary();
    std::vector<std::string> const &words = v.words();
    // Recover the enumerate_vocab callback installed by the caller; the exact
    // class name used below is an assumption (the EnumerateVocab extension
    // declared in idbridge.hpp).
    lm::HifstEnumerateVocab *hev = static_cast<lm::HifstEnumerateVocab *>
        ( kenlm_config_.enumerate_vocab );
    for ( unsigned k = 0; k < words.size(); ++k ) {
      hev->Add ( k, words[k] );
    }
    // The input and output vocabularies of an NPLM model need not coincide,
    // so enumerate the output vocabulary separately.
    nplm::vocabulary const &vo = p->get_output_vocabulary();
    std::vector<std::string> const &owords = vo.words();
    for ( unsigned k = 0; k < owords.size(); ++k ) {
      hev->AddOutput ( k, owords[k] );
    }
    // Finally, return a new model as usual.
    return new lm::np::Model ( file_ );
  }
};
#endif

lm::base::Model *loadKenLm ( std::string const &file
                             , lm::ngram::Config kenlm_config
                             , unsigned offset = 0 ) {
  using namespace lm::ngram;
#ifdef WITH_NPLM
  typedef lm::np::Model NplmModel;
#endif
  // Detect the kenlm binary type:
  int kenmt = ucam::util::detectkenlm ( file );
  switch ( kenmt ) {
  case PROBING:
    return KenLMModelHelper<ProbingModel> ( file, kenlm_config ) ();
  case REST_PROBING:
    return KenLMModelHelper<RestProbingModel> ( file, kenlm_config ) ();
  case TRIE:
    return KenLMModelHelper<TrieModel> ( file, kenlm_config ) ();
  case QUANT_TRIE:
    return KenLMModelHelper<QuantTrieModel> ( file, kenlm_config ) ();
  case ARRAY_TRIE:
    return KenLMModelHelper<ArrayTrieModel> ( file, kenlm_config ) ();
  case QUANT_ARRAY_TRIE:
    return KenLMModelHelper<QuantArrayTrieModel> ( file, kenlm_config ) ();
  case util::KENLM_NPLM:
#ifdef WITH_NPLM
    return KenLMModelHelper<NplmModel> ( file, kenlm_config ) ();
#endif
    LERROR ( "Unsupported format: KENLM_NPLM. Did you compile the NPLM library?" );
    exit ( EXIT_FAILURE );
  }
  return NULL;
}
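// Illustrative sketch, not part of the original header: whichever concrete
// model loadKenLm detected, the pointer it returns can be driven through
// kenlm's virtual interface (lm/virtual_interface.hh, pulled in by the model
// headers). The function name scoreSentence is chosen here for the example.
inline float scoreSentence ( lm::base::Model const &m
                             , std::vector<std::string> const &words ) {
  lm::base::Vocabulary const &vcb = m.BaseVocabulary();
  std::vector<char> in ( m.StateSize() ), out ( m.StateSize() );
  m.BeginSentenceWrite ( &in[0] );               // start from the <s> state
  float total = 0;
  for ( unsigned k = 0; k < words.size(); ++k ) {
    total += m.BaseScore ( &in[0], vcb.Index ( words[k] ), &out[0] );
    in.swap ( out );
  }
  total += m.BaseScore ( &in[0], vcb.EndSentence(), &out[0] ); // close with </s>
  return total;                                  // log10 probability
}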

/**
 * \brief Language model loader task: loads a language model and wraps it in a
 * data structure for use by downstream tasks.
 */
template <class Data>
class LoadLanguageModelTask: public ucam::util::TaskInterface<Data> {
  typedef lm::base::Model KenLMModelT;

 private:

  // True once a language model has been loaded.
  bool built_;

  // Language model file name, possibly containing a wildcard expanded with the
  // sentence index (member type assumed here: the wildcard-expanding pattern
  // address from ucam::util).
  ucam::util::IntegerPatternAddress lmfile_;

  // Previously loaded file, kept to avoid reloading the same model.
  std::string previous_;
  uint lmo_;

  // Language model data: model pointer, scale and word penalty.
  KenLMData kld_;

  // Index of this language model within the (possibly chained) loaders.
  uint index_;
  // Registry key under which the language model(s) are stored.
  std::string lmkey_;

  const ucam::util::RegistryPO& rg_;

  const std::string wordmapkey_;
  bool isintegermapped_;

 public:

  /**
   * \brief Public constructor. If the user wants to load several language
   * models (e.g. --lm.load=lm1,lm2,lm3,lm4 and --lm.scale=0.25,0.25,0.25),
   * the second and following instances of LoadLanguageModelTask are created
   * with the private constructor (see below), which carries an index to the
   * actual language model that must be loaded. For the public constructor the
   * index is 0.
   */
  LoadLanguageModelTask ( const ucam::util::RegistryPO& rg
                          , const std::string& lmload = HifstConstants::kLmLoad
                          , const std::string& lmscale =
                            HifstConstants::kLmFeatureweights // if rg.get(lmscale)=="", the scale will default to 1
                          , const std::string& lmwp =
                            HifstConstants::kLmWordPenalty // if rg.get(lmwp)=="", the word penalty will default to 0
                          , const std::string& wordmapkey = HifstConstants::kLmWordmap
                          , bool forceone = false
                        )
    : rg_ ( rg )
    , lmkey_ ( lmload )
    , previous_ ( "" )
    , built_ ( false )
    , index_ ( 0 )
    , isintegermapped_ ( !rg.exists ( wordmapkey )
                         || rg.get<std::string> ( wordmapkey ) == "" )
    , wordmapkey_ ( wordmapkey )
    , lmfile_ ( rg.getVectorString ( lmload , 0 ) )
  {
    LDEBUG ( "LM loader using parameters " << lmload << "/" << lmscale << "/" << lmwp
             << ", and key " << lmkey_ << ", index=" << index_ << ", wordmap=" << wordmapkey_ );
    FORCELINFO ( "Language model loader for " << lmfile_() );
    setLanguageModelScale ( lmscale );
    setLanguageModelWordPenalty ( lmwp );
    if ( rg_.getVectorString ( lmload ).size() > 1 ) {
      if ( !forceone ) {
        LINFO ( "Appending Language model..." );
        this->appendTask ( new LoadLanguageModelTask ( rg_, 1, lmload, lmscale, lmwp,
                                                       wordmapkey ) );
      } else {
        LWARN ( "Only one language model loaded for " << lmload <<
                ". Extra language models are being ignored" );
      }
    }
    LINFO ( "Finished constructor!" );
  };

  /**
   * \brief Method inherited from TaskInterface. Loads the language model and
   * stores it in the language model data structure.
   */
  bool run ( Data& d ) {
    LDEBUG ( "run!" );
    if ( lmfile_() == "" ) return false;
    // No need to build again if this file has already been loaded...
    if ( built_ && previous_ == lmfile_ ( d.sidx ) ) return false;
    close();
    FORCELINFO ( "loading LM=" << lmfile_ ( d.sidx ) );
    d.stats->setTimeStart ( "lm-load-" + ucam::util::toString<unsigned> ( index_ ) );
    lm::ngram::Config kenlm_config;
    // If the lm is not integer-mapped, then we will need a proper grammar
    // target wordmap. Make sure we have it.
    ucam::util::WordMapper *wm = NULL;
    if ( !isintegermapped_ ) {
      LINFO ( "Using wordmap " << wordmapkey_ );
      LINFO ( "There are " << d.wm.size() << " wordmaps" );
      USER_CHECK ( d.wm.find ( wordmapkey_ ) != d.wm.end()
                   , "Language model provided over words instead of integers. A target wordmap is required!" );
      wm = d.wm[wordmapkey_];
    }
    // Vocabulary enumerator that bridges grammar ids and kenlm ids while the
    // model loads. The type and constructor arguments used below are assumed
    // (the EnumerateVocab extension from idbridge.hpp, fed with this LM's id
    // bridge and the optional wordmap).
    lm::HifstEnumerateVocab hev ( kld_.idb, wm );
    kenlm_config.enumerate_vocab = &hev;
    kld_.model = loadKenLm ( lmfile_ ( d.sidx ).c_str(), kenlm_config, index_ );
    d.stats->setTimeEnd ( "lm-load-" + ucam::util::toString<unsigned> ( index_ ) );
    previous_ = lmfile_ ( d.sidx );
    built_ = true;
    if ( d.klm.find ( lmkey_ ) == d.klm.end() ) d.klm[lmkey_].resize ( index_ + 1 );
    else if ( d.klm[lmkey_].size() < index_ + 1 ) d.klm[lmkey_].resize ( index_ + 1 );
    d.klm[lmkey_][index_] = (const KenLMData*) &kld_;
    LDEBUG ( "LM " << lmfile_ ( d.sidx ) << " loaded, key=" << lmkey_ <<
             ", position=" << ucam::util::toString<unsigned> ( d.klm[lmkey_].size() - 1 ) <<
             ", total number of language models for this key is " << d.klm[lmkey_].size() );
    return false;
  };

  /// Frees language model resources. Returns true if a model was released,
  /// false otherwise.
  bool close() {
    if ( kld_.model != NULL ) {
      LINFO ( "Releasing language model resources..." );
      delete kld_.model;
      kld_.model = NULL;
      built_ = false;
      return true;
    }
    return false;
  }

  /// Destructor: releases any loaded language model.
  ~LoadLanguageModelTask ( ) {
    close();
  }

 private:

  /**
   * \brief Private constructor, used (via appendTask) to load the index-th
   * language model of a comma-separated list of language models. See the
   * public constructor above.
   */
  LoadLanguageModelTask ( const ucam::util::RegistryPO& rg ,
                          uint index ,
                          const std::string& lmload = HifstConstants::kLmLoad,
                          const std::string& lmscale = HifstConstants::kLmFeatureweights ,
                          const std::string& lmwp = HifstConstants::kLmWordPenalty,
                          const std::string& wordmapkey = HifstConstants::kLmWordmap
                        ) :
    rg_ ( rg ),
    lmkey_ ( lmload ),
    previous_ ( "" ),
    built_ ( false ),
    index_ ( index ),
    isintegermapped_ ( !rg.exists ( wordmapkey )
                       || rg.get<std::string> ( wordmapkey ) == "" ),
    wordmapkey_ ( wordmapkey ),
    lmfile_ ( rg.getVectorString ( lmload , index ) ) {
    LDEBUG ( "LM loader using parameters " << lmload << "/" << lmscale <<
             ", and key " << lmkey_ << ", index=" << index_ << ", wordmapkey=" <<
             wordmapkey_ );
    setLanguageModelScale ( lmscale );
    setLanguageModelWordPenalty ( lmwp );
    if ( rg.getVectorString ( lmload ).size() > index_ + 1 ) {
      LDEBUG ( "Appending Language model..." );
      this->appendTask ( new LoadLanguageModelTask ( rg, index_ + 1, lmload, lmscale,
                                                     lmwp, wordmapkey ) );
    }
    LDEBUG ( "." );
  };

  /**
   * \brief Reads the scale for the index_-th language model from the registry
   * (key lmscale). If the key is unset or set to "", the scale defaults to 1.0f.
   */
  void setLanguageModelScale ( const std::string& lmscale ) {
    kld_.lmscale = 1.0f;
    if ( !rg_.exists ( lmscale ) ) {
      FORCELINFO ( "Language model scale " << index_ << " defaulting to 1.0f" );
      return;
    }
    if ( rg_.get<std::string> ( lmscale ) == "" ) {
      FORCELINFO ( "Language model scale " << index_ << " defaulting to 1.0f" );
      return;
    }
    std::string aux = rg_.getVectorString ( lmscale, index_ );
    kld_.lmscale = ucam::util::toNumber<float> ( aux );
    FORCELINFO ( "Language model scale " << index_ << "=" << aux );
  }

  /**
   * \brief Reads the word penalty for the index_-th language model from the
   * registry (key lmwp). If the key is unset or set to "", the word penalty
   * defaults to 0.0f.
   */
  void setLanguageModelWordPenalty ( const std::string& lmwp ) {
    kld_.lmwp = 0.0f;
    if ( !rg_.exists ( lmwp ) ) {
      FORCELINFO ( "Language model word penalty " << index_ <<
                   " defaulting to 0.0f" );
      return;
    }
    if ( rg_.get<std::string> ( lmwp ) == "" ) {
      FORCELINFO ( "Language model word penalty " << index_ << " defaulting to 0.0f" );
      return;
    }
    std::string aux = rg_.getVectorString ( lmwp, index_ );
    kld_.lmwp = ucam::util::toNumber<float> ( aux );
    FORCELINFO ( "Language model word penalty " << index_ << "=" << aux );
  }
};
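
// Illustrative sketch, not part of the original header: how a downstream task
// can read back what LoadLanguageModelTask::run stored. The layout of the Data
// member d.klm (a map from the --lm.load key to a vector of KenLMData
// pointers) is assumed from the assignments in run() above; the function name
// listLoadedLms is chosen here for the example.
template<class Data>
void listLoadedLms ( Data& d, std::string const &key = HifstConstants::kLmLoad ) {
  if ( d.klm.find ( key ) == d.klm.end() ) return;   // no LM registered under this key
  std::vector<const KenLMData *> &lms = d.klm[key];
  for ( unsigned k = 0; k < lms.size(); ++k ) {
    FORCELINFO ( "LM " << k << ": scale=" << lms[k]->lmscale
                 << ", word penalty=" << lms[k]->lmwp
                 << ", order=" << (unsigned) lms[k]->model->Order() );
  }
}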

}
} // end namespaces
#endif
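
A short usage sketch follows (assumptions flagged inline): the include path registrypo.hpp, the option spellings --lm.load/--lm.scale taken from the constructor documentation above, and the names makeLmLoaderChain and DataT are illustrative placeholders, not part of the toolkit's documented API.

    #include <registrypo.hpp>      // ucam::util::RegistryPO (assumed header path)
    #include <task.loadlm.hpp>

    // With options such as
    //   --lm.load=tgt.lm.1.mmap,tgt.lm.2.mmap --lm.scale=0.5,0.3
    // the public constructor builds the loader for index 0 and appends a second
    // instance (private constructor, index 1) via appendTask, so running the
    // returned task chain loads both models, each with its own scale and word
    // penalty, into d.klm under the --lm.load key.
    template<class DataT>
    ucam::fsttools::LoadLanguageModelTask<DataT> *makeLmLoaderChain
        ( ucam::util::RegistryPO const &rg ) {
      return new ucam::fsttools::LoadLanguageModelTask<DataT> ( rg );
    }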