15 #ifndef LOADLMTASK_HPP 16 #define LOADLMTASK_HPP 26 #include <lm/config.hh> 27 #include <lm/enumerate_vocab.hh> 29 #include <lm/wrappers/nplm.hh> 42 template<
class KenLMModelT>
47 , lm::ngram::Config &kenlm_config)
49 , kenlm_config_(kenlm_config)
62 std::string
const file_;
65 class NplmVocabularyWrapper {
66 nplm::vocabulary
const &v_;
68 NplmVocabularyWrapper(nplm::vocabulary
const &v):v_(v) {};
70 return v_.lookup_word(s);
75 , lm::ngram::Config &kenlm_config)
84 boost::scoped_ptr<nplm::neuralLM> p(
new nplm::neuralLM(file_));
85 nplm::vocabulary
const &v=p->get_vocabulary();
86 std::vector<std::string>
const &words = v.words();
89 (kenlm_config_.enumerate_vocab);
90 for (
unsigned k = 0; k < words.size(); ++k) {
94 nplm::vocabulary
const &vo=p->get_output_vocabulary();
95 std::vector<std::string>
const &owords = vo.words();
96 for (
unsigned k = 0; k < owords.size(); ++k) {
103 return new lm::np::Model( file_);
109 , lm::ngram::Config kenlm_config
110 ,
unsigned offset = 0) {
111 using namespace lm::ngram;
112 typedef lm::np::Model NplmModel;
114 int kenmt = ucam::util::detectkenlm(file);
126 case QUANT_ARRAY_TRIE:
128 case util::KENLM_NPLM:
132 LERROR(
"Unsuported format: KENLM_NPLM. Did you compile NPLM library?");
143 template <
class Data>
145 typedef lm::base::Model KenLMModelT;
156 std::string previous_;
169 const std::string wordmapkey_;
170 bool isintegermapped_;
185 ,
const std::string& lmscale =
187 ,
const std::string& lmwp =
190 ,
bool forceone =
false 197 , isintegermapped_ (!rg.
exists (wordmapkey)
198 || rg.get<std::string> (wordmapkey) ==
"")
199 , wordmapkey_ (wordmapkey)
200 , lmfile_ ( rg.getVectorString ( lmload , 0 ) )
202 LDEBUG (
"LM loader using parameters " << lmload <<
"/" << lmscale <<
"/" << lmwp
203 <<
", and key " << lmkey_ <<
",index=" << index_ <<
",wordmap=" <<
205 FORCELINFO(
"Language model loader for " << lmfile_() );
206 setLanguageModelScale ( lmscale );
207 setLanguageModelWordPenalty ( lmwp );
210 LINFO (
"Appending Language model..." );
214 LWARN (
"Only one loaded for " << lmload <<
215 ". Extra language models are being ignored" );
218 LINFO (
"Finished constructor!" );
228 if ( lmfile_() ==
"" )
return false;
230 if ( built_ && previous_ == lmfile_ ( d.sidx ) )
return false;
232 FORCELINFO (
"loading LM=" << lmfile_ ( d.sidx ) );
233 d.stats->setTimeStart (
"lm-load-" + index_ );
234 lm::ngram::Config kenlm_config;
238 if (!isintegermapped_) {
239 LINFO (
"Using wordmap " << wordmapkey_);
240 LINFO (
"There are " << d.wm.size() <<
" wordmaps");
241 USER_CHECK (d.wm.find (wordmapkey_) != d.wm.end()
242 ,
"Language model provided over words instead of integers. A target wordmap is required! ");
243 wm = d.wm[wordmapkey_];
246 kenlm_config.enumerate_vocab = &hev;
247 kld_.
model =
loadKenLm(lmfile_(d.sidx).c_str(), kenlm_config, index_);
248 d.stats->setTimeEnd (
"lm-load-" + index_ );
249 previous_ = lmfile_ ( d.sidx );
251 if ( d.klm.find ( lmkey_ ) == d.klm.end() ) d.klm[lmkey_].resize ( index_ + 1 );
252 else if ( d.klm[lmkey_].size() < index_ + 1 ) d.klm[lmkey_].resize
254 d.klm[lmkey_][index_] = (
const KenLMData*) &kld_;
255 LDEBUG (
"LM " << lmfile_ ( d.sidx ) <<
" loaded, key=" << lmkey_ <<
256 ", position=" << ucam::util::toString<unsigned> ( d.klm[lmkey_].size() - 1 ) <<
257 ",total number of language models for this key is " << d.klm[lmkey_].size() );
263 if ( kld_.
model != NULL ) {
264 LINFO (
"Releasing language model resources..." );
304 isintegermapped_ (!rg.
exists (wordmapkey)
305 || rg.
get<std::string> (wordmapkey) ==
""),
306 wordmapkey_ (wordmapkey),
308 LDEBUG (
"LM loader using parameters " << lmload <<
"/" << lmscale <<
309 ", and key " << lmkey_ <<
",index=" << index_ <<
",wordmapkey=" <<
311 setLanguageModelScale ( lmscale );
312 setLanguageModelWordPenalty ( lmwp );
314 LDEBUG (
"Appending Language model..." );
316 lmwp , wordmapkey ) );
327 void setLanguageModelScale (
const std::string& lmscale ) {
329 if (!rg_.
exists (lmscale) ) {
330 FORCELINFO (
"Language model scale " << index_ <<
" defaulting to 1.0f" );
333 if (rg_.
get<std::string> (lmscale) ==
"" ) {
334 FORCELINFO (
"Language model scale " << index_ <<
" defaulting to 1.0f" );
338 kld_.
lmscale = ucam::util::toNumber<float> ( aux );
339 FORCELINFO (
"Language model scale " << index_ <<
"=" << aux );
351 void setLanguageModelWordPenalty (
const std::string& lmwp ) {
353 if (!rg_.
exists (lmwp) ) {
354 FORCELINFO (
"Language model word penalty " << index_ <<
355 " defaulting to 0.0f" );
358 if (rg_.
get<std::string> (lmwp) ==
"" ) {
359 FORCELINFO (
"Language model scale " << index_ <<
" defaulting to 0.0f" );
363 kld_.
lmwp = ucam::util::toNumber<float> ( aux );
364 FORCELINFO (
"Language model word penalty " << index_ <<
"=" << aux );
Class that expands a wildcard into its actual value. This is useful, e.g., for filenames ranging over several...
std::vector< std::string > getVectorString(const std::string &key) const
Convenience method that returns a vector of strings taking "," as the separator character.
T get(const std::string &key) const
Returns parsed value associated to key.
virtual void Add(WordIndex index, const StringPiece &str)
Extend EnumerateVocab to access kenlm ids.
This class extends EnumerateVocab in kenlm code. This class creates a grammar-integer to lm-integer h...
Templated (hybrid) Interface for Task classes.
Maps between grammar target ids and LM ids.
std::string const kLmFeatureweights
bool exists(const std::string &source, const std::string &needle)
Convenience function to find out whether a needle exists in a text.
bool exists(const std::string &key) const
Determines whether a program option (key) has been defined by the user.
#define USER_CHECK(exp, comment)
Tests whether exp is true. If not, comment is printed and program ends.
Efficiently loads a wordmap file and provides methods to map word-to-integer or integer-to-word. To keep the memory footprint small, the wordmap entries are not hashed.
std::string const kLmWordmap
std::string const kLmLoad
std::string const kLmWordPenalty
void AddOutput(WordIndex index, const std::string &s)