Cambridge SMT System
main-run.applylm.hpp
Go to the documentation of this file.
1 // Licensed under the Apache License, Version 2.0 (the "License");
2 // you may not use these files except in compliance with the License.
3 // You may obtain a copy of the License at
4 //
5 // http://www.apache.org/licenses/LICENSE-2.0
6 //
7 // Unless required by applicable law or agreed to in writing, software
8 // distributed under the License is distributed on an "AS IS" BASIS,
9 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 // See the License for the specific language governing permissions and
11 // limitations under the License.
12 
13 // Copyright 2012 - Gonzalo Iglesias, Adrià de Gispert, William Byrne
14 
15 #ifndef MAIN_RUN_APPLYLM_HPP
16 #define MAIN_RUN_APPLYLM_HPP
17 
18 #include <szfstream.hpp>
19 
28 namespace ucam {
29 namespace fsttools {
30 
31 namespace HC = HifstConstants;
32 
33 // Create a vector of source windows given an input sentence.
34 // Assume odd sizes (1,3,...)
35 inline void createSourceWindows(std::string const &integerMappedSentence
36  , unsigned srcSize
37  , std::vector< std::vector<unsigned> > &srcw) {
38 
39  LINFO("Creating source window...");
40  if (srcSize && !(srcSize % 2 )) {
41  LERROR("Only 0 or odd source sizes (1,3,...)");
42  exit(EXIT_FAILURE);
43  }
44  if (!srcSize) return;
45  using namespace boost::algorithm;
46  std::deque<std::string> words;
47  split(words, integerMappedSentence,is_any_of(" "));
48  srcw.clear();
49  srcw.resize(words.size());
50  // assume that we already have <s> and </s> (i.e. 1,2).
51  for (unsigned k = 0; k < srcSize/2; ++k) {
52  words.push_front("1");
53  words.push_back("2");
54  }
55 
57  for (unsigned k = 0; k < words.size() - srcSize + 1; ++k) {
58  srcw[k].push_back(toNumber<unsigned>(words[k]));
59  std::vector<unsigned> &aux = srcw[k];
60  for (unsigned j = 1; j < srcSize; ++j) {
61  aux.push_back(toNumber<unsigned>(words[ k + j ]));
62  }
63  }
64 #ifdef PRINTDEBUG1
65  // Print original src words here:
66  for (unsigned k = 0; k < srcw.size(); ++k) {
67  std::stringstream ss; ss << srcw[k][0];
68  for (unsigned j = 1; j < srcw[k].size(); ++j ){
69  ss << " " << srcw[k][j];
70  }
71  LDEBUG("*** ORIGINAL src words=" << ss.str());
72  }
73 #endif
74 };
75 
// Factory for the language-model application task that is appended to a task
// chain. Returns a heap-allocated ApplyBiLMTask when bilm is true, otherwise a
// heap-allocated ApplyLanguageModelTask. Both are constructed from the
// registry rg and the kLmLoad / kLatticeLoad option keys.
// The result is a raw pointer; the callers visible in this file pass it
// directly to appendTask.
// NOTE(review): the declarator line and the trailing constructor arguments are
// not visible in this listing -- confirm against the original header.
76 template< class ArcT, template<class> class DataT >
78 (bool bilm, ucam::util::RegistryPO const& rg) {
79 
80  typedef DataT<ArcT> Data;
81  typedef ApplyLanguageModelTask< Data, ArcT > ApplyLanguageModel;
82  typedef ApplyBiLMTask< Data, ArcT > ApplyBiLM;
83  using namespace HifstConstants;
84 
// No bilingual model requested: use the plain language model task.
85  if (!bilm)
86  return new ApplyLanguageModel
87  ( rg
88  , kLmLoad
89  , kLatticeLoad
92  ;
93 
// Bilingual model requested: same leading arguments, different task type.
94  return new ApplyBiLM
95  ( rg
96  , kLmLoad
97  , kLatticeLoad
100  ;
101 };
102 
// Single-threaded language model application task.
// Derives from ucam::util::TaskInterface<DataT<ArcT> >. run() assembles one
// task chain (word map, language model, lattice reader, LM application,
// lattice writers, speed stats) and executes it once per sentence index of the
// range configured under kRangeOne.
// NOTE(review): the class-head line, some typedefs, the constructor signature
// and the start of the error message statement are not visible in this
// listing -- confirm against the original header.
108 template < template <class> class DataT
109  , class ArcT
110  >
112  : public ucam::util::TaskInterface<DataT<ArcT> > {
113  private:
114  typedef DataT<ArcT> Data;
123  typedef boost::scoped_ptr<FastForwardRead> FastForwardReadPtr;
124 
125  // True when a bilingual model is applied (option kUseBilingualModel)
126  bool bilm_;
127  // Source window size for bilingual models (option kUseBilingualModelSourceSize)
128  unsigned srcSize_;
129  // Source sentence file (option kUseBilingualModelSourceSentenceFile)
130  std::string srcFile_;
131  // Reader over srcFile_, queried by sentence index; only set when bilm_ is true
132  FastForwardReadPtr fastForwardRead_;
133  // Command-line/config file options; held by reference, not copied
134  ucam::util::RegistryPO const& rg_;
135 
136  public:
// Constructor initializer list: every option is read from the registry rg.
142  : bilm_(rg.getBool(HC::kUseBilingualModel))
143  , srcSize_(rg.get<unsigned>(HC::kUseBilingualModelSourceSize))
144  , srcFile_(rg.get<string>(HC::kUseBilingualModelSourceSentenceFile))
145  , rg_ ( rg )
146  {
// A bilingual model needs a source sentence file. The else below pairs with
// the inner if (srcFile_ != ""), so the program exits only when bilm_ is set
// and no source file was given.
147  if (bilm_)
148  if (srcFile_ != "")
149  fastForwardRead_.reset(new FastForwardRead
150  ( new iszfstream ( srcFile_) ));
151  else {
153  << " needs to be defined");
154  exit(EXIT_FAILURE);
155  }
156  };
157 
// Standalone entry point: runs on a default-constructed Data object and
// discards the value returned by run().
160  void operator() () {
161  Data d;
162  run ( d );
163  }
164 
// Builds the task chain and runs it for every sentence index in the range.
// original_data is not used by this implementation; a local Data object is
// reused across sentences instead. Always returns false.
170  bool run ( Data& original_data ) {
171  using namespace HifstConstants;
// Head of the chain; the scoped_ptr deletes it when run() returns.
172  boost::scoped_ptr< LoadWordMap > tasks
173  (new LoadWordMap (rg_, kLmWordmap, true) );
174  tasks->appendTask
175  ( new LoadLanguageModel ( rg_ ) )
176  ( new ReadFst ( rg_ , kLatticeLoad ) )
177  ( addApplyLM<ArcT,DataT>(bilm_, rg_ ) )
178  ( WriteFst::init ( rg_ , kLatticeStore ) )
179  ( TuneWpWriteFst::init( rg_, kTuneWrite, kLatticeStore ) )
180  ( new SpeedStats ( rg_ ) )
181  ;
182  using namespace ucam::util;
183 
// Holds the value returned by the source file reader for the current sentence.
184  bool finished = false;
185  Data d;
186  for ( IntRangePtr ir
187  (IntRangeFactory ( rg_, kRangeOne ) )
188  ; !ir->done()
189  ; ir->next() ) {
190  d.sidx = ir->get();
191  if (bilm_) {
// Fetch the source sentence for index d.sidx and derive its source windows.
192  finished = (*fastForwardRead_) ( d.sidx , &d.integerMappedSentence);
193  // could go to task.prepro, or even better, make it a task of its own,
194  // so it's shareable.
195  createSourceWindows(d.integerMappedSentence, srcSize_, d.sourceWindows);
196  }
197  LINFO ( "Running sentence " << d.sidx );
198  tasks->chainrun ( d ); //Run!
// The current sentence is still processed before the loop stops.
199  if (bilm_ && finished) break;
200  }
201  // let the next task run.
202  return false;
203  }
204 };
205 
// Multi-threaded language model application task.
// Derives from ucam::util::TaskInterface<DataT<ArcT> >. run() loads the word
// map and language model once, then creates one task chain and one Data object
// per sentence index and submits them to a TrivialThreadPool.
// NOTE(review): the class-head line, some typedefs, the constructor signature
// and the start of the error message statement are not visible in this
// listing -- confirm against the original header.
210 template < template <class> class DataT
211  , class ArcT
212  >
214  : public ucam::util::TaskInterface<DataT<ArcT> > {
215  private:
216  typedef DataT<ArcT> Data;
224 
227  typedef boost::scoped_ptr<FastForwardRead> FastForwardReadPtr;
228 
229  // True when a bilingual model is applied (option kUseBilingualModel)
230  bool bilm_;
231  // Source window size for bilingual models (option kUseBilingualModelSourceSize)
232  unsigned srcSize_;
233  // Source sentence file (option kUseBilingualModelSourceSentenceFile)
234  std::string srcFile_;
235  // Reader over srcFile_, queried by sentence index; only set when bilm_ is true
236  FastForwardReadPtr fastForwardRead_;
// Command-line/config file options; held by reference, not copied
238  ucam::util::RegistryPO const& rg_;
// Thread count passed to the thread pool (option kNThreads)
240  unsigned threadcount_;
241  public:
// Constructor initializer list. threadcount_ is written first here but is
// declared last, and C++ initializes members in declaration order. No
// initializer depends on another member, so the order does not matter here.
247  : threadcount_ ( rg.get<unsigned> ( HC::kNThreads ) )
248  , bilm_(rg.getBool(HC::kUseBilingualModel))
249  , srcSize_(rg.get<unsigned>(HC::kUseBilingualModelSourceSize))
250  , srcFile_(rg.get<string>(HC::kUseBilingualModelSourceSentenceFile))
251  , rg_ ( rg )
252  {
// A bilingual model needs a source sentence file. The else below pairs with
// the inner if (srcFile_ != ""), so the program exits only when bilm_ is set
// and no source file was given.
253  if (bilm_)
254  if (srcFile_ != "")
255  fastForwardRead_.reset(new FastForwardRead
256  ( new iszfstream ( srcFile_) ));
257  else {
259  << " needs to be defined");
260  exit(EXIT_FAILURE);
261  }
262  };
263 
// Standalone entry point: runs on a default-constructed Data object and
// returns the value of run().
265  inline bool operator() () {
266  Data d;
267  return run ( d );
268  }
269 
// Loads the word map and language model into original_data, then submits one
// job per sentence index to the thread pool. Always returns false.
280  bool run ( Data& original_data ) {
281  using namespace HifstConstants;
282  boost::scoped_ptr< LoadWordMap > mylm
283  (new LoadWordMap (rg_, kLmWordmap, true) );
284  mylm->appendTask ( new LoadLanguageModel ( rg_ ) );
285  mylm->chainrun ( original_data ); //Loading language model only once
// The pool lives in its own scope and is therefore destroyed before run()
// returns. Per the TrivialThreadPool description, its destruction waits for
// all threads to finish.
286  {
287  using namespace ucam::util;
288  TrivialThreadPool tp ( threadcount_ );
// Holds the value returned by the source file reader for the current sentence.
289  bool finished = false;
290  for ( IntRangePtr ir(IntRangeFactory ( rg_, kRangeOne ) )
291  ; !ir->done(); ir->next() ) {
// A fresh task chain per sentence, starting at the lattice reader.
292  ReadFst *applylm = new ReadFst ( rg_ , kLatticeLoad ) ;
293  applylm->appendTask
294  ( addApplyLM<ArcT,DataT>(bilm_, rg_ ) )
295  ( WriteFst::init( rg_ , kLatticeStore ) )
296  ( TuneWpWriteFst::init( rg_, kTuneWrite, kLatticeStore) )
297  ( new SpeedStats ( rg_ ) )
298  ;
// A fresh Data object per sentence; only klm is copied from original_data.
299  Data *d = new Data; //( original_data );
300  d->klm = original_data.klm;
301  d->sidx = ir->get();
302  if (bilm_) {
// The source file is read here, in the submitting loop, before the job is
// handed to the pool.
303  finished = (*fastForwardRead_) ( d->sidx , &d->integerMappedSentence);
304  // could go to task.prepro, or even better, make it a task of its own,
305  // so it's shareable.
306  createSourceWindows(d->integerMappedSentence, srcSize_, d->sourceWindows);
307  }
308  LINFO ( "Running sentence " << d->sidx );
// NOTE(review): applylm and d are never deleted in this function; presumably
// TaskFunctor takes ownership of both -- confirm.
309  tp ( TaskFunctor<Data> ( applylm, d ) );
// The current sentence is still submitted before the loop stops.
310  if (bilm_ && finished) break;
311  }
312  }
313  return false;
314  };
315 };
316 
317 }} // end namespaces
318 
319 #endif //MAIN_RUN_APPLYLM_HPP
Convenience class that reads "quickly" until a queried line.
Definition: szfstream.hpp:381
void run(ucam::util::RegistryPO const &rg)
Stream wrapper for pipe/text/compressed files.
void createSourceWindows(std::string const &integerMappedSentence, unsigned srcSize, std::vector< std::vector< unsigned > > &srcw)
std::string const kUseBilingualModelSourceSentenceFile
std::string const kUseBilingualModel
#define LINFO(msg)
Convenience class that inherits Taskinterface behaviour and writes an fst to [file] using a key defin...
std::string const kLatticeStore
boost::scoped_ptr< NumberRangeInterface< unsigned > > IntRangePtr
Definition: range.hpp:214
#define LDEBUG(msg)
#define IntRangeFactory
Definition: range.hpp:213
Trivial implementation of a threadpool based on boost::asio methods. When initiated, creates a threadpool of n threads (n <= number of cpus). Jobs should be submitted with the templated operator(). When the object is deleted it will wait for all threads to finish.
SingleThreadedApplyLanguageModelTask(ucam::util::RegistryPO const &rg)
Constructor.
Language model loader task, loads a language model wrapping it in a class to provide.
Loads wordmap in constructor and delivers pointer to data object during run time. ...
Convenience class that inherits Taskinterface behaviour and writes an fst to [file] using a key defin...
ucam::util::TaskInterface< DataT< ArcT > > * addApplyLM(bool bilm, ucam::util::RegistryPO const &rg)
Templated (hybrid) Interface for Task classes.
MultiThreadedApplyLanguageModelTask(ucam::util::RegistryPO const &rg)
Constructor.
Simple functor that accepts an interface and pointer to the data object in which it will have to run ...
TaskInterface & appendTask(TaskInterface *t)
Appends a task class. If there is no task, append here; otherwise delegate to the next task...
std::string const kLatticeLoadDeleteLmCost
Task that reads stats from data object and writes them to a [file].
Definition: task.stats.hpp:29
std::string const kTuneWrite
const std::string kNThreads
Class for single threaded application of language model. It inherits taskinterface behaviour and also...
bool exists(const std::string &key) const
Determines whether a program option (key) has been defined by the user.
Definition: registrypo.hpp:235
bool run(Data &original_data)
Multithreaded LM application. Runs only if option --nthreads is defined and > 0.
Class for multithreaded application of language model. Inherits taskinterface and provides standalone...
bool run(Data &original_data)
Core function running language model application. Creates list of tasks (load lm, apply lm) and execu...
Convenience class that loads an fst using a key defined in the constructor and delivers it to the dat...
std::string const kLatticeLoad
T toNumber(const std::string &x)
Converts a string to an arbitrary number. Converts strings to a number. Quits execution if conversion ...
Language model loader task, loads a language model wrapping it in a class to provide.
#define LERROR(msg)
std::string const kLmWordmap
std::string const kLmLoad
const std::string kRangeOne
Definition: range.hpp:26
Wrapper stream class that reads pipes, text files or gzipped files.
Definition: szfstream.hpp:34
std::string const kUseBilingualModelSourceSize
Definition: bleu.hpp:14