Cambridge SMT System
main-run.hifst.hpp
Go to the documentation of this file.
1 // Licensed under the Apache License, Version 2.0 (the "License");
2 // you may not use these files except in compliance with the License.
3 // You may obtain a copy of the License at
4 //
5 // http://www.apache.org/licenses/LICENSE-2.0
6 //
7 // Unless required by applicable law or agreed to in writing, software
8 // distributed under the License is distributed on an "AS IS" BASIS,
9 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 // See the License for the specific language governing permissions and
11 // limitations under the License.
12 
13 // Copyright 2012 - Gonzalo Iglesias, Adrià de Gispert, William Byrne
14 
15 #ifndef MAIN_RUN_HIFST_HPP
16 #define MAIN_RUN_HIFST_HPP
17 
25 namespace ucam {
26 namespace hifst {
27 
// Shorthand for the Boost.Asio TCP protocol type used by the server task in this file.
28 using boost::asio::ip::tcp;
// Payload capacity (not counting the terminating NUL) of the fixed-size character
// buffers the server task uses for an incoming query and for the outgoing reply.
29 const int max_length = 1024;
// Shared-ownership handle to a TCP socket; handed by value to the per-connection handler.
30 typedef boost::shared_ptr<tcp::socket> socket_ptr;
31 
35 template < template <class> class DataT
36  , class ArcT
37  >
39  private:
40  typedef DataT<ArcT> Data;
58 
62  std::string textoutput_;
64  const ucam::util::RegistryPO& rg_;
65  bool usingTupleArc_;
66  public:
72  : fastforwardread_ ( new iszfstream ( rg.get<std::string>
74  , textoutput_ ( rg.get<std::string> ( HifstConstants::kTargetStore ) )
75  , rg_ ( rg )
76  , usingTupleArc_(rg.get<std::string>(HifstConstants::kHifstSemiring) == HifstConstants::kHifstSemiringTupleArc )
77  {
78  if (!usingTupleArc_) return;
79  // If we are using the tropical sparse tuple semiring, we need to set feature weights;
80  // Artificial parameters are:
81  // language model(s) feature weights: use the ones provided by the user.
82  // single grammar feature (dot product), hence dot product feature weight=1.
83  // local language models (dot product), hence dot product feature weight=1.
84  std::string params = rg.get<std::string>(HifstConstants::kLmFeatureweights);
85  params += ",1,1";
86  FORCELINFO("fake tuple params=" << params);
87  TupleW32::Params() = ucam::util::ParseParamString<float> (params);
88  };
89 
// Translates the requested sentences one after another in the calling thread.
//
// Builds a single task chain headed by LoadGrammar (word maps, unimap, preprocessing,
// language models, parsing, lattice building, recasing, postprocessing, stats) and calls
// chainrun() on it once per sentence, reusing the same data object `d`.
//
// \param d  Data object shared by all tasks of the chain; sidx, filters, translation and
//           originalsentence are reset/overwritten for every sentence.
// \return   Always false.
94  bool run ( Data& d ) {
95  using namespace HifstConstants;
96  // Assigns keys to fetch feature weights appropriately
97  // This keeps all current options available
98  // User can work with lm.featureweights + grammar.featureweights
99  // or only featureweights, as a concatenation of the two previous
100  // (language model features first)
101  // Probably lm.featureweights and grammar.featureweights will be deprecated
102  // at some point.
103  std::string lmFeatureweights = kLmFeatureweights;
104  std::string grammarFeatureweights = kGrammarFeatureweights;
105  unsigned grammarFeatureweightOffset = 0;
106  if (rg_.get<std::string> (kFeatureweights) != "") {
// A joint weight vector was given: both loaders read the same key, and the grammar
// weights start after one entry per loaded language model.
107  grammarFeatureweights = lmFeatureweights = kFeatureweights;
108  grammarFeatureweightOffset = rg_.getVectorString (
109  kLmLoad).size();
110  }
111  boost::scoped_ptr < LoadGrammar> grammartask
112  ( new LoadGrammar ( rg_, grammarFeatureweights, grammarFeatureweightOffset ) );
// appendTask returns the task interface, so the chain is assembled by repeated call syntax.
113  grammartask->appendTask
114  ( LoadWordMap::init ( rg_ , kPreproWordmapLoad , true ) )
116  //TODO: add an option to ensure that postpro.wordmap uses this one
117  //, but then reversed wordmap is forced
118  ( LoadWordMap::init ( rg_ , kLmWordmap , true) )
119  ( new LoadUnimap ( rg_ , kRecaserUnimapLoad ) )
120  ( new PrePro ( rg_ ) )
// With the tuple-arc semiring the LM loader gets an empty weights key; the constructor
// has already installed the weights through TupleW32::Params().
121  ( new LoadLanguageModel ( rg_
122  , kLmLoad
123  , (usingTupleArc_)? "":lmFeatureweights ) )
124  ( new LoadLanguageModel ( rg_
128  ( new LoadLanguageModel ( rg_
131  , kRecaserLmWps
132  , kRecaserLmWordmap ) )
133  ( new PatternsToInstances ( rg_ ) )
134  ( ReferenceFilter::init ( rg_ ) )
135  ( new SentenceSpecificGrammar ( rg_ ) )
136  ( new Parse ( rg_ ) )
137  ( new HiFST ( rg_ ) )
140  ( new Recase ( rg_
142  , kPostproInput
143  , kRecaserLmLoad ) )
145  ( new PostPro ( rg_ ) )
146  ( new HifstStats ( rg_ ) )
147  ;
148  bool finished = false;
// NOTE(review): fileoutput is owned through a raw pointer; it is only deleted after the
// loop, so it is not released if anything inside the loop throws. A scoped_ptr (as used
// for the same purpose in the multi-threaded task) would close that gap.
149  oszfstream *fileoutput = NULL;
150  if ( textoutput_ != "" ) {
151  fileoutput = new oszfstream ( textoutput_ );
152  }
// NOTE(review): the loop header that creates the sentence-index iterator `ir` is not
// visible in this listing.
154  !ir->done ();
155  ir->next () ) {
156  d.sidx = ir->get ();
157  d.filters.clear();
// Fresh, empty output string for this sentence; it lives until the end of the iteration.
158  boost::scoped_ptr<std::string> aux ( new std::string ( "" ) );
159  d.translation = aux.get();
160  //Move to whichever next sentence and read
161  finished = fastforwardread_ ( d.sidx , &d.originalsentence );
162  boost::algorithm::trim (d.originalsentence);
// Input ended without delivering a sentence: nothing left to translate.
163  if (finished && d.originalsentence == "" ) break;
164  FORCELINFO ( "=====Translate sentence " << d.sidx << ":" <<
165  d.originalsentence );
166  grammartask->chainrun ( d ); //Run translation!
167  if ( fileoutput != NULL )
168  *fileoutput << *d.translation << endl;
// Input ended on this sentence: it has been translated, so stop now.
169  if ( finished ) break;
170  }
171  if ( fileoutput != NULL )
172  delete fileoutput;
173  return false;
174  };
175 
177  inline bool operator() () {
178  Data d;
179  return run ( d );
180  }
181 
182  private:
183 
184  DISALLOW_COPY_AND_ASSIGN ( SingleThreadedHifstTask );
185 
186 };
187 
192 template < template <class> class DataT
193  , class ArcT
194  >
196 
197  private:
198  typedef DataT<ArcT> Data;
216 
219 
221  std::string textoutput_;
222 
224  const ucam::util::RegistryPO& rg_;
225 
227  unsigned threadcount_;
228  bool usingTupleArc_;
229  public:
235  : fastforwardread_ ( new iszfstream ( rg.get<std::string>
236  ( HifstConstants::kSourceLoad ) ) )
237  , textoutput_ ( rg.get<std::string> ( HifstConstants::kTargetStore ) )
238  , threadcount_ ( rg.get<unsigned> ( HifstConstants::kNThreads ) )
239  , usingTupleArc_(rg.get<std::string>(HifstConstants::kHifstSemiring) == HifstConstants::kHifstSemiringTupleArc )
240  , rg_ ( rg ) {
241 
242  if (!usingTupleArc_) return;
243  // If we are using the tropical sparse tuple semiring, we need to set feature weights;
244  // Artificial parameters are:
245  // language model(s) feature weights: use the ones provided by the user.
246  // single grammar feature (dot product), hence dot product feature weight=1.
247  // local language models (dot product), hence dot product feature weight=1.
248  std::string params = rg.get<std::string>(HifstConstants::kLmFeatureweights);
249  params += ",1,1";
250  FORCELINFO("fake tuple params=" << params);
251  TupleW32::Params() = ucam::util::ParseParamString<float> (params);
252  };
253 
// Translates the requested sentences concurrently on a pool of threadcount_ threads.
//
// Shared resources (grammar, language models, unimap, word maps) are loaded once into
// original_data. For every sentence a new Data object is allocated, pointed at those shared
// resources, and submitted to the pool together with its own per-sentence task chain.
// Translations are collected in submission order and written to textoutput_ at the end.
//
// \param original_data  Receives the loaded shared resources; read when preparing
//                       each per-sentence Data object.
// \return               Always false.
259  bool run ( Data& original_data ) {
260  using namespace HifstConstants;
261 
262  std::string lmFeatureweights = kLmFeatureweights;
263  std::string grammarFeatureweights = kGrammarFeatureweights;
264  unsigned grammarFeatureweightOffset = 0;
265  if (rg_.get<std::string> (kFeatureweights) != "") {
// A joint weight vector was given: both loaders read the same key, and the grammar
// weights start after one entry per loaded language model.
266  grammarFeatureweights = lmFeatureweights = kFeatureweights;
267  grammarFeatureweightOffset = rg_.getVectorString (
268  kLmLoad).size();
269  }
270  boost::scoped_ptr < LoadGrammar > grammartask
271  ( new LoadGrammar ( rg_, grammarFeatureweights, grammarFeatureweightOffset ) );
272  grammartask->appendTask
// NOTE(review): the single-threaded task passes an empty weights key here when
// usingTupleArc_ is set; this task always passes lmFeatureweights although its
// constructor also sets usingTupleArc_ - confirm whether the difference is intended.
273  ( new LoadLanguageModel ( rg_
274  , kLmLoad
275  , lmFeatureweights ) )
276  ( new LoadLanguageModel ( rg_
280  ( new LoadLanguageModel ( rg_
283  , kRecaserLmWps
284  , kRecaserLmWordmap) )
285  ( new LoadUnimap ( rg_ , kRecaserUnimapLoad ) )
286  ( LoadWordMap::init ( rg_ , kPreproWordmapLoad , true ) )
288  ;
289  //Load grammar and language model
290  grammartask->chainrun ( original_data );
// One output string per submitted sentence, in submission order. Worker tasks write through
// raw pointers into these strings, which stay valid because the vector holds shared_ptrs.
291  std::vector < boost::shared_ptr<std::string> >translations;
292  {
// The pool is scoped to this inner block so that it is destroyed before the output is
// written; presumably its destructor waits for all submitted jobs - verify against
// TrivialThreadPool.
293  ucam::util::TrivialThreadPool tp ( threadcount_ );
294  bool finished = false;
// NOTE(review): the loop header that creates the sentence-index iterator `ir` is not
// visible in this listing.
296  !ir->done();
297  ir->next() ) {
298  Data *d = new Data; //( original_data ); // reset.
299  d->grammar = original_data.grammar;
300  d->sidx = ir->get();
301  d->klm = original_data.klm;
// NOTE(review): the output slot is pushed (and `d` allocated) before the end-of-input
// check below. When that check breaks out of the loop, an empty string stays in
// `translations` and is later written as an extra empty line, and no delete of `d`
// is visible on that path.
302  translations.push_back ( boost::shared_ptr<std::string>
303  ( new std::string ( "" ) ) );
304  d->translation = translations[translations.size() - 1].get();
// Copy the recaser unimap entry only if it was actually loaded.
305  if ( original_data.fsts.find ( kRecaserUnimapLoad ) !=
306  original_data.fsts.end() )
307  d->fsts[kRecaserUnimapLoad] =
308  original_data.fsts[kRecaserUnimapLoad];
309  d->recasingvcblm = original_data.recasingvcblm;
310  d->wm = original_data.wm;
311  finished = fastforwardread_ ( d->sidx ,
312  & ( d->originalsentence ) ); //Move to whichever next sentence and read
// Input ended without delivering a sentence: nothing left to submit.
313  if (finished && d->originalsentence == "") break;
314  FORCELINFO ( "=====Translate sentence " << d->sidx << ":" <<
315  d->originalsentence );
// Each sentence gets its own task chain, so no task object is shared between jobs.
316  PrePro *p = new PrePro ( rg_ );
317  p->appendTask
318  ( new PatternsToInstances ( rg_ ) )
319  ( ReferenceFilter::init ( rg_ ) )
320  ( new SentenceSpecificGrammar ( rg_ ) )
321  ( new Parse ( rg_ ) )
322  ( new HiFST ( rg_ ) )
325  ( new Recase ( rg_ ,
331  ( new PostPro ( rg_ ) )
332  ( new HifstStats ( rg_ ) )
333  ;
// Hand the chain and its data to the pool; presumably TaskFunctor takes ownership of
// both pointers - verify against its definition.
334  tp ( ucam::util::TaskFunctor<Data> ( p, d ) );
// Input ended on this sentence: it has been submitted, so stop reading.
335  if ( finished ) break;
336  }
337  }
// No target file configured: translations are discarded.
339  if ( textoutput_ == "" ) return false;
340  boost::scoped_ptr<oszfstream> fileoutput ( new oszfstream ( textoutput_ ) );
341  for ( unsigned k = 0; k < translations.size(); ++k )
342  *fileoutput << *translations[k] << endl;
343  return false;
344  };
345 
347  inline bool operator() () {
348  Data d;
349  return run ( d );
350  }
351 
352  private:
353 
354  DISALLOW_COPY_AND_ASSIGN ( MultiThreadedHifstTask );
355 
356 };
357 
362 template < template <class> class DataT
363  , class ArcT
364  >
365 class HifstServerTask: public ucam::util::TaskInterface<DataT<ArcT> > {
366  private:
367  typedef DataT<ArcT> Data;
382 
384  const ucam::util::RegistryPO& rg_;
385 
387  short port_;
388 
390  Data d_;
391 
393  boost::scoped_ptr < GrammarTask < Data > >ttask_;
394 
396  class translation {
397  private:
399  const ucam::util::RegistryPO& rg_;
400 
401  public:
403  translation ( const ucam::util::RegistryPO& rg ) : rg_ ( rg ) {};
404 
412  bool operator () ( socket_ptr sock, Data& d ) {
413  using namespace HifstConstants;
415  LINFO ( "Init new taskdata..." );
416  boost::scoped_ptr<Data> mydata ( new Data );
417  mydata->grammar = d.grammar;
418  mydata->klm = d.klm;
419  mydata->filters.clear();
420  if ( d.fsts.find ( kRecaserUnimapLoad ) != d.fsts.end() )
421  mydata->fsts[kRecaserUnimapLoad] =
422  d.fsts[kRecaserUnimapLoad];
423  mydata->recasingvcblm = d.recasingvcblm;
424  mydata->wm = d.wm;
425  LINFO ("Number of wordmaps... " << mydata->wm.size() );
426  try {
427  char data[max_length + 1];
428  std::size_t query_length = 0;
429  std::size_t query_length1 = boost::asio::read ( *sock,
430  boost::asio::buffer ( &query_length, sizeof ( std::size_t ) ) );
431  std::size_t query_length2 = boost::asio::read ( *sock,
432  boost::asio::buffer ( data, query_length ) );
433  data[query_length2] = 0;
434  mydata->originalsentence = data;
435  FORCELINFO ( "Query to translate: " << mydata->originalsentence );
436  boost::scoped_ptr<std::string> translation ( new std::string );
437  mydata->translation = translation.get();
438  this->operator() ( *mydata );
439  char datasend[max_length + 1];
440  //Send data
441  strcpy ( datasend, ( char * ) translation.get()->c_str() );
442  std::size_t length = strlen ( datasend );
443  boost::asio::write ( *sock, boost::asio::buffer ( &length,
444  sizeof ( std::size_t ) ) );
445  FORCELINFO ( "Sending:" << datasend );
446  boost::asio::write ( *sock, boost::asio::buffer ( datasend, length ) );
447  sock->close();
448  } catch ( std::exception& e ) {
449  std::cerr << "Exception in thread! " << e.what() << "\n";
450  }
451  return true;
452  };
453 
454  private:
455 
// Runs the per-sentence translation chain (preprocessing through postprocessing) on d.
// The result is delivered through the fields of d that the tasks write; this function is
// private and is invoked from the socket-handling overload.
//
// Unlike the batch pipelines in this file, the chain has no ReferenceFilter or HifstStats
// step and adds a WriteFst step for kHifstLatticeStore.
//
// NOTE(review): the function is declared to return bool, but no return statement is
// visible before the closing brace. Flowing off the end of a non-void function is
// undefined behaviour in C++; the caller currently ignores the result. Returning the
// result of chainrun() (or a constant) would fix this.
459  bool operator () ( Data& d ) {
460  using namespace HifstConstants;
461  PrePro p ( rg_ );
462  p.appendTask
463  ( new PatternsToInstances ( rg_ ) )
464  ( new SentenceSpecificGrammar ( rg_ ) )
465  ( new Parse ( rg_ ) )
466  ( new HiFST ( rg_ ) )
467  ( new WriteFst ( rg_ , kHifstLatticeStore ) )
// NOTE(review): the remaining Recase arguments are not visible in this listing.
468  ( new Recase ( rg_ ,
473  ( new PostPro ( rg_ ) )
474  ;
475  p.chainrun ( d ); //Run translation!
476  }
477 
478  };
479 
480  public:
486  rg_ ( rg ),
487  port_ ( rg.get<short> ( HifstConstants::kServerPort ) ) {
488  };
489 
// Builds the resource-loading chain (grammar, language models, recaser unimap, prepro
// word map), stores it in ttask_ and runs it once on the member data object d_, so that
// d_ holds the shared resources used by every later request.
492  void load() {
493  using namespace HifstConstants;
494  std::string lmFeatureweights = kLmFeatureweights;
495  std::string grammarFeatureweights = kGrammarFeatureweights;
496  unsigned grammarFeatureweightOffset = 0;
497  if (rg_.get<std::string> (kFeatureweights) != "") {
// A joint weight vector was given: both loaders read the same key, and the grammar
// weights start after one entry per loaded language model.
498  grammarFeatureweights = lmFeatureweights = kFeatureweights;
499  grammarFeatureweightOffset = rg_.getVectorString (kLmLoad).size();
500  }
501  ttask_.reset ( new LoadGrammar ( rg_, grammarFeatureweights,
502  grammarFeatureweightOffset ) );
503  ttask_->appendTask
504  ( new LoadLanguageModel ( rg_
505  , kLmLoad
506  , lmFeatureweights ) )
// NOTE(review): the arguments of the next two language-model loaders are not visible
// in this listing.
507  ( new LoadLanguageModel ( rg_
511  ) )
512  ( new LoadLanguageModel ( rg_
515  ) )
516  ( new LoadUnimap ( rg_ , kRecaserUnimapLoad ) )
517  ( LoadWordMap::init ( rg_ , kPreproWordmapLoad, true ) )
519  ;
520  //Load grammar and language model
521  ttask_->chainrun ( d_ );
522  };
523 
525  inline bool operator() () {
526  load();
527  return run ( d_ );
528  }
529 
530  private:
531 
534  bool run ( Data& d ) {
535  boost::asio::io_service io_service;
536  tcp::acceptor a ( io_service, tcp::endpoint ( tcp::v4(), port_ ) );
537  for ( ;; ) {
538  LINFO ( "Waiting for a connection at port=" << port_ );
539  socket_ptr sock ( new tcp::socket ( io_service ) );
540  a.accept ( *sock );
541  translation tr ( rg_ );
542  boost::thread t ( boost::bind<void> ( tr, sock, d ) );
543  LINFO ( "Connection accepted... Thread created..." );
544  }
545  };
546 
547  DISALLOW_COPY_AND_ASSIGN ( HifstServerTask );
548 
549 };
550 
551 }
552 } // end namespaces
553 
554 #endif // MAIN_RUN_HIFST_HPP
Wrapper stream class that writes to pipes, text files or gzipped files.
Definition: szfstream.hpp:200
std::string const kHifstSemiring
Convenience class that reads "quickly" until a queried line.
Definition: szfstream.hpp:381
static WriteFstTask * init(const ucam::util::RegistryPO &rg, const std::string &fstkey, const std::string &readfstkey="")
const std::string kHifstLatticeStore
const std::string kServerPort
List of constants to be used both across program options and class runners.
const std::string kHifstLocalpruneLmFeatureweights
const std::string kHifstStripSpecialEpsilonLabels
std::vector< std::string > getVectorString(const std::string &key) const
Convenience method that returns a vector of strings taking "," as the separator character.
Definition: registrypo.hpp:245
Implements cyk+ parser.
Reads text file, performs tokenization and integer-mapping.
Definition: task.prepro.hpp:33
#define LINFO(msg)
std::string const kRecaserUnimapLoad
std::string const kRecaserLmLoad
Full multi-threaded Translation system.
static LoadWordMapTask * init(const ucam::util::RegistryPO &rg, const std::string &key, bool reverse=false)
Static constructor, will return NULL if there is no need for word-mapping.
Converts patterns to instanced patterns.
const std::string kGrammarFeatureweights
const std::string kTargetStore
const std::string kPreproWordmapLoad
T get(const std::string &key) const
Returns parsed value associated to key.
Definition: registrypo.hpp:194
#define FORCELINFO(msg)
Task that writes translation to a text file. This translation might be recased, wordmapped and tokeni...
boost::scoped_ptr< NumberRangeInterface< unsigned > > IntRangePtr
Definition: range.hpp:214
const std::string kPostproWordmapLoad
Generates a substring version of a reference translation lattice and associated vocabulary. This substring fst is typically used to guide translation towards a particular search space. The associated vocabulary can be used e.g. to restrict parsing algorithms.
boost::shared_ptr< tcp::socket > socket_ptr
static ReferenceFilterTask * init(const ucam::util::RegistryPO &rg, const std::string &referenceloadkey=HifstConstants::kReferencefilterLoad, const std::string &referencelatticekey=HifstConstants::kReferencefilterNosubstringStore)
Static constructor, returns NULL if the substring lattice is not needed (e.g. hifst not in alignment ...
#define IntRangeFactory
Definition: range.hpp:213
Core of Hifst. Implements the lattice-building procedure for a cyk-parsed sentence.
Definition: task.hifst.hpp:50
Trivial implementation of a threadpool based on boost::asio methods. When initiated, creates a threadpool of n threads (n <= number of cpus). Jobs should be submitted with the templated operator(). When the object is deleted it will wait for all threads to finish.
const std::string kPostproInput
Language model loader task, loads a language model wrapping it in a class to provide.
Loads wordmap in constructor and delivers pointer to data object during run time. ...
Convenience class that inherits Taskinterface behaviour and writes an fst to [file] using a key defin...
Loads a unigram transduction model (aka unimap file) from a file with the format accepted by srilm di...
std::string const kRecaserOutput
const std::string kHifstLocalpruneLmLoad
Task class that loads a grammar into memory.
bool chainrun(Data &d)
Implements chain of responsibility. Calls its run method and, if there is another task, calls that task's run method too.
Templated (hybrid) Interface for Task classes.
bool operator()()
Runs using its own internal data object.
Convenience class that inherits Taskinterface behaviour and optimizes an fst.
Simple functor that accepts an interface and pointer to the data object in which it will have to run ...
TaskInterface & appendTask(TaskInterface *t)
Appends a task class. If there is no task, append here; otherwise delegate to the next task...
const std::string kHifstLocalpruneLmWordpenalty
std::string const kLmFeatureweights
static OptimizeFstTask * init(const ucam::util::RegistryPO &rg, const std::string &optimizefstkey, const std::string &fstkey, const std::string &stripepskey)
const std::string kNThreads
std::string const kHifstSemiringTupleArc
SingleThreadedHifstTask(const ucam::util::RegistryPO &rg)
Constructor.
Disambig Task tool. Given a search space, applies a unigram transduction model (generating alternativ...
bool run(Data &original_data)
Translates an input sentence (multithreaded)
HifstServerTask(const ucam::util::RegistryPO &rg)
Constructor.
This class uses instantiated patterns to analyze the grammar and deliver two hashes providing candida...
Full single-threaded Translation system.
const std::string kSourceLoad
std::string const kRecaserLmFeatureweight
bool run(Data &d)
Translates an input sentence (single threaded)
const std::string kHifstLatticeOptimize
Reads StatsData and dumps all stats to (sentence-specific) file. Provides a special method for cyk da...
std::string const kLmWordmap
std::string const kRecaserLmWps
std::string const kLmLoad
std::string const kRecaserLmWordmap
const std::string kFeatureweights
Wrapper stream class that reads pipes, text files or gzipped files.
Definition: szfstream.hpp:34
MultiThreadedHifstTask(const ucam::util::RegistryPO &rg)
Constructor.
Definition: bleu.hpp:14