Cambridge SMT System
task.disambig.hpp
Go to the documentation of this file.
1 // Licensed under the Apache License, Version 2.0 (the "License");
2 // you may not use these files except in compliance with the License.
3 // You may obtain a copy of the License at
4 //
5 // http://www.apache.org/licenses/LICENSE-2.0
6 //
7 // Unless required by applicable law or agreed to in writing, software
8 // distributed under the License is distributed on an "AS IS" BASIS,
9 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 // See the License for the specific language governing permissions and
11 // limitations under the License.
12 
13 // Copyright 2012 - Gonzalo Iglesias, Adrià de Gispert, William Byrne
14 
15 #ifndef TASK_DISAMBIG_HPP
16 #define TASK_DISAMBIG_HPP
17 
27 
28 namespace ucam {
29 namespace fsttools {
30 
35 template<class Data, class Arc>
37 private:
38  typedef typename Arc::Weight Weight;
39  typedef typename Arc::Label Label;
40 
42  fst::VectorFst<Arc> *unimap_;
44  unsigned shp_;
45 
47  float prune_;
48 
50  const ucam::util::RegistryPO& rg_;
51 
53  const std::string lmkey_;
55  const std::string inputkey_;
57  const std::string outputkey_;
59  const std::string unimapkey_;
60 
62  Data *d_;
63 
66 
68  fst::VectorFst<Arc> olattice_;
69 
70  public:
73  const std::string& inputkey = HifstConstants::kRecaserInput,
74  const std::string& outputkey = HifstConstants::kRecaserOutput,
75  const std::string& lmkey = HifstConstants::kRecaserLmLoad,
76  const std::string& unimapkey = HifstConstants::kRecaserUnimapLoad,
77  bool forceloading = false ) :
78  rg_ ( rg ),
79  lmkey_ ( lmkey ),
80  inputkey_ ( inputkey ),
81  outputkey_ ( outputkey ),
82  unimapkey_ ( unimapkey ),
83  unimap_ ( NULL ) {
85  // Prune or shortest path...
86  std::vector<std::string> pstrat = rg_.getVectorString (
88  USER_CHECK ( pstrat.size() == 2,
89  "prune parameter must be byshortestpath/byweight,number" );
90  if ( pstrat[0] == "byshortestpath" ) {
91  LINFO ( "Shortest Path n=" << pstrat[1] );
92  shp_ = toNumber<unsigned> ( pstrat[1] );
93  prune_ = std::numeric_limits<float>::max();
94  } else if ( pstrat[0] == "byweight" ) {
95  LINFO ( "Prune by weight b=" << pstrat[1] );
96  shp_ = std::numeric_limits<unsigned>::max();
97  prune_ = toNumber<float> ( pstrat[1] );
98  } else {
99  USER_CHECK ( false,
100  "prune parameter incorrectly set: first parameter is byshortestpath or byweight" );
101  }
102  if ( rg.get<std::string> ( lmkey_) == ""
103  && rg.get<std::string> ( unimapkey_ ) == "" ) return;
104  if ( ! USER_CHECK ( rg.get<std::string> ( lmkey_ ) != ""
105  && rg.get<std::string> ( unimapkey_ ) != "" ,
106  "recaser.lm and recaser.unimap must either be both defined or both left to empty string " ) )
107  return;
108  };
109 
111  bool run ( Data& d ) {
112  d_ = &d;
113  USER_CHECK ( d.fsts.find ( inputkey_ ) != d.fsts.end(),
114  "No input fst to recase?" );
115  fst::ShortestPath<Arc> ( * ( static_cast< fst::VectorFst<Arc> * >
116  (d.fsts[inputkey_] ) ), &olattice_, 1 );
117  fst::Map<Arc> ( &olattice_, fst::RmWeightMapper<Arc>() );
118  run ( &olattice_ );
119  LINFO ( "(Recased) lattice available at key=" << outputkey_ );
120  d.fsts[outputkey_] = &olattice_;
121  return false;
122  };
123 
124  virtual ~DisambigTask() { }
125 
126  private:
128  typedef boost::shared_ptr<ApplyLanguageModelOnTheFlyInterfaceType> ApplyLanguageModelOnTheFlyInterfacePtrType;
129  ApplyLanguageModelOnTheFlyInterfacePtrType almotf_;
130 
131 
132  // Initializes appropriate templated kenlm handler for composition
133  // TODO: this code can be merged with task.applylm and task.hifst
// Creates the language-model handler held in almotf_ the first time it is
// called; later calls return immediately because almotf_ is already set.
//
// Reads d_, so it must only be called after run ( Data& ) has stored the data
// object pointer. Checks that d_->klm holds exactly one entry under lmkey_.
//
// NOTE(review): `mw` is passed to assignKenLmHandler but is not declared on any
// line visible in this listing (source lines 139 and 141 are absent from it);
// confirm its declaration against the original header.
134  void initializeLanguageModelHandler() {
135  if (almotf_.get() ) return; // already initialized
136  USER_CHECK ( d_->klm.find ( lmkey_ ) != d_->klm.end()
137  && d_->klm[lmkey_].size() == 1
138  , "You need to load ONE recasing Language Model!" );
// Labels collected here are handed to assignKenLmHandler as the `epsilons`
// set — presumably labels the language model should treat as epsilon;
// TODO confirm against assignKenLmHandler.
140  unordered_set<Label> epsilons;
142  epsilons.insert ( DR );
143  epsilons.insert ( OOV );
144  epsilons.insert ( EPSILON );
145  epsilons.insert ( SEP );
// Passed as the last argument of assignKenLmHandler; its meaning is not
// visible in this listing.
146  bool a = true;
// The handler is built from the single language model stored at
// d_->klm[lmkey_][0].
147  almotf_.reset(assignKenLmHandler<Arc>
148  ( rg_, lmkey_, epsilons, *(d_->klm[lmkey_][0]), mw, a));
149  }
150 
// Recases the given lattice in place.
//
// Parameters:
//   fst - lattice to process. On the normal path it is overwritten with the
//         processed result, projected onto its output labels. It is left
//         untouched on the two early-return paths below.
//
// Steps, in order:
//   1. Return without changes if d_->fsts has no entry at unimapkey_ or the
//      entry is NULL.
//   2. Make sure the language-model handler almotf_ exists.
//   3. Compose the input with the unigram map (RRhoCompose).
//   4. Tag OOVs using d_->recasingvcblm.
//   5. Run the language-model handler on the mapped lattice.
//   6. Recover the OOVs in the handler's output.
//   7. Reduce the result: n shortest paths (shp_) or pruning by weight (prune_).
//   8. Copy the result into *fst and project it onto output labels.
//
// Ownership: the pointer returned by almotf_->run() is deleted in this
// function, so it is treated here as owned by the caller of almotf_->run().
152  void run ( fst::VectorFst<Arc> *fst ) {
153  if ( d_->fsts.find ( unimapkey_ ) == d_->fsts.end() ) {
154  LINFO ( "No recasing step (key=" << unimapkey_ << " not found)" );
155  return;
156  } else if ( d_->fsts[unimapkey_] == NULL ) {
157  LINFO ( "No recasing step (NULL) " );
158  return;
159  }
160  initializeLanguageModelHandler();
// Borrow the unigram map from the data object; this pointer is not deleted
// anywhere in this function.
161  unimap_ = static_cast<fst::VectorFst<Arc> *> ( d_->fsts[unimapkey_] );
162  LINFO ( "Apply Unigram Model to 1-best" );
163  fst::VectorFst<Arc> mappedinput ( fst::RRhoCompose<Arc> ( *fst, *unimap_ ) );
164  LINFO ( "Tag OOVs" );
165  tagOOVs<Arc> ( &mappedinput, *d_->recasingvcblm );
// Writes mappedinput.fst to the current directory when the LDBG_EXECUTE macro
// runs its argument — presumably only in debug builds; TODO confirm.
166  LDBG_EXECUTE ( mappedinput.Write ( "mappedinput.fst" ) );
167  fst::VectorFst<Arc> *output = almotf_->run(mappedinput);
168  LINFO ( "Recover OOVs" );
169  recoverOOVs<Arc> ( output );
// The constructor sets shp_ to the unsigned maximum when "byweight" is chosen
// and prune_ to the float maximum when "byshortestpath" is chosen, so a value
// below the maximum identifies the selected strategy.
170  if ( shp_ < std::numeric_limits<unsigned>::max() ) {
171  fst::VectorFst<Arc> *aux = new fst::VectorFst<Arc>;
172  LINFO ( "Shortest Path n=" << shp_ );
173  fst::ShortestPath<Arc> ( *output, aux, shp_ );
// Replace the full lattice with the n-best result and release the old one.
174  delete output; output = aux;
175  fst::TopSort<Arc> ( output );
176  } else if ( prune_ < std::numeric_limits<float>::max() ) {
177  LINFO ( "Prune by weight=" << prune_ );
178  fst::Prune<Arc> ( output, mw_ ( prune_ ) );
179  } else {
// Neither strategy is configured. USER_CHECK is described in this listing as
// ending the program, so the exit() below looks like a safeguard.
180  USER_CHECK ( false,
181  "prune parameter incorrectly set: first parameter is byshortestpath or byweight" );
182  exit(EXIT_FAILURE);
183  }
// Copy the result into the caller's lattice, then free the temporary.
184  *fst = *output; delete output;
185  fst::Project ( fst, fst::PROJECT_OUTPUT ); //Recased symbols on output language.
186  }
187 
188  ZDISALLOW_COPY_AND_ASSIGN ( DisambigTask );
189 
190 };
191 
192 }
193 } // end namespaces
194 #endif // TASK_DISAMBIG_HPP
195 
std::vector< std::string > getVectorString(const std::string &key) const
Convenience method that returns a vector of strings taking "," as the separator character.
Definition: registrypo.hpp:245
std::string const kRecaserPrune
#define LINFO(msg)
std::string const kRecaserUnimapLoad
Definition: fstio.hpp:27
bool run(Data &d)
Method inherited from TaskInterface, reads input lattice from data object, disambiguates and then sto...
std::string const kRecaserLmLoad
#define SEP
#define LDBG_EXECUTE(order)
T get(const std::string &key) const
Returns parsed value associated to key.
Definition: registrypo.hpp:194
#define DR
std::string const kRecaserOutput
Templated (hybrid) Interface for Task classes.
Templated functor that creates a weight given a float.
Wrapper to ApplyLanguageModelOnTheFly to apply different kenlm models.
#define EPSILON
Disambig Task tool. Given a search space, applies a unigram transduction model (generating alternativ...
DisambigTask(const ucam::util::RegistryPO &rg, const std::string &inputkey=HifstConstants::kRecaserInput, const std::string &outputkey=HifstConstants::kRecaserOutput, const std::string &lmkey=HifstConstants::kRecaserLmLoad, const std::string &unimapkey=HifstConstants::kRecaserUnimapLoad, bool forceloading=false)
Constructor with registry objects and several keys to access either ucam::util::RegistryPO program op...
Templated functor that creates a weight given a float.
#define USER_CHECK(exp, comment)
Tests whether exp is true. If not, comment is printed and program ends.
Utilities for DisambigTask and related tasks.
T toNumber(const std::string &x)
Converts a string to an arbitrary number type. Quits execution if conversion ...
#define OOV
Definition: bleu.hpp:14
std::string const kRecaserInput