Cambridge SMT System
task.referencefilter.hpp
Go to the documentation of this file.
1 // Licensed under the Apache License, Version 2.0 (the "License");
2 // you may not use these files except in compliance with the License.
3 // You may obtain a copy of the License at
4 //
5 // http://www.apache.org/licenses/LICENSE-2.0
6 //
7 // Unless required by applicable law or agreed to in writing, software
8 // distributed under the License is distributed on an "AS IS" BASIS,
9 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 // See the License for the specific language governing permissions and
11 // limitations under the License.
12 
13 // Copyright 2012 - Gonzalo Iglesias, Adrià de Gispert, William Byrne
14 
21 #ifndef TASK_REFERENCE_FST_HPP
22 #define TASK_REFERENCE_FST_HPP
23 
24 namespace ucam {
25 namespace hifst {
26 
32 template <class Data , class Arc = fst::LexStdArc >
34 
35  typedef typename Arc::Weight Weight;
36  typedef typename Arc::Label Label;
37 
38  //Private variables are shown here. Private methods go after public methods
39  private:
40 
     // Vocabulary of the reference lattice: cleared and refilled by build()
     // (via fst::extractTargetVocabulary) and copied into d.tvcb by run().
42  unordered_set<std::string> vocabulary_;
43 
     // Working lattice: read by loadLattice(), then pruned and reduced, and,
     // unless disablesubstring_ is set, turned into a substring transducer in
     // build(). run() pushes this pointer onto d.filters; unload() deletes it.
45  fst::VectorFst<Arc> *referencesubstringlattice_;
46 
     // Copy of the reduced lattice taken in build() before the substring step.
     // run() stores it in d.fsts[referencelatticekey_]; unload() deletes it.
48  fst::VectorFst<Arc> *referencelattice_;
49 
     // Set to true at the end of build() and back to false by unload().
51  bool built_;
52 
     // File-name patterns expanded with the sentence index d.sidx: the lattice
     // to read (used by run()) and the optional output file (used by write()).
54  ucam::util::IntegerPatternAddress translationlatticefile_,
55  writereferencelatticefile_;
56 
     // Arc type of the lattice on disk; an empty string makes loadLattice()
     // read the file directly as VectorFst<Arc>.
57  std::string translationlatticefilesemiring_;
     // Semiring name checked by loadLattice(): a conversion is only attempted
     // when this equals "tuplearc".
58  std::string semiring_;
59 
     // File name most recently passed to build(); lets build() return early
     // when asked to rebuild the same file.
60  std::string oldfile_;
61 
     // Threshold handed to fst::Prune and to weighted determinization in prune().
63  float weight_;
     // Number of paths requested from fst::ShortestPath in prune().
65  unsigned shortestpath_;
     // Switches for the shortest-path and the weight-based branches of prune().
67  bool useshortestpath_, useweight_;
     // When true, build() skips buildSubstringTransducer and uses the lattice as-is.
69  bool disablesubstring_;
70 
     // Key under which run() stores referencelattice_ in d.fsts.
72  const std::string referencelatticekey_;
73 
74  public:
82  const std::string& referencelatticekey =
84  referencelatticekey_ ( referencelatticekey ),
85  built_ ( false ),
86  translationlatticefile_ ( rg.get<std::string>
88  translationlatticefilesemiring_ ( rg.get<std::string>
90  semiring_ ( rg.get<std::string>
92  writereferencelatticefile_ ( rg.get<std::string>
94  disablesubstring_ ( rg.getBool ( HifstConstants::kReferencefilterSubstring ) ==
95  false ),
96  weight_ ( rg.get<float>
98  shortestpath_ ( rg.get<unsigned>
100  useshortestpath_ ( rg.get<unsigned>
102  std::numeric_limits<unsigned>::max() ),
103  useweight_ ( rg.get<float>
105  std::numeric_limits<float>::max() ),
106  referencesubstringlattice_ ( NULL ),
107  referencelattice_ (NULL) {
108  };
109 
110  inline bool getDisableSubString ( void ) {
111  return disablesubstring_;
112  };
113  inline bool getBuilt ( void ) {
114  return built_;
115  };
116  inline float getWeight ( void ) {
117  return weight_;
118  };
119  inline unsigned getShortestPath ( void ) {
120  return shortestpath_;
121  };
122  inline const unordered_set<std::string>& getVocabulary() {
123  return vocabulary_;
124  };
125  inline const std::string getTranslationLatticeFile() {
126  return translationlatticefile_();
127  };
128 
131  unload();
132  };
133 
136  const std::string& referenceloadkey = HifstConstants::kReferencefilterLoad,
137  const std::string& referencelatticekey
139  if ( rg.exists ( referenceloadkey ) )
140  if ( rg.get<std::string> ( referenceloadkey ) != "" ) return new
141  ReferenceFilterTask ( rg, referencelatticekey );
142  return NULL;
143  }
144 
// Restricts the reference lattice with n-shortest paths and/or weight-based
// pruning followed by weighted determinization. When at least one of the two
// options is enabled, the lattice is replaced by the union of both results;
// when neither is enabled the lattice is left untouched.
149  void prune() {
     // Both start out as empty FSTs; a branch that is switched off leaves its
     // FST empty, so it contributes no paths to the union below.
150  fst::VectorFst<Arc> pruned, dweight;
151  if ( useshortestpath_ ) {
152  LINFO ( "Using shortestpath with reference lattice n=" << shortestpath_ );
     // Runs on the lattice before the in-place Prune below has touched it.
     // NOTE(review): the trailing `true` is presumably OpenFst's `unique`
     // flag - confirm against the ShortestPath overload in use.
153  fst::ShortestPath<Arc> ( *referencesubstringlattice_, &pruned, shortestpath_,
154  true );
155  }
156  if ( useweight_ ) {
157  LINFO ( "Pruning reference lattice with weight=" << weight_ );
     // NOTE(review): `mw` is not declared in the visible lines (the listing
     // jumps from 157 to 159); presumably a functor that builds an Arc weight
     // from a float - confirm in the original header.
     // Prune modifies the member lattice in place; Determinize then reads the
     // pruned lattice and writes its result to dweight.
159  fst::Prune<Arc> ( referencesubstringlattice_, mw ( weight_ ) );
160  LINFO ( "Weighted determinization with weight=" << weight_ );
161  fst::DeterminizeOptions<Arc> dopts;
162  dopts.weight_threshold = mw ( weight_ );
163  fst::Determinize<Arc> ( *referencesubstringlattice_, &dweight, dopts );
164  }
165  if ( useshortestpath_ || useweight_ ) {
166  // does not compile in openfst 1.4.1 (bug with Rational Implementation?)
167  // *referencesubstringlattice_ = ( fst::UnionFst<Arc> ( pruned,
168  // dweight) );
169  // so i'll do instead:
     // Overwrite the lattice with the shortest-path result, then union the
     // weight-determinized result into it.
170  *referencesubstringlattice_ = pruned;
171  fst::Union(referencesubstringlattice_, dweight);
172  }
173  };
174 
179  void reduce() {
180  fst::Map<Arc> ( referencesubstringlattice_,
181  fst::RmWeightMapper<Arc>() ); //finally take weights away, so composition scores not affected.
182  fst::Determinize<Arc> ( fst::RmEpsilonFst<Arc> ( *referencesubstringlattice_ ),
183  referencesubstringlattice_ );
184  fst::Minimize<Arc> ( referencesubstringlattice_ );
185  }
186 
192  void build ( const std::string& file ) {
193  if ( file == "" ) return;
194  if ( built_ && oldfile_ == file ) return;
195  oldfile_ = file;
196  unload();
197  vocabulary_.clear();
198  loadLattice(file);
199  prune();
200  reduce();
201 
202  referencelattice_ = new fst::VectorFst<Arc> ( *referencesubstringlattice_ );
203 
204  if ( !disablesubstring_ ) {
205  LINFO ( "building substring reference" );
206  fst::buildSubstringTransducer<Arc>
207  ( referencesubstringlattice_ ); //now we build a sstransducer
208  } else {
209  LWARN ( "Using lattice as-is... substring will not be implemented!!!" );
210  }
211  fst::ArcSort<Arc> ( referencesubstringlattice_, fst::ILabelCompare<Arc>() );
212  fst::extractTargetVocabulary<Arc> ( *referencesubstringlattice_, &vocabulary_ );
213  built_ = true;
214  };
215 
217  void unload ( void ) {
218  if ( referencesubstringlattice_ ) delete referencesubstringlattice_;
219  referencesubstringlattice_ = NULL;
220  built_ = false;
221  if ( referencelattice_ ) delete referencelattice_;
222  referencelattice_ = NULL;
223  }
224 
226  void write ( Data& d ) {
227  if ( writereferencelatticefile_ ( d.sidx ) != "" )
228  fst::FstWrite ( *referencesubstringlattice_,
229  writereferencelatticefile_ ( d.sidx ) );
230  };
231 
233  bool run ( Data& d ) {
234  LINFO ( "build reference filter from lattice=" << translationlatticefile_.get (
235  d.sidx ) );
236  build ( translationlatticefile_.get ( d.sidx ) );
237  if ( referencesubstringlattice_ ) {
238  d.filters.push_back ( referencesubstringlattice_ );
239  d.tvcb = vocabulary_;
240  d.fsts[referencelatticekey_] = referencelattice_;
241  LINFO ( "Done! Full lattice stored with key="
242  << referencelatticekey_
243  << ", NS=" << static_cast<fst::VectorFst<Arc> *>
244  ( d.fsts[referencelatticekey_])->NumStates() );
245  }
246  write ( d );
247  return false;
248  };
249 
250  private:
251 
// Loads the reference lattice from `file` into referencesubstringlattice_.
// With an empty translationlatticefilesemiring_ the file is read directly as
// VectorFst<Arc>. Otherwise semiring_ must be "tuplearc" (the process exits
// if it is not), and a "lexstdarc" file is read and mapped to TupleArc32.
252  void loadLattice(std::string const &file) {
253  using namespace fst;
254  if (translationlatticefilesemiring_ == "" ) { // use default arc
255  referencesubstringlattice_ = VectorFstRead<Arc> ( file );
256  return;
257  }
258  if (semiring_ != "tuplearc") {
259  LERROR("Conversions currently allowed only from lexstdarc,tropical TO tuplearc)");
260  exit(EXIT_FAILURE);
261  }
     // NOTE(review): on the "lexstdarc" path below this object is overwritten
     // by the reinterpret_cast assignment without being deleted, so it leaks.
     // NOTE(review): for any other non-empty translationlatticefilesemiring_
     // value the function falls off the end without reporting an error, and
     // this empty FST is what the caller gets.
262  referencesubstringlattice_ = new VectorFst<Arc>;
263 
264  if (translationlatticefilesemiring_ == "lexstdarc") {
265  VectorFst<LexStdArc> *aux= VectorFstRead<LexStdArc> ( file );
266  VectorFst<TupleArc32> *vwfst = new VectorFst<TupleArc32>;
267 
268  LINFO ( "Mapping Arc Target Lattice to TupleArc32" );
     // NOTE(review): WeightMapper and mwcopy are not declared in the visible
     // lines (the listing jumps from 268 to 271) - see the original header.
271  Map ( *aux, vwfst, WeightMapper(mwcopy));
272  // bypassing template instance conversions with a reinterpret cast
273  // but this code only happens from tuplearc32 to tuplearc32.
     // The cast is only meaningful when Arc is TupleArc32; the semiring_
     // check above is the only guard for that visible in this function.
274  referencesubstringlattice_ = reinterpret_cast<VectorFst<Arc> *>(vwfst);
275 
276  delete aux;
277  return;
278  }
279  }
280 
281 
283 
284 };
285 
286 }
287 } // end namespaces
288 
289 #endif // TASK_REFERENCE_FST_HPP
std::string const kHifstSemiring
bool run(Data &d)
Runs... Load substring lattice and add pointer in data object.
#define ZDISALLOW_COPY_AND_ASSIGN(TypeName)
void unload(void)
Clean up fsts...
class that expands a wildcard into its actual value. This is useful e.g. for filenames ranging severa...
ReferenceFilterTask(const ucam::util::RegistryPO &rg, const std::string &referencelatticekey=HifstConstants::kReferencefilterNosubstringStore)
Constructor.
const std::string kReferencefilterSubstring
#define LINFO(msg)
Definition: fstio.hpp:27
T get(const std::string &key) const
Returns parsed value associated to key.
Definition: registrypo.hpp:194
void reduce()
Removes weights and reduces the reference lattice with determinization and minimization.
Template specialization of MakeSparseVectorWeight functor for LexStdArc.
fst::TropicalWeightTpl< F > Map(double)
Generates a substring version of a reference translation lattice and associated vocabulary. This substring fst is typically used to guide translation towards a particular search space. The associated vocabulary can be used e.g. to restrict parsing algorithms.
static ReferenceFilterTask * init(const ucam::util::RegistryPO &rg, const std::string &referenceloadkey=HifstConstants::kReferencefilterLoad, const std::string &referencelatticekey=HifstConstants::kReferencefilterNosubstringStore)
Static constructor, returns NULL if the substring lattice is not needed (e.g. hifst not in alignment ...
void prune()
Filters the reference lattice using either shortestpath, weighted determinization or both (union)...
const std::string kReferencefilterLoadSemiring
const std::string kReferencefilterPrunereferenceweight
Templated (hybrid) Interface for Task classes.
void FstWrite(const Fst< Arc > &fst, const std::string &filename, const std::string &txtname="txt")
Templated method that writes an fst either in binary or text format.
Definition: fstio.hpp:111
templated Mapper that modifies weights when copying from one FST to another, passing through the othe...
Templated functor that creates a weight given a float.
#define LWARN(msg)
bool exists(const std::string &key) const
Determines whether a program option (key) has been defined by the user.
Definition: registrypo.hpp:235
void build(const std::string &file)
Given an fst file, builds the unweighted substring transducer.
void write(Data &d)
Write reference substring lattice to [file].
const std::string kReferencefilterWrite
const std::string kReferencefilterPrunereferenceshortestpath
#define LERROR(msg)
const std::string kReferencefilterNosubstringStore
const std::string kReferencefilterLoad
const std::string get(T idx)
Expands string and returns.
Definition: bleu.hpp:14
const unordered_set< std::string > & getVocabulary()