Cambridge SMT System
task.lmbr.hpp
Go to the documentation of this file.
1 // Licensed under the Apache License, Version 2.0 (the "License");
2 // you may not use these files except in compliance with the License.
3 // You may obtain a copy of the License at
4 //
5 // http://www.apache.org/licenses/LICENSE-2.0
6 //
7 // Unless required by applicable law or agreed to in writing, software
8 // distributed under the License is distributed on an "AS IS" BASIS,
9 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 // See the License for the specific language governing permissions and
11 // limitations under the License.
12 
13 // Copyright 2012 - Gonzalo Iglesias, Graeme Blackwood, Adrià de Gispert, William Byrne
14 
15 #ifndef TASK_LMBR_HPP
16 
24 #include "task.lmbr.common.hpp"
27 
28 namespace ucam {
29 namespace lmbr {
30 
32 template <class Data>
33 class LmbrTask: public ucam::util::TaskInterface<Data> {
34 
35  private:
36  typedef fst::NGramList NGramList;
37 
38  ucam::util::NumberRange<float> alpha_, wps_;
39 
40  bool onebest_;
41 
42  unsigned minorder_;
43  unsigned maxorder_;
44 
45  //Lmbr output...
46  fst::VectorFst<fst::StdArc> lmbroutput_;
47 
48  NGramToStateMapper statemapper;
49  Wlist vocab;
50  std::vector<NGramList> ngrams;
51  Theta theta_;
52  NGramToPosteriorsMapper posteriors;
53 
54  //Key to access evidence space
55  const std::string evidencespacekey_;
56  //Key to access hypotheses space
57  const std::string hypothesesspacekey_;
58  //key to write lmbr lattice output
59  const std::string lmbroutputkey_;
60 
61  //For pre-pruning.
62  float ppweight_;
63 
64  bool loadlexstdarc_;
65 
66 //public methods
67  public:
70  const std::string& evidencespacekey = HifstConstants::kLmbrLoadEvidencespace,
71  const std::string& hypothesesspacekey =
73  const std::string& lmbroutputkey = HifstConstants::kLmbrWritedecoder,
74  const std::string& writeonebestkey = HifstConstants::kLmbrWriteonebest,
75  const std::string& alphakey = HifstConstants::kLmbrAlpha,
76  const std::string& wpskey = HifstConstants::kLmbrWps,
77  const std::string& minorder = HifstConstants::kLmbrMinorder,
78  const std::string& maxorder = HifstConstants::kLmbrMaxorder,
79  const std::string& unigramprecisionkey = HifstConstants::kLmbrP,
80  const std::string& precisionratiokey = HifstConstants::kLmbrR,
81  const std::string& numberunigramtokenskey = HifstConstants::kLmbrT,
82  const std::string& preprunekey = HifstConstants::kLmbrPreprune,
83  const std::string& lexstdarckey = HifstConstants::kLmbrLexstdarc
84  ) :
85  evidencespacekey_ (evidencespacekey),
86  hypothesesspacekey_ (hypothesesspacekey),
87  lmbroutputkey_ (lmbroutputkey),
88  minorder_ (rg.get<unsigned> (minorder) ),
89  maxorder_ (rg.get<unsigned> (maxorder) ),
90  alpha_ (rg, alphakey),
91  wps_ (rg, wpskey),
92  ppweight_ (rg.get<float> (preprunekey) ),
93  onebest_ (rg.exists (writeonebestkey) ),
94  loadlexstdarc_ (rg.exists (lexstdarckey) ),
95  theta_ (rg.get<float> (unigramprecisionkey),
96  rg.get<float> (precisionratiokey),
97  rg.get<float> (numberunigramtokenskey),
98  rg.get<unsigned> (minorder),
99  rg.get<unsigned> (maxorder)
100  ) {
101  if (minorder_ < 1 || maxorder_ < 1) {
102  cerr << "error: 'minorder' and/or 'maxorder' < 1 \n";
103  exit (1);
104  }
105  if (minorder_ > 10 || maxorder_ > 10) {
106  cerr << "error: 'minorder' and/or 'maxorder' > 10\n";
107  exit (1);
108  }
109  if (minorder_ > maxorder_) {
110  cerr << "error: 'minorder' > 'maxorder'\n";
111  exit (1);
112  }
113  LINFO ( "min order=" << minorder_ );
114  LINFO ( "max order=" << maxorder_ );
115  ngrams.resize (maxorder_ + 1);
116  };
117 
119  bool run (Data& d) {
120  FORCELINFO ("applying Lattice MBR , sentence " << d.sidx );
121  lmbroutput_.DeleteStates();
122  //Identify evidence space and hypothesis space. Could be the same.
123  fst::VectorFst<fst::StdArc>* fstevd = NULL;
124  fst::VectorFst<fst::StdArc> aux;
126  if (d.fsts.find (evidencespacekey_) != d.fsts.end() ) {
127  if (loadlexstdarc_) { //We have lexstd, no need for second weight at this point, so we just map down to tropical.
128  fst::Map<fst::LexStdArc, fst::StdArc> (*
129  (static_cast<fst::VectorFst<fst::LexStdArc> *> (d.fsts[evidencespacekey_]) ),
130  &aux,
132  d.fsts[evidencespacekey_] =
133  &aux; //rewriting this pointer, would not be accesible for future tasks.
134  }
135  fstevd = static_cast<fst::VectorFst<fst::StdArc> *> (d.fsts[evidencespacekey_]);
136  //Take out OOVs and DRs from evidence space.
137  ru.addIPL (DR, EPSILON)
138  .addOPL (DR, EPSILON)
139  .addIPL (OOV, EPSILON)
140  .addOPL (OOV, EPSILON)
141  .addIPL (SEP, EPSILON)
142  .addOPL (SEP, EPSILON)
143  (fstevd);
144  fst::Determinize (fst::RmEpsilonFst<fst::StdArc> (*fstevd), fstevd);
145  fst::Minimize (fstevd);
146  if (ppweight_ != std::numeric_limits<float>::max() ) {
147  LINFO ("Pruning evidence space, weight=" << ppweight_);
148  fst::Prune (fstevd, ppweight_);
149  fst::Determinize (fst::RmEpsilonFst<fst::StdArc> (*fstevd),
150  fstevd); //Awesome, repeat this.
151  fst::Minimize (fstevd);
152  }
153  }
154  if (fstevd == NULL) {
155  LINFO ("No evidence space provided. Skipping LMBR!");
156  return false;
157  }
158  fst::VectorFst<fst::StdArc>* fsthyp = NULL;
159  fst::VectorFst<fst::StdArc> aux2;
160  if (d.fsts.find (hypothesesspacekey_) != d.fsts.end() ) {
161  if (loadlexstdarc_) { //We have lexstd, no need for second weight at this point, so we just map down to tropical.
162  fst::Map<fst::LexStdArc, fst::StdArc> (*
163  (static_cast<fst::VectorFst<fst::LexStdArc> *> (d.fsts[hypothesesspacekey_]) ),
164  &aux2, fst::LexStdToStdMapper (1) );
165  d.fsts[hypothesesspacekey_] =
166  &aux2; //rewriting this pointer, would not be accesible for future tasks...
167  } else {
168  fsthyp = static_cast<fst::VectorFst<fst::StdArc> * >
169  (d.fsts[hypothesesspacekey_]);
170  }
171  } else fsthyp = fstevd;
172  //Take out OOVs and DRs from hypotheses space.
173  ru (fsthyp);
174  fst::RmEpsilon (fsthyp);
175  //Extract ngrams from evidence space
176  unsigned count = extractNGrams (*fstevd, ngrams, minorder_, maxorder_);
177  LINFO ( count << " ngrams extracted (evidence space)");
178  if (fsthyp != NULL && fsthyp != fstevd) {
179  //Additionally, extract ngrams from hypotheses space
180  unsigned count_hs = extractNGrams (*fsthyp, ngrams, minorder_, maxorder_);
181  LINFO ( count_hs << " ngrams extracted (hypotheses space)");
182  }
183  //Lattice vocabulary...
184  extractSourceVocabulary (*fstevd, &vocab);
185  LINFO ("Fast posterior computing");
186  ComputePosteriors cp (ngrams);
187  for ( alpha_.start(); !alpha_.done (); alpha_.next() ) {
188  LINFO ( "scaling weights by " << std::fixed << std::setprecision (
189  4) << alpha_() );
190  boost::scoped_ptr< fst::VectorFst<fst::StdArc> > scaledfst (FstScaleWeights (
191  fstevd, alpha_() ) );
192  cp (scaledfst.get() );
194  ApplyPosteriors ap (ngrams, pst, theta_);
195  fst::Map (*fsthyp, &lmbroutput_, fst::RmWeightMapper<fst::StdArc>() );
196  boost::scoped_ptr<fst::VectorFst<fst::StdArc> > lmbrlat (ap (lmbroutput_) );
197  fst::VectorFst<fst::StdArc> original (*lmbrlat);
198  for ( wps_.start(); !wps_.done (); wps_.next() ) {
199  fst::Map (original, &lmbroutput_, fst::TimesMapper<fst::StdArc> (wps_() ) );
200  std::string hyp;
201  FstGetBestStringHypothesis(lmbroutput_, hyp);
202  LINFO ("wps=" << wps_() << ":" << hyp);
203  if (onebest_) {
204  // *d.lmbronebest+="alpha=" + toString(alpha_())+ " wps=" + toString(wps_()) + " " + toString(d.sidx)+ ":" + hyp + "\n";
205  d.lmbronebest->alpha.push_back (alpha_() );
206  d.lmbronebest->wps.push_back (wps_() );
207  d.lmbronebest->hyp.push_back (hyp);
208  }
209  }
210  }
211  //Last alpha,wps gets to the lmbr lattice output. This is fine for single alpha/wp values.
212  //If you want to tune and dump a particular lmbr decoder _at the same time_ with a particular alpha and wps,
213  //put these values at the end of the range.
214  d.lmbronebest->idx = d.sidx;
215  d.fsts[lmbroutputkey_] = &lmbroutput_;
216  ngrams.clear();
217  LINFO ("LMBR finished");
218  return false;
219  };
220 
221  private:
222 
223 };
224 
225 }
226 } // end namespaces
227 #endif
void Prune(fst::MutableFst< FunctionArc > *, PruneStats &)
Definition: MertPrune.cpp:19
const std::string kLmbrAlpha
const std::string kLmbrLoadHypothesesspace
void FstGetBestStringHypothesis(const fst::VectorFst< Arc > &latfst, std::string &hyp)
Definition: fstutils.hpp:229
unordered_set< fst::WordId > Wlist
Definition: data.lmbr.hpp:25
#define LINFO(msg)
#define SEP
Based on Graeme Blackwood's PhD work and original code – implementation of posterior computation fro...
const std::string kLmbrLexstdarc
#define FORCELINFO(msg)
NGramToPosteriorsMapper & getPosteriors()
Retrieve reference to posteriors.
RelabelUtil & addIPL(typename Arc::Label labelfind, typename Arc::Label labelreplace)
Definition: fstutils.hpp:511
fst::TropicalWeightTpl< F > Map(double)
unordered_map< fst::NGram, std::vector< std::vector< Posterior > >, ucam::util::hashfvecuint, ucam::util::hasheqvecuint > NGramToPosteriorsMapper
Definition: data.lmbr.hpp:35
#define DR
bool run(Data &d)
Run method inherited from TaskInterface.
Definition: task.lmbr.hpp:119
Functor handling LMBR theta parameters.
void start(void)
Empty implementation.
Definition: range.hpp:107
const std::string kLmbrWritedecoder
const std::string kLmbrPreprune
void next(void)
Increment index.
Definition: range.hpp:111
Lattice MBR task.
Definition: task.lmbr.hpp:33
Templated (hybrid) Interface for Task classes.
LexStdArc to StdArc Mapper.
const std::string kLmbrLoadEvidencespace
const std::string kLmbrMaxorder
const std::string kLmbrT
bool exists(const std::string &source, const std::string &needle)
Convenience function to find out whether a needle exists in a text.
Utility functor for relabeling one or more lattices. Note that you can chain commands. See Unit test in fstutils.gtest.cpp for an example.
Definition: fstutils.hpp:503
const std::string kLmbrMinorder
#define EPSILON
Common lmbr functions.
void extractSourceVocabulary(const fst::VectorFst< Arc > &myfst, unordered_set< std::string > *vcb)
Extract source (left-side) vocabulary from an fst.
Definition: fstutils.hpp:42
fst::VectorFst< fst::StdArc > * FstScaleWeights(fst::VectorFst< fst::StdArc > *fst, const double scale)
Definition: fstutils.hpp:387
bool done(void)
Checks if reached the last element.
Definition: range.hpp:115
std::unordered_map< NGram, StdArc::Weight, ucam::util::hashfvecuint, ucam::util::hasheqvecuint > NGramList
unordered_map< fst::NGram, fst::StdArc::StateId, ucam::util::hashfvecuint, ucam::util::hasheqvecuint > NGramToStateMapper
Definition: data.lmbr.hpp:31
#define OOV
const std::string kLmbrR
Functor that applies posteriors to any hypothesis space. Initializes with previously calculated poste...
Based on Graeme Blackwood's PhD work and original code – implementation of posterior application to ...
const std::string kLmbrP
LmbrTask(const ucam::util::RegistryPO &rg, const std::string &evidencespacekey=HifstConstants::kLmbrLoadEvidencespace, const std::string &hypothesesspacekey=HifstConstants::kLmbrLoadHypothesesspace, const std::string &lmbroutputkey=HifstConstants::kLmbrWritedecoder, const std::string &writeonebestkey=HifstConstants::kLmbrWriteonebest, const std::string &alphakey=HifstConstants::kLmbrAlpha, const std::string &wpskey=HifstConstants::kLmbrWps, const std::string &minorder=HifstConstants::kLmbrMinorder, const std::string &maxorder=HifstConstants::kLmbrMaxorder, const std::string &unigramprecisionkey=HifstConstants::kLmbrP, const std::string &precisionratiokey=HifstConstants::kLmbrR, const std::string &numberunigramtokenskey=HifstConstants::kLmbrT, const std::string &preprunekey=HifstConstants::kLmbrPreprune, const std::string &lexstdarckey=HifstConstants::kLmbrLexstdarc)
Constructor using multiple keys that can be arranged so to use different parameter names...
Definition: task.lmbr.hpp:69
Definition: bleu.hpp:14
uint extractNGrams(fst::VectorFst< Arc > myfst, std::vector< fst::NGramList > &ngramlist, uint minorder=1, uint maxorder=4)
Interfaces with extractNGrams and generates information in the right format for lmbr classes...
const std::string kLmbrWriteonebest
const std::string kLmbrWps