Cambridge SMT System
task.loadsparseweightflowerfst.hpp
Go to the documentation of this file.
1 // Licensed under the Apache License, Version 2.0 (the "License");
2 // you may not use these files except in compliance with the License.
3 // You may obtain a copy of the License at
4 //
5 
6 // http://www.apache.org/licenses/LICENSE-2.0
7 //
8 // Unless required by applicable law or agreed to in writing, software
9 // distributed under the License is distributed on an "AS IS" BASIS,
10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 // See the License for the specific language governing permissions and
12 // limitations under the License.
13 
14 // Copyright 2012 - Gonzalo Iglesias, Adrià de Gispert, William Byrne
15 
16 #ifndef TASK_RULES2FLOWERFST_HPP
17 #define TASK_RULES2FLOWERFST_HPP
18 
26 namespace ucam {
27 namespace hifst {
28 
30 template<class DataT>
32  // private:
33  protected:
34 
// Fst with the flower lattice itself.
36  fst::VectorFst<TupleArc32> flowerlattice_;
37 
// Sparse tuple-weight scales; bound in the constructor to the vector returned by
// fst::TropicalSparseTupleWeight<float>::Params().
39  std::vector<float>& fscales_;
40 
43 
46 
// Name of the grammar file most recently passed to load(); checkGrammarFile()
// compares against it to skip reloading the same file.
48  std::string previousfile_;
49 
52 
55 
// Set to true by load() once a text rule file has been parsed to the end.
57  bool built_;
58 
// Key with access to the registry object for the grammar file name.
60  const std::string grammarloadkey_;
// Key to store in the data object.
62  const std::string grammarstorekey_;
63 
// Added to every feature position when load() builds the sparse rule weights.
65  const unsigned offset_;
66  public:
69  const unsigned offset =
70  1, //minimum offset considering only one language model...
71  const std::string& alignmentlattices = "",
72  const std::string& grammarloadkey = HifstConstants::kRuleflowerlatticeLoad,
73  const std::string& grammarstorekey = HifstConstants::kRuleflowerlatticeStore
74  ) :
75  offset_ ( offset ),
76  rg_ ( rg ),
77  alilats_ ( rg.exists ( alignmentlattices ) ? rg.get<std::string>
78  ( alignmentlattices ) : "" ),
79  grammar_ ( rg.get<std::string> ( grammarloadkey ) ),
80  fscales_ ( fst::TropicalSparseTupleWeight<float>::Params() ),
81  filterbyalilats_ ( rg.exists ( alignmentlattices ) ),
82  grammarstorekey_ ( grammarstorekey ) {
83  };
84 
86  bool run ( DataT& d ) {
87  load ( grammar_ ( d.sidx ) );
89  return false;
90  };
91 
93  bool directload ( const std::string& filename ) {
94  bool directload = false;
95  std::vector<std::string> vgn;
96  boost::algorithm::split ( vgn, filename, boost::algorithm::is_any_of ( "." ) );
97  if ( vgn[vgn.size() - 1] == "fst" ) directload = true;
98  else if ( vgn.size() > 2 ) if ( vgn[vgn.size() - 1] == "gz"
99  && vgn[vgn.size() - 2] == "fst" ) directload = true;
100  if ( directload ) {
101  LINFO ( "Loading FST directly (assumes arcsorted flower) =" << filename );
102  fst::VectorFst<TupleArc32> *yupi = fst::VectorFstRead<TupleArc32> ( filename );
103  flowerlattice_ = *yupi;
104  delete yupi;
105  return true;
106  }
107  return false;
108  }
109 
110 
111  bool checkGrammarFile(std::string const &filename) {
112  if ( !USER_CHECK ( filename != "" , "No grammar to load?" ) ) return false;
113  if ( filename == previousfile_ ) {
114  LINFO ( "Skipping grammar loading..." );
115  return false;
116  }
117  return true;
118  }
119 
120  void updateFilename(std::string const &filename) {
121  previousfile_ = filename;
122  }
123 
124  virtual void gatherRuleIds(unordered_set<unsigned> &idxrules
125  , bool filterbyalilats
127  , ucam::util::RegistryPO const &rg ) {
128 
129  using namespace ucam::util;
130  using namespace fst;
131  if ( filterbyalilats )
132  for ( IntRangePtr ir (IntRangeFactory ( rg ) )
133  ; !ir->done ()
134  ; ir->next () ) {
135  VectorFst<LexStdArc> *alilatsfst =
136  VectorFstRead<LexStdArc> ( alilats_ ( ir->get() ) );
137  extractSourceVocabulary<LexStdArc> ( *alilatsfst, &idxrules );
138  delete alilatsfst;
139  }
140  }
141 
142  virtual void initStructure() {
143  flowerlattice_.AddState();
144  flowerlattice_.SetStart ( 0 );
145  flowerlattice_.SetFinal ( 0, TupleArc32::Weight::One() );
146  }
147 
148  virtual void fillStructure(unsigned label, TupleArc32::Weight const & vtcost) {
149  flowerlattice_.AddArc ( 0, TupleArc32 ( label, label, vtcost, 0 ) );
150  }
151 
152  virtual void closeStructure() {
153  fst::ArcSort<TupleArc32> ( &flowerlattice_, fst::ILabelCompare<TupleArc32>() );
154  }
162  bool load ( const std::string& filename ) {
163  if (!checkGrammarFile(filename)) return false;
164  updateFilename(filename);
165 
166  FORCELINFO ( "loading grammar from " << filename );
167  if ( directload ( filename ) ) return true;
168  unordered_set<unsigned> idxrules;
169  gatherRuleIds(idxrules, filterbyalilats_, alilats_, rg_ );
170  this->initStructure();
171 
172  unsigned lc = 0, llc = 0;
173  ucam::util::iszfstream myrulefile;
174  myrulefile.open ( filename.c_str() );
175  LINFO ( "Opening rule file " << filename );
176  if ( !myrulefile.is_open() ) {
177  LERROR ( "Failed to open " << filename );
178  return false;
179  }
180  do {
181  std::string line;
182  getline ( myrulefile, line );
183  ++lc;
184  if ( line.size() > 0 ) {
185  while ( line.at ( line.length() - 1 ) == ' ' )
186  line.erase ( line.length() - 1 );
187  while ( line.at ( 0 ) == ' ' )
188  line.erase ( 0, 1 );
189  if ( line.at ( 0 ) == '#' || line.at ( 0 ) == '%' ) line = "";
190  }
191  if ( line == "" ) {
192  LDEBUG ( lc << "is an empty line/comment." );
193  continue;
194  }
195  if ( ! ( lc % 100000 ) ) LINFO ( lc << " rules parsed..." );
196  if ( !idxrules.empty() )
197  if ( idxrules.find ( lc ) == idxrules.end() ) continue;
198  llc++;
199  TupleW32 vtcost;
200  std::vector<std::string> fields;
201  boost::algorithm::split ( fields, line, boost::algorithm::is_any_of ( " " ) );
202  for ( unsigned k = 3; k < fields.size(); ++k ) {
203  float prob;
204  unsigned pos = 0;
205  std::vector<std::string> splitfields;
206  boost::algorithm::split ( splitfields, fields[k],
207  boost::algorithm::is_any_of ( "@" ) );
208  using ucam::util::toNumber;
209  if ( splitfields.size() == 2 ) {
210  if ( splitfields[0] == "" ) prob = 1.000f;
211  else prob = toNumber<float> ( splitfields[0] );
212  pos = offset_ + toNumber<unsigned> ( splitfields[1] );
213  } else {
214  prob = toNumber<float> ( fields[k] );
215  pos = offset_ + k - 2;
216  LDEBUG ( pos );
217  }
218  vtcost.Push ( pos, prob );
219  // splitfields.clear();
220  }
221  unsigned label = lc;
222  this->fillStructure(label, vtcost);
223 
224  } while ( myrulefile && !myrulefile.eof() );
225  LINFO ( "File: " << filename << " has been succesfully loaded" );
226  myrulefile.close();
227  LINFO ( "Number of rules actually loaded for this job: " << llc );
228  this->closeStructure();
229  return built_ = true;
230  };
231 
232  private:
234 };
235 
236 
237 template<class DataT>
239  private:
240  unordered_map<unsigned,TupleArc32::Weight> weights_;
241 
242  public:
244  const unsigned offset =1, //minimum offset considering only one language model...
245  const std::string& alignmentlattices = HifstConstants::kRulesToWeightsLoadalilats,
246  const std::string& grammarloadkey = HifstConstants::kRulesToWeightsLoadGrammar
247  )
248  : LoadSparseWeightFlowerLatticeTask<DataT>(rg, offset, alignmentlattices, grammarloadkey)
249  {}
250  bool run ( DataT& d ) {
252  d.weights = &weights_;
253  }
254 
255  virtual void initStructure() {}
256 
257  virtual void fillStructure(unsigned label, TupleArc32::Weight const & vtcost) {
258  // rule ids start from 2 for "reasons" (openfst).
259  weights_[label + 2 ] = vtcost;
260  weights_[label + 2 ].SetDefaultValue(0);
261  LDEBUG("Loading rule id=" << label + 2 << " with weights=" << weights_[label + 2 ] );
262  }
263 
264  virtual void closeStructure() {}
265 
266  virtual void gatherRuleIds(unordered_set<unsigned> &idxrules
267  , bool filterbyalilats
269  , ucam::util::RegistryPO const &rg ) {
270  using namespace ucam::util;
271  using namespace fst;
272 
273  if ( ! filterbyalilats ) return;
274  // @todo -- need to read into the weights, gather rule ids there.
275  }
276 };
277 
278 
279 }} // end namespaces
280 
281 #endif // TASK_RULES2FLOWERFST_HPP
#define ZDISALLOW_COPY_AND_ASSIGN(TypeName)
bool directload(const std::string &filename)
If it is an fst, load directly.
class that expands a wildcard into its actual value. This is useful e.g. for filenames ranging severa...
const std::string kRuleflowerlatticeLoad
#define LINFO(msg)
bool run(DataT &d)
Inherited method from ucam::util::TaskInterface. Loads the flower lattice into the data object...
Definition: fstio.hpp:27
void open(const std::stringstream &ss)
Definition: szfstream.hpp:75
bool filterbyalilats_
If true, the grammar flower lattice will be loaded only with rules that have been used in the alignme...
#define FORCELINFO(msg)
const ucam::util::RegistryPO & rg_
Registry object – contains program options.
boost::scoped_ptr< NumberRangeInterface< unsigned > > IntRangePtr
Definition: range.hpp:214
#define LDEBUG(msg)
std::vector< float > & fscales_
sparse tuple-weight scales
#define IntRangeFactory
Definition: range.hpp:213
bool is_open()
Checks if the file/pipe is open.
Definition: szfstream.hpp:132
Implements Tropical Sparse tuple weight semiring, extending from openfst SparsePowerWeight class...
bool run(DataT &d)
Inherited method from ucam::util::TaskInterface. Loads the flower lattice into the data object...
Templated (hybrid) Interface for Task classes.
virtual void fillStructure(unsigned label, TupleArc32::Weight const &vtcost)
iszfstream & getline(iszfstream &izs, std::string &line)
Definition: szfstream.hpp:178
const std::string grammarstorekey_
Key to store in the data object.
const std::string grammarloadkey_
Key with access to registry object for the grammar file name.
const std::string kRulesToWeightsLoadalilats
bool exists(const std::string &source, const std::string &needle)
Convenience function to find out whether a needle exists in a text.
const std::string kRuleflowerlatticeStore
void close()
Closes file.
Definition: szfstream.hpp:147
ucam::util::IntegerPatternAddress alilats_
Alignment lattices file names.
fst::ArcTpl< TupleW32 > TupleArc32
const std::string kRulesToWeightsLoadGrammar
#define USER_CHECK(exp, comment)
Tests whether exp is true. If not, comment is printed and program ends.
virtual void fillStructure(unsigned label, TupleArc32::Weight const &vtcost)
virtual void gatherRuleIds(unordered_set< unsigned > &idxrules, bool filterbyalilats, ucam::util::IntegerPatternAddress &alilats, ucam::util::RegistryPO const &rg)
T toNumber(const std::string &x)
Converts a string to an arbitrary number. Quits execution if conversion ...
#define LERROR(msg)
virtual void gatherRuleIds(unordered_set< unsigned > &idxrules, bool filterbyalilats, ucam::util::IntegerPatternAddress &alilats, ucam::util::RegistryPO const &rg)
fst::VectorFst< TupleArc32 > flowerlattice_
Fst with the flower lattice itself.
bool load(const std::string &filename)
Load flower lattice from file.
LoadSparseWeightFlowerLatticeTask(const ucam::util::RegistryPO &rg, const unsigned offset=1, const std::string &alignmentlattices="", const std::string &grammarloadkey=HifstConstants::kRuleflowerlatticeLoad, const std::string &grammarstorekey=HifstConstants::kRuleflowerlatticeStore)
Constructor with registry object, offset and keys.
virtual int eof()
Checks for end-of-file.
Definition: szfstream.hpp:142
Wrapper stream class that reads pipes, text files or gzipped files.
Definition: szfstream.hpp:34
ucam::util::IntegerPatternAddress grammar_
Grammar file name.
Definition: bleu.hpp:14
Implements a class that loads the grammar sparseweight flower lattice and stores a pointer on the dat...
LoadSparseWeightsTask(const ucam::util::RegistryPO &rg, const unsigned offset=1, const std::string &alignmentlattices=HifstConstants::kRulesToWeightsLoadalilats, const std::string &grammarloadkey=HifstConstants::kRulesToWeightsLoadGrammar)