Cambridge SMT System
task.grammar.hpp
Go to the documentation of this file.
1 // Licensed under the Apache License, Version 2.0 (the "License");
2 // you may not use these files except in compliance with the License.
3 // You may obtain a copy of the License at
4 //
5 // http://www.apache.org/licenses/LICENSE-2.0
6 //
7 // Unless required by applicable law or agreed to in writing, software
8 // distributed under the License is distributed on an "AS IS" BASIS,
9 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 // See the License for the specific language governing permissions and
11 // limitations under the License.
12 
13 // Copyright 2012 - Gonzalo Iglesias, Adrià de Gispert, William Byrne
14 
15 #ifndef RULEFILETASK_HPP
16 #define RULEFILETASK_HPP
17 
19 
26 namespace ucam {
27 namespace hifst {
28 
37 template <class Data>
39 
40  //Private variables are shown here. Private methods go after public methods
41  private:
42 
// Addresses evaluated with the sentence index (d.sidx) in run(): the first
// yields the grammar file to load, the second the file patterns are written to.
44  ucam::util::IntegerPatternAddress grammarfile_, patternfile_;
// Name of the grammar file loaded by the previous run(); run() skips
// reloading when the current name equals this one.
46  std::string previous_;
// Pattern comparison object; gd_.ct is pointed at it (no ownership transfer).
47  PatternCompareTool pct_;
// Grammar data filled by load() and exposed to the pipeline via d.grammar.
49  GrammarData gd_;
// Offset within gd_.filecontents at which the next parsed rule will start.
50  uint pos_;
// Priority queue of rule positions: allocated in load_init(), filled by
// parse(), drained and deleted in load_sort().
52  std::priority_queue<posindex, std::vector<posindex>, PosIndexCompare> *vpq_;
53 
56 
// Feature weights; parse() takes their dot product with each rule's weights.
57  std::vector<float> grammarscales_;
// If non-empty, generate_ntorder() writes the non-terminal order to this file.
58  std::string ntorderfile_;
59 
60  public:
68  , std::string const& featureweightskey = HifstConstants::kGrammarFeatureweights
69  , unsigned featureoffset = 0) :
70  previous_ ( "" ),
71  grammarfile_ ( rg.get<std::string> ( HifstConstants::kGrammarLoad ) ),
72  patternfile_ ( rg.get<std::string> ( HifstConstants::kGrammarStorepatterns ) ) ,
73  ntorderfile_ (rg.get<std::string> ( HifstConstants::kGrammarStorentorder) ),
74  grammarscales_ ( ucam::util::ParseParamString<float> ( rg.get<std::string>
75  ( featureweightskey ) ) ) {
76  gd_.ct = &pct_;
77  if (featureoffset ) {
78  std::vector<float> aux (grammarscales_.size() - featureoffset);
79  std::copy (grammarscales_.begin() + featureoffset, grammarscales_.end(),
80  aux.begin() );
81  grammarscales_ = aux;
82  }
83  USER_CHECK ( grammarscales_.size(),
84  "0 feature weights. So the grammar is not a probabilistic model? Not my cup of tea." );
85  };
86 
94  GrammarTask ( const std::string& grammarfilekey = HifstConstants::kGrammarLoad,
95  const std::string& patternfilekey = HifstConstants::kGrammarStorepatterns ) :
96  previous_ ( "" ),
97  grammarfile_ ( grammarfilekey ),
98  patternfile_ ( patternfilekey ) ,
99  grammarscales_ ( ucam::util::ParseParamString<float> ( "1" ) ) {
100  };
101 
107  return &gd_;
108  };
109 
117  bool run ( Data& d ) {
118  std::string thisgrammarfile = grammarfile_ ( d.sidx );
119  if ( thisgrammarfile != previous_ ) {
120  FORCELINFO ( "Loading hierarchical grammar: " << thisgrammarfile );
121  USER_CHECK ( ucam::util::fileExists ( thisgrammarfile ),
122  "This grammar does not exist" );
123  d.stats->setTimeStart ( "load-grammar-patterns" );
124  load ( thisgrammarfile );
125  d.stats->setTimeEnd ( "load-grammar-patterns" );
126  std::string patternfile = patternfile_ ( d.sidx );
127  if ( patternfile != "" ) {
128  ucam::util::oszfstream o ( patternfile );
129  for ( unordered_set<std::string>::iterator itx = gd_.patterns.begin();
130  itx != gd_.patterns.end(); ++itx ) o << *itx << endl;
131  o.close();
132  }
133  previous_ = thisgrammarfile;
134  } else {
135  LINFO ( "Skipping grammar loading..." );
136  }
137  d.grammar = &gd_;
138  return false;
139  };
140 
148  inline void load ( const std::string& file ) {
149  load_init();
150  LINFO ( "=> Loading..." << file );
151  ucam::util::readtextfile<GrammarTask> ( file, *this );
152  load_sort();
153  LINFO ( "Done! ****" );
154  generate_ntorder();
155  };
156 
164  inline void load ( std::stringstream& s ) {
165  load_init();
166  std::string myline;
167  while ( getline ( s, myline ) ) {
168  parse ( myline );
169  }
170  load_sort();
171  LINFO ( "Done!" );
172  generate_ntorder();
173  };
174 
175  virtual ~GrammarTask() {};
176 
177  private:
178 
183  void generate_ntorder() {
184  std::string ntorder;
185  nth_ ( ntorder );
186  LINFO ( "ntorder=" << ntorder );
187  std::vector<std::string> aux;
188  boost::algorithm::split ( aux, ntorder, boost::algorithm::is_any_of ( " ," ) );
189  for ( uint k = 0; k < aux.size(); ++k ) {
190  gd_.vcat[k + 1] = aux[k]; //Note that mapped indices always start from 1
191  gd_.categories[aux[k]] = k + 1;
192  }
193  if (ntorderfile_ != "") {
194  ucam::util::oszfstream o ( ntorderfile_ );
195  for ( uint k = 0; k < gd_.vcat.size(); ++k )
196  o << gd_.vcat[k + 1] << "\t" << k + 1 << std::endl;
197  }
198  }
199 
205  inline void load_init() {
206  pos_ = 0;
207  gd_.reset();
208  gd_.ct = &pct_;
209  vpq_ = new
210  std::priority_queue<posindex, std::vector<posindex>, PosIndexCompare>
211  ( PosIndexCompare ( &gd_.filecontents, gd_.ct ) );
212  };
213 
219  inline void load_sort() {
220  LINFO ( "Sorting indices..." );
221  uint newidx = 0;
222  gd_.sizeofvpos = vpq_->size();
223  gd_.vpos = new
224  posindex[gd_.sizeofvpos]; //peak memory footprint here, we could avoid this by enforcing sorted grammar input (although it would have to meet the same pattern sorting criterion...)
225  LINFO ( gd_.sizeofvpos << " indices" );
226  while ( !vpq_->empty() ) {
227  gd_.vpos[newidx++] = vpq_->top();
228  vpq_->pop();
229  LDEBUG2 ( gd_.getRule ( newidx - 1 ) << " at " << gd_.vpos[newidx - 1].order );
230  }
231  delete vpq_;
232  };
233 
240  __always_inline void parse ( std::string& line ) {
241  using namespace std;
242  using namespace ucam::util;
243 
244  boost::algorithm::trim ( line );
245  if ( line == "" ) return;
246  size_t pos1 = line.find_first_of ( " " ); // src
247  size_t pos2 = line.find_first_of ( " ", pos1 + 1 ); // trg
248  size_t pos3 = line.find_first_of ( " ", pos2 + 1 ); // weight
249 
250  if (pos3 == std::string::npos) {
251  LERROR("Grammar not valid. At least one weight is needed: \n=>\t" << line);
252  exit(EXIT_FAILURE);
253  }
254  size_t pos4 = line.find_first_of ( "\t"); // optional alignments
255  if (pos4 == std::string::npos) pos4 = line.size();
256  LDEBUG("pos1=" << pos1 << ",pos2=" << pos2 << ",pos3=" << pos3 << ",pos4=" << pos4);
257 
258  vector<float> weights;
259  ParseParamString<float> ( line, weights, pos3 + 1 , pos4 - pos3 - 1 );
260  string sweight = toString<float>
261  ( dotproduct (weights, grammarscales_ ), numeric_limits<unsigned>::max() );
262  trim_trailing_zeros ( sweight );
263  line = ( pos4 <line.size() )
264  ? line.substr ( 0, pos3 + 1 ) + sweight + line.substr(pos4)
265  : line.substr ( 0, pos3 + 1 ) + sweight;
266 
267 
268  LDEBUG("Adding line=[" << line << "]");
269  gd_.filecontents += line + '\n';
270  posindex pi;
271  bool waitingfornextfield = false;
272  unsigned cf = 2; //Second field
273  char previous = ' ';
274  for ( unsigned k = 0; k < line.size(); ++k ) {
275  if ( previous == ' ' && line[k] != ' ' ) --cf;
276  if ( !cf ) {
277  pi.o = k;
278  break;
279  }
280  previous = line[k];
281  }
282  pi.p = pos_ + pi.o;
283  string pattern;
284  bool word = false;
285  bool nt = false;
286  for ( unsigned k = pi.o; k < line.size(); ++k ) {
287  if ( line[k] == ' ' ) break;
288  if ( line[k] >= '0' && line[k] <= '9' ) {
289  if ( !word && !nt ) {
290  pattern += 'w';
291  word = true;
292  nt = false;
293  }
294  } else if ( line[k] >= 'A' && line[k] <= 'Z' ) {
295  if ( !nt ) {
296  pattern += 'X';
297  nt = true;
298  word = false;
299  }
300  } else {
301  pattern += line[k];
302  nt = word = false;
303  }
304  }
305  if ( gd_.patterns.find ( pattern ) == gd_.patterns.end() ) {
306  gd_.patterns.insert ( pattern );
307  }
308  pi.order = vpq_->size();
309  vpq_->push ( pi );
310  pos_ += line.size() + 1;
311  LDEBUG2 ( "reading rule " << line << ", at line " << pi.order << ", pattern=" <<
312  pattern );
313  if ( pattern == "X" ) {
314  LINFO ( "Identity rule detected:" << line << "===" );
315  nth_.insertIdentityRule ( line );
316  } else {
317  nth_.insertLHS ( line.substr ( 0, pi.o - 1 ) );
318  }
319  };
320 
// Friend declaration for ucam::util::readtextfile, which load() invokes with
// *this; presumably needed so it can call the private parse() method.
322  template <typename FM>
323  friend inline void ucam::util::readtextfile ( const std::string& filename,
324  FM& fm );
325 
327 
328 };
329 
330 }
331 } // end namespaces
332 #endif
Wrapper stream class that writes to pipes, text files or gzipped files.
Definition: szfstream.hpp:200
std::size_t order
absolute index
#define ZDISALLOW_COPY_AND_ASSIGN(TypeName)
GrammarTask(const std::string &grammarfilekey=HifstConstants::kGrammarLoad, const std::string &patternfilekey=HifstConstants::kGrammarStorepatterns)
Constructor used for unit testing.
Class that provides "pattern" comparison between two const char *. The "patterns" are an abstraction ...
class that expands a wildcard into its actual value. This is useful e.g. for filenames ranging severa...
grammar_categories_t categories
Ordered list of non-terminals (listed in hierarchical order according to identity rules) ...
#define LDEBUG2(msg)
Struct containing rule positions and offsets.
#define LINFO(msg)
const std::string kGrammarFeatureweights
#define FORCELINFO(msg)
#define LDEBUG(msg)
CompareTool * ct
Pointer to a Comparison object, assumed no ownership.
std::vector< T > ParseParamString(const std::string &stringparams, size_t pos=0)
Function to parse string of parameters, e.g. separated by commas.
Definition: params.hpp:51
unordered_set< std::string > patterns
Patterns in these rules.
Struct containing grammar rules.
posindex * vpos
Sorted Indices.
Task class that loads a grammar into memory.
Templated (hybrid) Interface for Task classes.
iszfstream & getline(iszfstream &izs, std::string &line)
Definition: szfstream.hpp:178
void trim_trailing_zeros(std::string &snumber)
std::string filecontents
The whole grammar.
const std::string getRule(std::size_t idx) const
Gets a rule indexed by idx. Rule format: LHS RHSSource RHSTarget weight.
void load(const std::string &file)
Loads rules from a grammar file.
float dotproduct(std::vector< float > &v1, std::vector< float > &v2)
Implements dot product.
void readtextfile(const std::string &filename, FM &fm)
Function that reads from a file. Templated on any external class with a parse method.
Definition: szfstream.hpp:359
bool fileExists(const std::string &fileName)
this class decides automatically the hierarchy of non-terminals
Functor Class that provides comparison across the posindex structure. This is typically used e...
#define USER_CHECK(exp, comment)
Tests whether exp is true. If not, comment is printed and program ends.
GrammarData * getGrammarData()
Returns GrammarData.
const std::string kGrammarStorepatterns
#define LERROR(msg)
bool run(Data &d)
ucam::util::TaskInterface mandatory method implementation. This method loads the hierarchical grammar...
void insertIdentityRule(const std::string &identityrule)
Method to store identity rules, i.e. S -> X X , etc.
This is a functor with additional methods to include relevant rules (i.e. identify SCFG rules...
const std::string kGrammarStorentorder
grammar_inversecategories_t vcat
GrammarTask(ucam::util::RegistryPO const &rg, std::string const &featureweightskey=HifstConstants::kGrammarFeatureweights, unsigned featureoffset=0)
Constructor.
std::size_t sizeofvpos
Number of rules.
void reset()
Reset object.
Definition: bleu.hpp:14
const std::string kGrammarLoad
void load(std::stringstream &s)
Loads rules from a stringstream.
void close()
Closes the file.
Definition: szfstream.hpp:323