Cambridge SMT System
task.cykparser.hpp
Go to the documentation of this file.
1 // Licensed under the Apache License, Version 2.0 (the "License");
2 // you may not use these files except in compliance with the License.
3 // You may obtain a copy of the License at
4 //
5 // http://www.apache.org/licenses/LICENSE-2.0
6 //
7 // Unless required by applicable law or agreed to in writing, software
8 // distributed under the License is distributed on an "AS IS" BASIS,
9 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 // See the License for the specific language governing permissions and
11 // limitations under the License.
12 
13 // Copyright 2012 - Gonzalo Iglesias, AdriĆ  de Gispert, William Byrne
14 
22 #ifndef CYKPARSER_HPP
23 #define CYKPARSER_HPP
24 
25 #define CYK_RETURN_FAILURE 0
26 
27 namespace ucam {
28 namespace hifst {
29 
31 template <class Data>
33 
34  //Private variables are shown here. Private methods go after public methods
35  private:
36 
37  unsigned finalresult_;
38 
40  unsigned hrmh_;
41 
43  unordered_map <std::string, unsigned> hmax_;
45  unordered_map <std::string, unsigned> hmin_;
46 
48  unsigned nnt_;
49 
51  CYKdata cykdata_;
52 
54  Data *d_;
55 
56  public:
57 
63  d_ ( NULL ),
64  finalresult_ ( CYK_RETURN_FAILURE ),
65  hrmh_ ( rg.get<unsigned> ( HifstConstants::kCykparserHrmaxheight ) ),
66  hmax_ ( rg.getPairMappedStringUInt ( HifstConstants::kCykparserHmax ) ),
67  hmin_ ( rg.getPairMappedStringUInt ( HifstConstants::kCykparserHmin ) ) {
68  cykdata_.nt_exceptions_maxspan = rg.getSetString (
70  LDEBUG ( "Constructor done!" );
71  };
72 
79  bool run ( Data& d ) {
80  d.cykdata = &cykdata_;
81  d_ = &d;
82  const GrammarData& gd = *d_->grammar;
84  cykdata_.freeMemory();
85  grammar_categories_t& categories = cykdata_.categories;
86  categories = gd.categories;
87  grammar_inversecategories_t& vcat = cykdata_.vcat;
88  vcat = gd.vcat;
89  unsigned& nnt = cykdata_.nnt;
90  nnt = categories.size();
91  cykparser_sentence_t& sentence = cykdata_.sentence;
92  sentence.clear();
95  USER_CHECK ( cykdata_.categories["S"] == 1, "Mandatory to use S as first guy" );
96  USER_CHECK ( cykdata_.vcat[1] == "S",
97  "Mandatory to have an S as the first non-terminal guy" );
98  std::vector<std::string> aux;
99  boost::algorithm::split ( aux, d.sentence,
100  boost::algorithm::is_any_of ( " " ) );
101  for ( unsigned k = 0; k < aux.size(); ++k ) {
102  sentence.push_back ( ucam::util::toNumber<unsigned> ( aux[k] ) );
103  if ( categories.find ( aux[k] ) == categories.end() ) {
104  categories[aux[k]] = k + nnt + 1;
105  cykdata_.cykgrid.Add ( k + nnt + 1, k, 0, sentence[k] );
106  } else {
107  cykdata_.cykgrid.Add ( categories[aux[k]], k, 0, sentence[k] );
108  }
109  vcat[k + nnt + 1] = aux[k];
110  }
111  USER_CHECK ( vcat[1] == "S",
112  "S must be the top guy in the non-terminal hierarchy" );
113  USER_CHECK ( categories["S"] == 1,
114  "S must be the top guy in the non-terminal hierarchy (2)" );
115  USER_CHECK ( aux.size() == sentence.size(),
116  "sentence not properly initialized?" );
117  d_->stats->setTimeStart ("cyk");
118  LDEBUG ( "Now start stage 1: Initialize Bottom row of the chart" );
119  fillBottomChart();
120  LDEBUG ( "Start stage 2: fill bottom-up the chart" );
121  fillChart();
122  d_->stats->setTimeEnd ("cyk");
123  finalresult_ = cykdata_.cykgrid ( cykdata_.categories["S"], 0,
124  sentence.size() - 1 ).size();
125  if ( finalresult_ > 0 ) {
126  LINFO ( "CYK success:" << finalresult_ << " parse trees" );
127  } else {
128  LINFO ( "CYK failed!" );
129  }
130  d.cykdata->storeRuleCounts ( d.stats->rulecounts );
131  d.stats->numcats = d.cykdata->vcat.size();
132  LINFO ( "Number of rules applied:" << d.stats->rulecounts.size() );
133  cykdata_.success = finalresult_;
134  return false;
135  };
136 
138  ~CYKParserTask ( void ) {
139  hmin_.clear();
140  hmax_.clear();
141  };
142 
144  const CYKdata *getcykdata() {
145  return &cykdata_;
146  };
147 
150  return finalresult_;
151  };
152 
153  private:
154 
169  void examineRule ( unsigned x, unsigned ymax, unsigned rule_idx,
170  unsigned rule_pos, cykparser_rulebpcoordinates_t coord ) {
171  cykparser_ruledependencies_t& rd = cykdata_.rd;
173  SentenceSpecificGrammarData& ssgd = *d_->ssgd;
174  std::string cat = ssgd.getRHSSource ( rule_idx, rule_pos );
175  getFilteredNonTerminal ( cat );
176  unsigned regla_tam = ssgd.getRHSSourceSize ( rule_idx );
177  int x2;
178  unsigned z = ( cykdata_.categories[cat] == 0 ) ? 0 : 1;
179  for ( unsigned n = 0; n < ymax; n++ ) {
180  if ( cykdata_.cykgrid ( cykdata_.categories[cat], x, n ).size() > 0 ) {
181  coord2 = coord; //copy coord, we will need to reuse later.
182  coord2.push_back ( cykdata_.categories[cat] );
183  coord2.push_back ( x );
184  coord2.push_back ( n );
185  if ( rule_pos == regla_tam - 1 ) { //last element of the rule
186  if ( n == ymax -
187  1 ) { //We are in the diagonal, therefore span is complete -- full span verified.
188  rd.push_back ( coord2 );
189  // LDEBUG("span completely verified!, n="<<n<<",ymax="<<ymax);
190  } else {
191  // LDEBUG("span not verified, rule not valid, n="<<n<<",ymax="<<ymax);
192  }
193  } else {
194  if ( ymax > 1 ) {
195  x2 = n + x + 1; //next x.
196  //This element verified, now look next one.
197  examineRule ( x2, ymax - n - 1, rule_idx, rule_pos + 1, coord2 );
198  }
199  }
200  }
201  z = 0;
202  }
203  return;
204  };
205 
213  void insertMonoRHSRules ( unsigned x, unsigned y ) {
215  grammar_categories_t& categories = cykdata_.categories;
216  grammar_inversecategories_t& vcat = cykdata_.vcat;
217  unsigned& nnt = cykdata_.nnt; //number of non-terminals
218  SentenceSpecificGrammarData& ssgd = *d_->ssgd;
220  for ( unsigned cc = nnt; cc > 1;
221  --cc ) { // cc>1 as for 1 there will be no rule with cc lower.
222  if ( cykdata_.cykgrid ( cc, x, y ).size() > 0 ) {
223  LDEBUG ( vcat[cc] << "," << x << "," << y << ":" <<
224  "Searching for rules with only one element" );
225  if ( ssgd.rulesWithRhsSpan1[x].find ( vcat[cc] ) !=
226  ssgd.rulesWithRhsSpan1[x].end() )
227  rules = ssgd.rulesWithRhsSpan1[x][vcat[cc]];
228  else rules.clear();
229  LDEBUG ( "Found: " << rules.size() );
230  for ( unsigned jpts = 0; jpts < rules.size(); ++jpts ) {
231  unsigned& idx = rules[jpts];
232  unsigned head = categories[ssgd.getLHS ( idx )];
233  // if ( head <= cykdata_.numberTopNonTerminals && x > 0 ) continue;
234  //if not listed as exception, then discard
235  if ( y >= hrmh_ &&
236  cykdata_.nt_exceptions_maxspan.find ( ssgd.getLHS ( idx ) ) ==
237  cykdata_.nt_exceptions_maxspan.end() ) continue;
238  if ( head >= cc ) {
239  LWARN ( ssgd.getLHS ( idx ) << "," << x << "," << y << ":" <<
240  "Potentially nasty looping rule (filtering): " << ssgd.getRule ( idx ) );
241  continue;
242  }
243  LDEBUG ( ssgd.getLHS ( idx ) << "," << x << "," << y << ":" << "Accepted: " <<
244  ssgd.getRule ( idx ) << ", original rule_id=" << ssgd.getIdx (
245  idx ) << ",vs" << idx );
246  cykdata_.cykgrid.Add ( head, x, y, idx );
247  coord.clear();
248  coord.push_back ( cc );
249  coord.push_back ( x );
250  coord.push_back ( y );
251  cykdata_.bp.Add ( head, x, y, coord );
252  }
253  }
254  }
255  };
256 
261  void fillBottomChart() {
262  grammar_categories_t& categories = cykdata_.categories;
263  cykparser_sentence_t& sentence = cykdata_.sentence;
264  // const GrammarData &gd = *d_->grammar;
265  SentenceSpecificGrammarData& ssgd = *d_->ssgd;
267  for ( unsigned x = 0; x < sentence.size(); ++x ) {
268  std::string cat = ucam::util::toString ( sentence[x] );
269  if ( ssgd.rulesWithRhsSpan1[x].find ( cat ) !=
270  ssgd.rulesWithRhsSpan1[x].end() ) {
271  rules = ssgd.rulesWithRhsSpan1[x][cat]; //In this case it should be identical...
272  } else {
273  LDEBUG ( "No rules to process" );
274  rules.clear();
275  }
276  for ( unsigned p = 0; p < rules.size(); ++p ) {
277  unsigned& idx = rules[p];
278  LDEBUG ( ssgd.getLHS ( idx ) << "," << x << ",0:" << "Accepted: " <<
279  ssgd.getRule ( idx ) << ", original rule_id=" << ssgd.getIdx (
280  idx ) << ",vs" << idx );
281  cykdata_.cykgrid.Add ( categories[ssgd.getLHS ( idx )], x, 0, idx );
283  cdi.push_back ( categories[cat] );
284  cdi.push_back ( x );
285  cdi.push_back ( 0 ); //pointing to the word itself.
286  cykdata_.bp.Add ( categories[ssgd.getLHS ( idx )], x, 0, cdi );
287  }
288  insertMonoRHSRules ( x, 0 );
289  }
290  };
291 
308  void fillChart() {
309  grammar_categories_t& categories = cykdata_.categories;
310  grammar_inversecategories_t& vcat = cykdata_.vcat;
311  cykparser_sentence_t& sentence = cykdata_.sentence;
312  unsigned& nnt = cykdata_.nnt;
313  unordered_map<std::string, cykparser_ruledependencies_t > minicache;
314  cykparser_ruledependencies_t& rd = cykdata_.rd;
316  const GrammarData& gd = *d_->grammar;
317  SentenceSpecificGrammarData& ssgd = *d_->ssgd;
319  for ( unsigned y = 1; y < sentence.size(); ++y ) {
320  for ( unsigned x = 0; x < sentence.size() - y; ++x ) {
321  for ( unsigned n = 0; n < y; n++ ) {
322  //just an alias.
323  unsigned& x1 = x;
324  unsigned y1 = n;
325  unsigned x2 = y1 + x1 + 1;
326  for ( unsigned cc = 1; cc <= nnt + sentence.size();
327  ++cc ) { //Only in this case we have to make sure it exists.
328  std::string cat = vcat[cc];
329  // prevent non terminals other than the top nonterminals (e.g. S) to have a span greater or equal to hrmaxheight
330  if ( cykdata_.cykgrid ( cc, x1, y1 ).size() == 0 ) continue;
331  //Keep caching at the same position for identical rules with different translations
332  minicache.clear();
333  rules = ssgd.rulesWithRhsSpan2OrMore[x][cat];
334  LDEBUG ( vcat[cc] << "," << x << "," << y << ":" << " Obtained " << ucam::util::toString (
335  rules.size() ) << " rules." );
336  for ( unsigned i = 0; i < rules.size(); i++ ) {
337  unsigned idx = rules[i];
338  LDEBUG ( vcat[cc] << "," << x << "," << y << ":" << "Testing rule idx=" << idx
339  << ",value=" << ssgd.getRule ( idx ) << "; Iteration=" << i );
340  if ( ssgd.getRHSSourceSize ( idx ) > y + 1 ) {
341  LDEBUG ( vcat[cc] << "," << x << "," << y << ":" <<
342  "Rejected (rule minimum span too big): " << ssgd.getRule ( idx ) );
343  continue;
344  }
345  if (y >= hrmh_ ) {
346  if (cykdata_.nt_exceptions_maxspan.find (ssgd.getLHS ( idx ) ) ==
347  cykdata_.nt_exceptions_maxspan.end() ) {
348  LDEBUG ( vcat[cc] << "," << x << "," << y << ":" <<
349  "Rejected (Default Maximum span reached hrmaxheight): " << ssgd.getRule (
350  idx ) );
351  LDEBUG ( "Note: hrmaxheight=" << hrmh_ <<
352  " has been reached, only nonterminals listed as exceptions (--cykparser.ntexceptionsmaxspan are allowed!" );
353  continue;
354  }
355  }
356  if ( hmax_.find ( ssgd.getLHS ( idx ) ) != hmax_.end() )
357  if ( y > hmax_[ssgd.getLHS ( idx )] ) {
358  LDEBUG ( vcat[cc] << "," << x << "," << y << ":" << "Rejected (hmax): " <<
359  ssgd.getRule ( idx ) );
360  continue;
361  }
362  if ( hmin_.find ( ssgd.getLHS ( idx ) ) != hmin_.end() )
363  if ( y < hmin_[ssgd.getLHS ( idx )] ) {
364  LDEBUG ( vcat[cc] << "," << x << "," << y << ":" << "Rejected (hmin): " <<
365  ssgd.getRule ( idx ) );
366  continue;
367  }
368  std::string rhs = ssgd.getRHSSource ( idx );
369  rd.clear();
370  unordered_map<std::string, cykparser_ruledependencies_t > ::iterator itx =
371  minicache.find ( rhs );
372  if ( itx != minicache.end() ) {
373  rd = itx->second;
374  } else {
375  coord.clear();
376  coord.push_back ( cc );
377  coord.push_back ( x1 );
378  coord.push_back ( y1 );
379  examineRule ( x2 - 0, y - ( x2 - x1 - 1 ), idx, 1, coord );
380  minicache[rhs] = rd;
381  }
382  if ( rd.size() == 0 ) {
383  LDEBUG ( vcat[cc] << "," << x << "," << y << "(/" << n << "):" << "Rejected " <<
384  ssgd.getRule ( idx ) << ", for: " << ssgd.getLHS ( idx ) << "," << x << "," <<
385  y );
386  continue;
387  }
388  LDEBUG ( vcat[cc] << "," << x << "," << y << "(/" << n << "):" << "Accepted: "
389  << ssgd.getRule ( idx ) << ", for: " << ssgd.getLHS ( idx ) << "," << x << ","
390  << y << ", original rule_id=" << ssgd.getIdx ( idx ) << ",vs" << idx );
391  for ( unsigned kpt = 0; kpt < rd.size(); kpt++ ) {
392  cykdata_.cykgrid.Add ( categories[ssgd.getLHS ( idx )], x, y , idx );
393  cykdata_.bp.Add ( categories[ssgd.getLHS ( idx )], x, y, rd[kpt] );
394  }
395  }
396  }
397  }
398  insertMonoRHSRules ( x, y );
399  }
400  }
401  };
402 
403  ZDISALLOW_COPY_AND_ASSIGN ( CYKParserTask );
404 
405 };
406 
407 }
408 } // end namespaces
409 
410 #endif
CYKbackpointers bp
bool run(Data &d)
Runs the parsing algorithm.
Data structure containing all cyk-related information.
const std::string kCykparserHrmaxheight
std::string toString(const T &x, uint pr=2)
Converts an arbitrary type to string Converts to string integers, floats, doubles Quits execution if ...
Implements cyk+ parser.
grammar_categories_t categories
Ordered list of non-terminals (listed in hierarchical order according to identity rules) ...
#define LINFO(msg)
grammar_inversecategories_t vcat
Inverse map (1=S,2=X,...)
const std::string kCykparserNtexceptionsmaxspan
unordered_map< uint, std::string > grammar_inversecategories_t
void Add(const uint cc, const uint x, const uint y, const uint ruleidx)
Add a rule to the cyk grid at (cc,x,y)
uint nnt
number of non-terminals
~CYKParserTask(void)
Destructor.
#define LDEBUG(msg)
const std::string getRule(std::size_t idx)
Returns rule corresponding to index idx.
cykparser_ruledependencies_t rd
coordinate dependencies for each candidate.
const CYKdata * getcykdata()
returns cykdata structure
grammar_categories_t categories
Map between categories (S=1,X=2,...)
Struct containing grammar rules.
CYKgrid cykgrid
Cyk grid. Each cell of the grid is uniquely defined by three dimensions: [category,x,y].
Templated (hybrid) Interface for Task classes.
CYKParserTask(const ucam::util::RegistryPO &rg)
Constructor.
const std::string getRHSSource(std::size_t idx)
Returns Right-hand-side (source) of the rule with index=idx.
int getFinalResult()
Returns success (number of nodes in topmost cell) or failure (CYK_RETURNS_FAILURE=0) ...
const std::string getLHS(std::size_t idx)
Returns Left-hand-side of a rule corresponding to index idx.
unordered_set< std::string > nt_exceptions_maxspan
const std::string kCykparserHmin
std::vector< cykparser_rulebpcoordinates_t > cykparser_ruledependencies_t
#define LWARN(msg)
#define CYK_RETURN_FAILURE
std::basic_string< uint > cykparser_rulebpcoordinates_t
ssgrammar_rulesmap_t rulesWithRhsSpan2OrMore
cells containing potentially applicable rules with two or more elements
std::size_t size()
Return actual size of the cyk grid.
const uint getRHSSourceSize(std::size_t idx)
Returns size of RHS source of a rule.
std::basic_string< uint > cykparser_sentence_t
const std::size_t getIdx(std::size_t idx)
Returns the true idx of a rule (i.e. line in the grammar file). If it is sentence specific...
#define USER_CHECK(exp, comment)
Tests whether exp is true. If not, comment is printed and program ends.
cykparser_sentence_t sentence
The sentence we want to parse.
uint success
Success and how many parse S nodes have been found in the topmost cell. If 0, cyk parser has failed...
Structure for sentence-specific grammar Rules will be queried by cyk per position and number of eleme...
grammar_inversecategories_t vcat
void Add(const unsigned cc, const unsigned x, const unsigned y, const cykparser_ruledependencies_t &coords)
Add all the set of backpointers to the grid.
std::basic_string< uint > ssgrammar_listofrules_t
const std::string kCykparserHmax
Definition: bleu.hpp:14
unordered_map< std::string, uint > grammar_categories_t
unordered_set< std::string > getSetString(const std::string &key) const
Convenience method that returns a set of strings taking "," as the separator character.
Definition: registrypo.hpp:279
void getFilteredNonTerminal(std::string &word)
Return the filtered non-terminal name. For example, for the rule Z 3_XT2_5 XT2, getFilteredNonTermina...