Cambridge SMT System
task.ssgrammar.hpp
Go to the documentation of this file.
1 // Licensed under the Apache License, Version 2.0 (the "License");
2 // you may not use these files except in compliance with the License.
3 // You may obtain a copy of the License at
4 //
5 // http://www.apache.org/licenses/LICENSE-2.0
6 //
7 // Unless required by applicable law or agreed to in writing, software
8 // distributed under the License is distributed on an "AS IS" BASIS,
9 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 // See the License for the specific language governing permissions and
11 // limitations under the License.
12 
13 // Copyright 2012 - Gonzalo Iglesias, Adrià de Gispert, William Byrne
14 
15 #ifndef SENTENCESPECIFICGRAMMARTASK_HPP
16 #define SENTENCESPECIFICGRAMMARTASK_HPP
17 
26 namespace ucam {
27 namespace hifst {
28 
37 template <class Data>
39 
40  //Private variables are shown here. Private methods go after public methods
41  private:
42 
44  ucam::util::IntegerPatternAddress ssgrammarfile_;
45 
47  bool addoovs_;
48  bool deleteoovs_;
49  std::size_t oovindex_;
50 
52  unsigned rule_id_offset_;
53 
56 
57  public:
60  rule_id_offset_ ( 0 ),
61  ssgrammarfile_ ( rg.get<std::string> ( HifstConstants::kSsgrammarStore ) ),
62  addoovs_ ( rg.getBool ( HifstConstants::kSsgrammarAddoovsEnable ) ) ,
63  oovindex_ ( 0 ),
64  deleteoovs_ ( rg.getBool (
66  LDEBUG ( "Constructor done!" );
67  };
68 
73  bool run ( Data& d ) {
74  LDEBUG ( "HP size=" << d.hpinstances.size() );
75  USER_CHECK ( d.grammar, "Big grammar data not available!" );
76  oovindex_ = 1;
77  d.ssgd = &ssgd_;
78  LDEBUG ( "HP size=" << d.hpinstances.size() );
79  d.stats->setTimeStart ("ssgrammar-extract");
80  get ( d );
81  d.stats->setTimeEnd ("ssgrammar-extract");
82  LDEBUG ( "HP size=" << d.hpinstances.size() );
84  // addFeedbackRules
85  if ( ssgrammarfile_ ( d.sidx ) != "" )
86  writessgrammar ( ssgrammarfile_ ( d.sidx ) );
87  LDEBUG ( "Finished run method" );
88  return false;
89  };
90 
91  private:
92 
94  std::size_t createOOVRule ( const std::string& oov ) {
95  std::size_t newindex = ssgd_.grammar->sizeofvpos + ( rule_id_offset_++ );
96  unsigned n = ucam::util::toNumber<unsigned> ( oov );
97  if ( n >= OOVID && !deleteoovs_ )
98  ssgd_.extrarules[ newindex ] = "X " + oov + " " + oov + " 0";
99  else
100  ssgd_.extrarules[ newindex ] = "X " + oov + " <oov> 0";
101  LINFO ( "New oov rule id=" << newindex << ",rule=" <<
102  ssgd_.extrarules[ newindex ] );
103  return newindex;
104  }
105 
108  void writessgrammar ( const std::string& filename ) {
109  FORCELINFO ( "Saving ssgrammar to " << filename );
110  unordered_set<unsigned> seenrules;
111  ucam::util::oszfstream o ( filename );
112  for ( ssgrammar_rulesmap_t::iterator itx = ssgd_.rulesWithRhsSpan1.begin();
113  itx != ssgd_.rulesWithRhsSpan1.end();
114  ++itx ) {
115  for ( ssgrammar_firstelementmap_t::iterator itx2 = itx->second.begin();
116  itx2 != itx->second.end();
117  ++itx2 ) {
118  for ( unsigned k = 0; k < itx2->second.size(); ++k ) {
119  if ( seenrules.find ( itx2->second[k] ) != seenrules.end() ) continue;
120  seenrules.insert ( itx2->second[k] );
121  o << ssgd_.grammar->getRule ( itx2->second[k] ) << endl;
122  }
123  }
124  }
125  for ( ssgrammar_rulesmap_t::iterator itx =
126  ssgd_.rulesWithRhsSpan2OrMore.begin();
127  itx != ssgd_.rulesWithRhsSpan2OrMore.end();
128  ++itx ) {
129  for ( ssgrammar_firstelementmap_t::iterator itx2 = itx->second.begin();
130  itx2 != itx->second.end();
131  ++itx2 ) {
132  for ( unsigned k = 0; k < itx2->second.size(); ++k ) {
133  if ( seenrules.find ( itx2->second[k] ) != seenrules.end() ) continue;
134  seenrules.insert ( itx2->second[k] );
135  o << ssgd_.grammar->getRule ( itx2->second[k] ) << endl;
136  }
137  }
138  }
139  o.close();
140  };
141 
151  void get ( Data& d ) {
152  ssgrammar_instancemap_t& hpinstances = d.hpinstances;
153  ssgd_.reset();
154  ssgd_.grammar = d.grammar;
155  for ( ssgrammar_instancemap_t::iterator itx = hpinstances.begin();
156  itx != hpinstances.end(); ++itx ) {
157  LDEBUG ( "Search for [" << itx->first << "]" );
158  std::string needle = itx->first + " ";
159  int pos = exists ( needle );
160  if ( -1 == pos ) {
161  if ( addoovs_ )
162  if ( phraseIsTerminalWord ( itx->first ) ) {
163  std::size_t ruleid = createOOVRule ( itx->first );
164  for ( unsigned k = 0; k < itx->second.size(); ++k ) {
165  unsigned& x = itx->second[k].first;
166  ssgd_.rulesWithRhsSpan1[x][itx->first].push_back ( ruleid );
167  LDEBUG ( "***Adding (OOV) rule index " << ruleid << ":" << ssgd_.getRule (
168  ruleid ) );
169  }
170  }
171  LDEBUG ( "Pattern not found!" );
172  continue;
173  }
174  USER_CHECK ( pos >= 0, "positive value required!" );
175  LDEBUG ( "Extracting indices for =>" << itx->first << ",size of pattern=" <<
176  getSize ( itx->first ) <<
177  ", number of instances at which this was found: (x,span): " <<
178  itx->second.size() );
180  unordered_set<unsigned> seenx;
181  if ( getSize ( itx->first ) == 1 ) {
182  for ( unsigned k = 0; k < itx->second.size(); ++k ) {
183  unsigned& x = itx->second[k].first;
184  if ( seenx.find ( x ) != seenx.end() ) {
185  LDEBUG ( "Repeated:" << itx->first << " at x=" << x );
186  continue;
187  }
188  seenx.insert ( x );
189  LDEBUG ( "*calling addRuleIndicesRHS (1) at x=" << x );
190  addRuleIndicesRHS ( needle, pos, ssgd_.rulesWithRhsSpan1[x] , d.tvcb );
191  LDEBUG ( "*Done!" );
192  }
193  } else {
194  for ( unsigned k = 0; k < itx->second.size(); ++k ) {
195  unsigned& x = itx->second[k].first;
196  if ( seenx.find ( x ) != seenx.end() ) {
197  LDEBUG ( "Repeated:" << itx->first << " at x=" << x );
198  continue;
199  }
200  seenx.insert ( x );
201  LDEBUG ( "*calling addRuleIndicesRHS (2) at x=" << x );
202  addRuleIndicesRHS ( needle, pos, ssgd_.rulesWithRhsSpan2OrMore[x] , d.tvcb );
203  LDEBUG ( "*Done" );
204  }
205  }
206  LDEBUG ( "Finished extracting indices for " << itx->first );
207  }
208  LDEBUG ( "Finished get method" );
209  };
210 
220  void addRuleIndicesRHS ( const std::string& needle
221  , const int pos, ssgrammar_firstelementmap_t& rules
222  , const unordered_set<std::string>& vcb ) {
223  USER_CHECK ( pos >= 0, "pos needs to be positive" );
224  LDEBUG ( "**Adding indices for rules" );
225  const GrammarData& g = *ssgd_.grammar;
226  for ( unsigned j = pos; j < g.sizeofvpos ; ++j ) {
227  if ( g.ct->ncompare ( needle.c_str(), g.filecontents.c_str() + g.vpos[j].p,
228  needle.size() ) ) break;
229  if ( !g.isAcceptedByVocabulary ( j, vcb ) ) {
230  LDEBUG ( "skipping rule (rejected by vcb):" << g.getRule ( j ) );
231  continue;
232  }
233  std::string firstelement = g.getRHSSource ( j , 0 );
234  getFilteredNonTerminal ( firstelement );
235  LDEBUG ( "***Adding rule #" << j << ":" << g.getRule ( j ) );
236  rules[firstelement].push_back ( j );
237  }
238  if ( pos == 0 ) return;
239  for ( int j = pos - 1; j >= 0 ; --j ) {
240  if ( g.ct->ncompare ( needle.c_str(), g.filecontents.c_str() + g.vpos[j].p,
241  needle.size() ) ) break;
242  if ( !g.isAcceptedByVocabulary ( j, vcb ) ) continue;
243  std::string firstelement = g.getRHSSource ( j , 0 );
244  getFilteredNonTerminal ( firstelement );
245  LDEBUG ( "***Adding rules # " << j << ":" << g.getRule ( j ) );
246  rules[firstelement].push_back ( j );
247  }
248  };
249 
255  inline int exists ( const std::string& needle ) {
256  USER_CHECK ( needle.at ( needle.size() - 1 ) == ' ',
257  "This method requires a space appended to que queried string" );
258  const GrammarData& g = *ssgd_.grammar;
259  int oldmid = -1, mid = 0;
260  int first = 0;
261  int last = g.sizeofvpos - 1;
262  //good old fashioned bs over the raw sequence of chars.
263  while ( first <= last ) {
264  mid = ( first + last ) / 2;
265  if ( mid == oldmid ) break;
266  int res = g.ct->ncompare ( needle.c_str(),
267  g.filecontents.c_str() + g.vpos[mid].p, needle.size() );
268  if ( res < 0 ) first = mid + 1;
269  else if ( res > 0 ) last = mid - 1;
270  else first = last + 1;
271  oldmid = mid;
272  }
273  if ( !g.ct->ncompare ( needle.c_str(), g.filecontents.c_str() + g.vpos[mid].p,
274  needle.size() ) ) {
275  return mid;
276  }
277  LDEBUG ( "Could not find: [" << needle << "]" );
278  return -1;
279  };
280 
281  ZDISALLOW_COPY_AND_ASSIGN ( SentenceSpecificGrammarTask );
282 
283 };
284 
285 }
286 } // end namespaces
287 
288 #endif
Wrapper stream class that writes to pipes, text files or gzipped files.
Definition: szfstream.hpp:200
class that expands a wildcard into its actual value. This is useful e.g. for filenames ranging severa...
#define LINFO(msg)
#define FORCELINFO(msg)
bool run(Data &d)
run method, given a grammar and instantiated patterns, creates and returns the hashes ...
#define LDEBUG(msg)
const std::string getRule(std::size_t idx)
Returns rule corresponding to index idx.
CompareTool * ct
Pointer to a Comparison object, assumed no ownership.
Struct containing grammar rules.
posindex * vpos
Sorted Indices.
Templated (hybrid) Interface for Task classes.
const std::string getRHSSource(std::size_t idx) const
Gets right-hand-side source for a rule using rule index idx.
SentenceSpecificGrammarTask(const ucam::util::RegistryPO &rg)
Constructor.
virtual __always_inline int ncompare(const char *s1, const char *s2, uint n)
const std::string kSsgrammarAddoovsSourcedeletions
const std::string kSsgrammarAddoovsEnable
const bool isAcceptedByVocabulary(const std::size_t idx, const unordered_set< std::string > &vcb) const
Determines whether a particular rule is allowed within a vocabulary, i.e. all target words of the rul...
#define OOVID
unordered_map< std::string, std::vector< pair< uint, uint > > > ssgrammar_instancemap_t
std::string filecontents
The whole grammar.
const std::string getRule(std::size_t idx) const
Gets a rule indexed by idx. Rule format: LHS RHSSource RHSTarget weight.
bool phraseIsTerminalWord(const std::string &phrase)
This class uses instantiated patterns to analyze the grammar and deliver two hashes providing candida...
ssgrammar_rulesmap_t rulesWithRhsSpan2OrMore
cells containing potentially applicable rules with two or more elements
unordered_map< std::string, ssgrammar_listofrules_t > ssgrammar_firstelementmap_t
#define USER_CHECK(exp, comment)
Tests whether exp is true. If not, comment is printed and program ends.
unordered_map< std::size_t, std::string > extrarules
const GrammarData * grammar
Pointer to the original grammar data (no ownership)
const std::string kSsgrammarStore
const uint getSize(const std::string &rhs)
A generic element counter that can be applied to any string. It is intended for use with either source or...
Structure for sentence-specific grammar Rules will be queried by cyk per position and number of eleme...
std::size_t sizeofvpos
Number of rules.
Definition: bleu.hpp:14
void getFilteredNonTerminal(std::string &word)
Return the filtered non-terminal name. For example, for the rule Z 3_XT2_5 XT2, getFilteredNonTermina...
void close()
Closes the file.
Definition: szfstream.hpp:323