Cambridge SMT System
data.ssgrammar.hpp
Go to the documentation of this file.
1 // Licensed under the Apache License, Version 2.0 (the "License");
2 // you may not use these files except in compliance with the License.
3 // You may obtain a copy of the License at
4 //
5 // http://www.apache.org/licenses/LICENSE-2.0
6 //
7 // Unless required by applicable law or agreed to in writing, software
8 // distributed under the License is distributed on an "AS IS" BASIS,
9 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 // See the License for the specific language governing permissions and
11 // limitations under the License.
12 
13 // Copyright 2012 - Gonzalo Iglesias, Adrià de Gispert, William Byrne
14 
15 #ifndef SENTENCESPECIFICGRAMMARDATA_HPP
16 #define SENTENCESPECIFICGRAMMARDATA_HPP
17 
26 namespace ucam {
27 namespace hifst {
28 
39 
41  : grammar ( NULL )
42  {}
43 
46 
52 
55  unordered_map<std::size_t, std::string> extrarules;
57 
58  inline void reset() {
59  rulesWithRhsSpan1.clear();
60  rulesWithRhsSpan2OrMore.clear();
61  extrarules.clear();
62  grammar = NULL;
63  }
64 
66  inline const std::string getRule ( std::size_t idx ) {
67  if ( extrarules.find ( idx ) == extrarules.end() )
68  return grammar->getRule ( idx );
69  LINFO ( "ssgrammar idx=" << idx );
70  return extrarules[idx];
71  };
72 
74  inline const std::string getLHS ( std::size_t idx ) {
75  if ( extrarules.find ( idx ) == extrarules.end() )
76  return grammar->getLHS ( idx );
77  std::size_t pos = extrarules[idx].find_first_of ( " " );
78  return extrarules[idx].substr ( 0, pos );
79  };
80 
82  inline const std::string getRHSSource ( std::size_t idx ) {
83  if ( extrarules.find ( idx ) == extrarules.end() )
84  return grammar->getRHSSource ( idx );
85  std::size_t pos = extrarules[idx].find_first_of ( " " ) + 1;
86  std::size_t pos2 = extrarules[idx].find_first_of ( " ", pos );
87  return extrarules[idx].substr ( pos, pos2 - pos );
88  };
89 
91  inline const std::string getRHSSource ( std::size_t idx, uint rulepos ) {
92  if ( extrarules.find ( idx ) == extrarules.end() )
93  return grammar->getRHSSource ( idx, rulepos );
94  std::size_t pos = extrarules[idx].find_first_of ( " " );
95  std::size_t j = pos , jold;
96  for ( uint k = 0; k <= rulepos; ++k ) {
97  jold = j;
98  j = extrarules[idx].find_first_of ( "_ ", jold + 1 );
99  if ( j == std::string::npos )
100  if ( rulepos ) return "";
101  }
102  return extrarules[idx].substr ( jold + 1, j - jold - 1 );
103  };
104 
106  inline const std::vector<std::string> getRHSSplitSource ( std::size_t idx ) {
107  if ( extrarules.find ( idx ) == extrarules.end() )
108  return grammar->getRHSSplitSource ( idx );
109  std::vector<std::string> splitsource;
110  boost::algorithm::split ( splitsource, getRHSSource ( idx ),
111  boost::algorithm::is_any_of ( "_" ) );
112  return splitsource;
113  };
114 
116  inline const uint getRHSSourceSize ( std::size_t idx ) {
117  if ( extrarules.find ( idx ) == extrarules.end() )
118  return grammar->getRHSSourceSize ( idx );
119  std::size_t pos = extrarules[idx].find_first_of ( " " ) + 1;
120  std::size_t pos1 = extrarules[idx].find_first_of ( " " , pos ) + 1;
121  return ucam::util::count_needles ( extrarules[idx], '_', pos, pos1 ) + 1 ;
122  };
123 
125  inline const std::string getRHSTranslation ( std::size_t idx ) {
126  if ( extrarules.find ( idx ) == extrarules.end() )
127  return grammar->getRHSTranslation ( idx );
128  std::size_t pos = extrarules[idx].find_first_of ( " " ) + 1;
129  std::size_t pos1 = extrarules[idx].find_first_of ( " ", pos ) + 1;
130  std::size_t pos2 = extrarules[idx].find_first_of ( " ", pos1 );
131  return extrarules[idx].substr ( pos1, pos2 - pos1 );
132  };
133 
135  inline const std::vector<std::string> getRHSSplitTranslation (
136  std::size_t idx ) {
137  if ( extrarules.find ( idx ) == extrarules.end() )
138  return grammar->getRHSSplitTranslation ( idx );
139  std::vector<std::string> splittranslation;
140  boost::algorithm::split ( splittranslation, getRHSTranslation ( idx ),
141  boost::algorithm::is_any_of ( "_" ) );
142  return splittranslation;
143  };
144 
146  inline const uint getRHSTranslationSize ( std::size_t idx ) {
147  if ( extrarules.find ( idx ) == extrarules.end() )
148  return grammar->getRHSTranslationSize ( idx );
149  std::size_t pos = extrarules[idx].find_first_of ( " " ) + 1;
150  std::size_t pos1 = extrarules[idx].find_first_of ( " ", pos ) + 1;
151  std::size_t pos2 = extrarules[idx].find_first_of ( " ", pos1 );
152  return ucam::util::count_needles ( extrarules[idx], '_', pos1, pos2 ) + 1;
153  };
154 
156  inline const float getWeight ( std::size_t idx ) {
157  if ( extrarules.find ( idx ) == extrarules.end() )
158  return grammar->getWeight ( idx );
159  std::size_t pos = extrarules[idx].find_first_of ( " " ) + 1;
160  std::size_t pos1 = extrarules[idx].find_first_of ( " ", pos );
161  std::size_t pos2 = extrarules[idx].find_first_of ( " ", pos1 + 1 );
162  std::size_t pos3 = extrarules[idx].find_first_of ( " \n\0", pos2 + 1 );
163  return ucam::util::toNumber<float> ( extrarules[idx].substr ( pos2,
164  pos3 - pos2 ) );
165  };
166 
167  void getLinks(std::size_t idx
168  , std::vector<unsigned> &links) const {
169  if ( extrarules.find ( idx ) == extrarules.end() ) {
170  grammar->getLinks ( idx , links);
171  return;
172  }
173  LERROR("Untested code");
174  exit(EXIT_FAILURE);
175  // std::size_t pos = extrarules[idx].find_first_of ( " " ) + 1;
176  // std::size_t pos1 = extrarules[idx].find_first_of ( " " , pos) + 1;
177  // std::size_t pos2 = extrarules[idx].find_first_of ( " ", pos1) + 1;
178  // std::size_t pos3 = extrarules[idx].find_first_of ( " \n\0", pos2);
179  // if (extrarules[idx][pos3] == ' ') {
180  // std::size_t pos4 = extrarules[idx].find_first_of ( " \n\0", pos3 + 1 );
181  // return extrarules[idx].substr ( pos3, pos4 - pos3 );
182  // }
183  // return ""; // no affiliation or links
184  }
185 
186 
187 
188 
189  inline const bool isPhrase ( std::size_t idx ) {
190  if ( extrarules.find ( idx ) == extrarules.end() )
191  return grammar->isPhrase ( idx );
192  std::size_t pos0 = extrarules[idx].find_first_of ( " " ) + 1;
193  std::size_t pos = extrarules[idx].find_first_of ( " ", pos0 );
194  for ( const char *c = extrarules[idx].c_str() + pos0;
195  c <= extrarules[idx].c_str() + pos; ++c )
196  if ( *c >= 'A' && *c <= 'Z' ) return false; //has non-terminals.
197  return true; //pure phrase.
198  };
200  inline const std::size_t getIdx ( std::size_t idx ) {
201  if ( extrarules.find ( idx ) == extrarules.end() )
202  return grammar->getIdx ( idx );
203  return idx;
204  };
205 
208  inline const bool isAcceptedByVocabulary ( const std::size_t idx,
209  const unordered_set<std::string>& vcb ) {
210  if ( extrarules.find ( idx ) == extrarules.end() )
211  return grammar->isAcceptedByVocabulary ( idx, vcb );
212  return true;
213  };
214 
220  inline void getMappings ( std::size_t idx,
221  unordered_map<uint, uint> *mappings ) {
222  if ( extrarules.find ( idx ) == extrarules.end() )
223  grammar->getMappings ( idx, mappings );
224  if ( isPhrase ( idx ) ) return;
225  std::vector<std::string> source = getRHSSplitSource ( idx );
226  std::vector<std::string> translation = getRHSSplitTranslation ( idx );
227  getRuleMappings ( source, translation, mappings );
228  };
229 };
230 
/// Tells whether phrase is a single terminal word: it must contain neither
/// an upper-case letter 'A'..'Z' nor the element separator '_'.
/// An empty phrase is reported as a terminal word.
inline bool phraseIsTerminalWord ( const std::string& phrase ) {
  for ( std::string::const_iterator c = phrase.begin();
        c != phrase.end(); ++c ) {
    const bool isUpperCase = ( *c >= 'A' && *c <= 'Z' );
    if ( isUpperCase || *c == '_' ) {
      return false;
    }
  }
  return true;
}
241 
242 }
243 } // end namespaces
244 
245 #endif
const std::string getRHSTranslation(std::size_t idx)
Returns RHS translation of a rule with index idx.
const float getWeight(std::size_t idx)
Returns the weight of a rule. This weight is the dot product of all the features with its scales...
#define LINFO(msg)
const std::string getLHS(std::size_t idx) const
Gets left-hand-side of the rule indexed by idx.
void getMappings(std::size_t idx, unordered_map< uint, uint > *mappings)
Returns the non-terminal mappings for a rule. For more details see getRuleMappings function...
void getLinks(std::size_t idx, std::vector< unsigned > &links) const
const uint getRHSTranslationSize(std::size_t idx) const
Returns the number of elements in translation for a given rule.
const std::string getRule(std::size_t idx)
Returns rule corresponding to index idx.
const bool isPhrase(std::size_t idx) const
Checks whether the rule is a phrase or not (i.e. is hierarchical)
const uint getRHSSourceSize(std::size_t idx) const
Gets number of elements in the RHS source.
Struct containing grammar rules.
const std::vector< std::string > getRHSSplitSource(std::size_t idx) const
Gets a split version of the RHS (source)
const bool isPhrase(std::size_t idx)
const std::size_t getIdx(std::size_t idx) const
Gets the real position (line) in the (potentially unsorted) file.
const std::string getRHSSource(std::size_t idx) const
Gets right-hand-side source for a rule using rule index idx.
const std::string getRHSSource(std::size_t idx)
Returns Right-hand-side (source) of the rule with index=idx.
const std::string getLHS(std::size_t idx)
Returns Left-hand-side of a rule corresponding to index idx.
const float getWeight(std::size_t idx) const
Returns weight of a rule accessed by index idx.
const bool isAcceptedByVocabulary(const std::size_t idx, const unordered_set< std::string > &vcb) const
Determines whether a particular rule is allowed within a vocabulary, i.e. all target words of the rul...
const std::string getRule(std::size_t idx) const
Gets a rule indexed by idx. Rule format: LHS RHSSource RHSTarget weight.
const std::vector< std::string > getRHSSplitSource(std::size_t idx)
Returns vector of elements of the RHS source.
bool phraseIsTerminalWord(const std::string &phrase)
const std::string getRHSSource(std::size_t idx, uint rulepos)
Returns element at position rulepos of right-hand-side (source)
ssgrammar_rulesmap_t rulesWithRhsSpan2OrMore
cells containing potentially applicable rules with two or more elements
uint count_needles(const std::string &haystack, const char needle, std::size_t start, std::size_t end)
Convenience function that counts the number of times a needle appears.
const std::vector< std::string > getRHSSplitTranslation(std::size_t idx) const
Returns the translation as a vector of elements.
const uint getRHSSourceSize(std::size_t idx)
Returns size of RHS source of a rule.
const std::size_t getIdx(std::size_t idx)
Returns the true idx of a rule (i.e. line in the grammar file). If it is sentence specific...
unordered_map< uint, ssgrammar_firstelementmap_t > ssgrammar_rulesmap_t
unordered_map< std::size_t, std::string > extrarules
const GrammarData * grammar
Pointer to the original grammar data (no ownership)
void getLinks(std::size_t idx, std::vector< unsigned > &links) const
#define LERROR(msg)
const std::string getRHSTranslation(std::size_t idx) const
Returns RHS translation part of a rule accessed by index idx.
void getMappings(std::size_t idx, unordered_map< uint, uint > *mappings) const
Returns the non-terminal mappings. For more details see getRuleMappings function. ...
const bool isAcceptedByVocabulary(const std::size_t idx, const unordered_set< std::string > &vcb)
Structure for sentence-specific grammar. Rules will be queried by cyk per position and number of eleme...
const uint getRHSTranslationSize(std::size_t idx)
Returns size of RHS (translation) of a rule.
Definition: bleu.hpp:14
void getRuleMappings(const std::vector< std::string > &source, const std::vector< std::string > &translation, unordered_map< uint, uint > *mappings)
Given a source and translation of the same rule, sharing the same non-terminals in RHS...
const std::vector< std::string > getRHSSplitTranslation(std::size_t idx)
Returns translation as a vector of elements.