Cambridge SMT System
data.grammar.hpp
Go to the documentation of this file.
1 // Licensed under the Apache License, Version 2.0 (the "License");
2 // you may not use these files except in compliance with the License.
3 // You may obtain a copy of the License at
4 //
5 // http://www.apache.org/licenses/LICENSE-2.0
6 //
7 // Unless required by applicable law or agreed to in writing, software
8 // distributed under the License is distributed on an "AS IS" BASIS,
9 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 // See the License for the specific language governing permissions and
11 // limitations under the License.
12 
13 // Copyright 2012 - Gonzalo Iglesias, Adrià de Gispert, William Byrne
14 
15 #ifndef DATA_GRAMMAR_HPP
16 #define DATA_GRAMMAR_HPP
17 
27 
28 namespace ucam {
29 namespace hifst {
42 struct GrammarData {
43 
46  vpos ( NULL ),
47  sizeofvpos ( 0 ),
48  ct ( NULL ) {
49  };
50 
53  if ( vpos != NULL ) delete [] vpos;
54  }
55 
57  std::string filecontents;
61  std::size_t sizeofvpos;
63  unordered_set<std::string> patterns;
66 
70 
72  inline void reset() {
73  filecontents = "";
74  if ( vpos != NULL ) delete [] vpos;
75  patterns.clear();
76  categories.clear();
77  vcat.clear();
78  sizeofvpos = 0;
79  ct = NULL;
80  }
81 
83  inline const std::string getRule ( std::size_t idx ) const {
84  std::size_t rpos = vpos[idx].p - vpos[idx].o;
85  std::size_t pos = filecontents.find_first_of ( "\n", rpos );
86  return filecontents.substr ( rpos, pos - rpos );
87  }
88 
90  inline const std::string getLHS ( std::size_t idx ) const {
91  std::size_t rpos = vpos[idx].p - vpos[idx].o;
92  return filecontents.substr ( rpos, vpos[idx].p - rpos - 1 );
93  }
94 
96  inline const std::string getRHSSource ( std::size_t idx ) const {
97  std::size_t pos = filecontents.find_first_of ( " ", vpos[idx].p );
98  return filecontents.substr ( vpos[idx].p, pos - vpos[idx].p );
99  }
100 
102  inline const std::string getRHSSource ( std::size_t idx, uint rulepos ) const {
103  std::size_t pos = filecontents.find_first_of ( " ", vpos[idx].p );
104  std::size_t j = vpos[idx].p - 1, jold;
105  for ( uint k = 0; k <= rulepos; ++k ) {
106  jold = j;
107  j = filecontents.find_first_of ( "_ ", jold + 1 );
108  if ( j == std::string::npos )
109  if ( rulepos ) return "";
110  }
111  return filecontents.substr ( jold + 1, j - jold - 1 );
112  }
113 
115  inline const std::vector<std::string> getRHSSplitSource (std::size_t idx ) const {
116  std::vector<std::string> splitsource;
117  boost::algorithm::split ( splitsource, getRHSSource ( idx )
118  , boost::algorithm::is_any_of ( "_" ) );
119  return splitsource;
120  }
121 
123  inline const uint getRHSSourceSize ( std::size_t idx ) const {
124  std::size_t pos = filecontents.find_first_of ( " ", vpos[idx].p );
125  return ucam::util::count_needles ( filecontents, '_', vpos[idx].p, pos ) + 1;
126  }
127 
129  inline const std::string getRHSTranslation ( std::size_t idx ) const {
130  std::size_t pos = filecontents.find_first_of ( " ", vpos[idx].p ) + 1;
131  std::size_t pos2 = filecontents.find_first_of ( " ", pos );
132  return filecontents.substr ( pos, pos2 - pos );
133  }
134 
136  inline const std::vector<std::string> getRHSSplitTranslation (
137  std::size_t idx ) const {
138  std::vector<std::string> splittranslation;
139  boost::algorithm::split ( splittranslation, getRHSTranslation ( idx ),
140  boost::algorithm::is_any_of ( "_" ) );
141  return splittranslation;
142  }
143 
145  inline const uint getRHSTranslationSize ( std::size_t idx ) const {
146  std::size_t pos = filecontents.find_first_of ( " ", vpos[idx].p ) + 1;
147  std::size_t pos2 = filecontents.find_first_of ( " ", pos );
148  return ucam::util::count_needles ( filecontents, '_', pos, pos2 ) + 1;
149  }
150 
152  inline const float getWeight ( std::size_t idx ) const {
153  std::size_t pos1 = filecontents.find_first_of ( " ", vpos[idx].p );
154  std::size_t pos2 = filecontents.find_first_of ( " ", pos1 + 1 );
155  std::size_t pos3 = filecontents.find_first_of ( " \t\n\0", pos2 + 1 );
156  return ucam::util::toNumber<float> ( filecontents.substr ( pos2,
157  pos3 - pos2 ) );
158  }
159 
160  // Affiliation or alignments go physically after the weight, so that
161  // it is an optional field.
162  void getLinks(std::size_t idx
163  , std::vector<unsigned> &links ) const {
164  using namespace std;
165  using namespace boost::algorithm;
166  size_t pos1 = filecontents.find_first_of ( " ", vpos[idx].p );
167  size_t pos2 = filecontents.find_first_of ( " ", pos1 + 1 );
168  size_t pos3 = filecontents.find_first_of ( "\t\n\0", pos2 + 1 );
169  if (filecontents[pos3] == '\t') {
170  size_t pos4 = filecontents.find_first_of ( " \t\n\0", pos3 + 1 );
171  string y = filecontents.substr ( pos3 + 1, pos4 - pos3 - 1);
172  LDEBUG("Links=[" << y << "]");
173  vector<string> x;
174  split(x, y, is_any_of("_"));
175  if (links.size() != x.size()) {
176  LERROR("Houston! " << idx << "=>" << y << ",x.size=" << x.size() << ",links.size=" << links.size() );
177  exit(EXIT_FAILURE);
178  }
179  for (unsigned k = 0; k < x.size(); ++k) {
180  LDEBUG("x at " << k << "=" << x[k] << ";");
181  ucam::util::toNumber<unsigned>("0");
182  ucam::util::toNumber<unsigned>("1");
183  links[k] = ucam::util::toNumber<unsigned>(x[k]);
184  }
185  }
186  }
187 
189  inline const bool isPhrase ( std::size_t idx ) const {
190  std::size_t pos = filecontents.find_first_of ( " ", vpos[idx].p );
191  for ( const char *c = filecontents.c_str() + vpos[idx].p;
192  c <= filecontents.c_str() + pos; ++c )
193  if ( *c >= 'A' && *c <= 'Z' ) return false; //has non-terminals.
194  return true; //pure phrase.
195  }
196 
198  inline const std::size_t getIdx ( std::size_t idx ) const {
199  return vpos[idx].order;
200  }
201 
207  void getMappings ( std::size_t idx,
208  unordered_map<uint, uint> *mappings ) const {
209  if ( isPhrase ( idx ) ) return;
210  const std::vector<std::string> source = getRHSSplitSource ( idx );
211  const std::vector<std::string> translation = getRHSSplitTranslation ( idx );
212  getRuleMappings ( source, translation, mappings );
213  return;
214  }
215 
225  inline const bool isAcceptedByVocabulary ( const std::size_t idx,
226  const unordered_set<std::string>& vcb ) const {
227  if ( !vcb.size() ) return true;
228  std::vector<std::string> tx = getRHSSplitTranslation ( idx );
229  for ( uint k = 0; k < tx.size(); ++k ) {
230  if ( tx[k] == "<dr>" || tx[k] == "<oov>" || tx[k] == "<s>" || tx[k] == "</s>"
231  || tx[k] == "<sep>") continue;
232  if ( !isTerminal ( tx[k] ) ) continue;
233  if ( vcb.find ( tx[k] ) == vcb.end() ) return false;
234  }
235  return true;
236  };
237 
238 };
239 
240 }
241 } // end namespaces
242 
243 #endif
std::size_t order
absolute index
GrammarData()
GrammarData constructor. Initializes GrammarData with empty information.
bool isTerminal(const std::string &word)
Determine if the element is a terminal (i.e. a word, represented by a number) or a non-terminal (i...
Contains structures and classes for GrammarData.
grammar_categories_t categories
Ordered list of non-terminals (listed in hierarchical order according to identity rules) ...
Struct containing rule positions and offsets.
unordered_map< uint, std::string > grammar_inversecategories_t
const std::string getLHS(std::size_t idx) const
Gets left-hand-side of the rule indexed by idx.
const uint getRHSTranslationSize(std::size_t idx) const
Returns the number of elements in translation for a given rule.
#define LDEBUG(msg)
const bool isPhrase(std::size_t idx) const
Checks whether the rule is a pure phrase (true) or is hierarchical, i.e. its source contains non-terminals (false)
CompareTool * ct
Pointer to a Comparison object, assumed no ownership.
const uint getRHSSourceSize(std::size_t idx) const
Gets number of elements in the RHS source.
unordered_set< std::string > patterns
Patterns in these rules.
Struct containing grammar rules.
const std::vector< std::string > getRHSSplitSource(std::size_t idx) const
Gets a split version of the RHS (source)
posindex * vpos
Sorted Indices.
const std::size_t getIdx(std::size_t idx) const
Gets the real position (line) in the (potentially unsorted) file.
const std::string getRHSSource(std::size_t idx) const
Gets right-hand-side source for a rule using rule index idx.
const std::string getRHSSource(std::size_t idx, uint rulepos) const
Gets element at position rulepos from the right-hand-side source for a rule indexed by idx...
~GrammarData()
Destructor.
const float getWeight(std::size_t idx) const
Returns weight of a rule accessed by index idx.
const bool isAcceptedByVocabulary(const std::size_t idx, const unordered_set< std::string > &vcb) const
Determines whether a particular rule is allowed within a vocabulary, i.e. all target words of the rul...
std::string filecontents
The whole grammar.
const std::string getRule(std::size_t idx) const
Gets a rule indexed by idx. Rule format: LHS RHSSource RHSTarget weight.
uint count_needles(const std::string &haystack, const char needle, std::size_t start, std::size_t end)
Convenience function that counts the number of times a needle appears.
const std::vector< std::string > getRHSSplitTranslation(std::size_t idx) const
Returns the translation as a vector of elements.
void getLinks(std::size_t idx, std::vector< unsigned > &links) const
#define LERROR(msg)
const std::string getRHSTranslation(std::size_t idx) const
Returns RHS translation part of a rule accessed by index idx.
void getMappings(std::size_t idx, unordered_map< uint, uint > *mappings) const
Returns the non-terminal mappings. For more details see getRuleMappings function. ...
grammar_inversecategories_t vcat
Contains structures and classes for GrammarData.
std::size_t sizeofvpos
Number of rules.
Class that provides basic string comparison between two const char *.
void reset()
Reset object.
Definition: bleu.hpp:14
unordered_map< std::string, uint > grammar_categories_t
void getRuleMappings(const std::vector< std::string > &source, const std::vector< std::string > &translation, unordered_map< uint, uint > *mappings)
Given a source and translation of the same rule, sharing the same non-terminals in RHS...