15 #ifndef DATA_GRAMMAR_HPP 16 #define DATA_GRAMMAR_HPP 74 if ( vpos != NULL )
delete []
vpos;
83 inline const std::string
getRule ( std::size_t idx )
const {
84 std::size_t rpos = vpos[idx].
p - vpos[idx].
o;
85 std::size_t pos = filecontents.find_first_of (
"\n", rpos );
86 return filecontents.substr ( rpos, pos - rpos );
90 inline const std::string
getLHS ( std::size_t idx )
const {
91 std::size_t rpos = vpos[idx].
p - vpos[idx].
o;
92 return filecontents.substr ( rpos, vpos[idx].p - rpos - 1 );
97 std::size_t pos = filecontents.find_first_of (
" ", vpos[idx].p );
98 return filecontents.substr ( vpos[idx].p, pos - vpos[idx].p );
102 inline const std::string
getRHSSource ( std::size_t idx, uint rulepos )
const {
103 std::size_t pos = filecontents.find_first_of (
" ", vpos[idx].p );
104 std::size_t j = vpos[idx].
p - 1, jold;
105 for ( uint k = 0; k <= rulepos; ++k ) {
107 j = filecontents.find_first_of (
"_ ", jold + 1 );
108 if ( j == std::string::npos )
109 if ( rulepos )
return "";
111 return filecontents.substr ( jold + 1, j - jold - 1 );
116 std::vector<std::string> splitsource;
117 boost::algorithm::split ( splitsource,
getRHSSource ( idx )
118 , boost::algorithm::is_any_of (
"_" ) );
124 std::size_t pos = filecontents.find_first_of (
" ", vpos[idx].p );
130 std::size_t pos = filecontents.find_first_of (
" ", vpos[idx].p ) + 1;
131 std::size_t pos2 = filecontents.find_first_of (
" ", pos );
132 return filecontents.substr ( pos, pos2 - pos );
137 std::size_t idx )
const {
138 std::vector<std::string> splittranslation;
140 boost::algorithm::is_any_of (
"_" ) );
141 return splittranslation;
146 std::size_t pos = filecontents.find_first_of (
" ", vpos[idx].p ) + 1;
147 std::size_t pos2 = filecontents.find_first_of (
" ", pos );
152 inline const float getWeight ( std::size_t idx )
const {
153 std::size_t pos1 = filecontents.find_first_of (
" ", vpos[idx].p );
154 std::size_t pos2 = filecontents.find_first_of (
" ", pos1 + 1 );
155 std::size_t pos3 = filecontents.find_first_of (
" \t\n\0", pos2 + 1 );
156 return ucam::util::toNumber<float> ( filecontents.substr ( pos2,
163 , std::vector<unsigned> &links )
const {
165 using namespace boost::algorithm;
166 size_t pos1 = filecontents.find_first_of (
" ", vpos[idx].p );
167 size_t pos2 = filecontents.find_first_of (
" ", pos1 + 1 );
168 size_t pos3 = filecontents.find_first_of (
"\t\n\0", pos2 + 1 );
169 if (filecontents[pos3] ==
'\t') {
170 size_t pos4 = filecontents.find_first_of (
" \t\n\0", pos3 + 1 );
171 string y = filecontents.substr ( pos3 + 1, pos4 - pos3 - 1);
172 LDEBUG(
"Links=[" << y <<
"]");
174 split(x, y, is_any_of(
"_"));
175 if (links.size() != x.size()) {
176 LERROR(
"Houston! " << idx <<
"=>" << y <<
",x.size=" << x.size() <<
",links.size=" << links.size() );
179 for (
unsigned k = 0; k < x.size(); ++k) {
180 LDEBUG(
"x at " << k <<
"=" << x[k] <<
";");
181 ucam::util::toNumber<unsigned>(
"0");
182 ucam::util::toNumber<unsigned>(
"1");
183 links[k] = ucam::util::toNumber<unsigned>(x[k]);
189 inline const bool isPhrase ( std::size_t idx )
const {
190 std::size_t pos = filecontents.find_first_of (
" ", vpos[idx].p );
191 for (
const char *c = filecontents.c_str() + vpos[idx].
p;
192 c <= filecontents.c_str() + pos; ++c )
193 if ( *c >=
'A' && *c <=
'Z' )
return false;
198 inline const std::size_t
getIdx ( std::size_t idx )
const {
199 return vpos[idx].
order;
208 unordered_map<uint, uint> *mappings )
const {
226 const unordered_set<std::string>& vcb )
const {
227 if ( !vcb.size() )
return true;
229 for ( uint k = 0; k < tx.size(); ++k ) {
230 if ( tx[k] ==
"<dr>" || tx[k] ==
"<oov>" || tx[k] ==
"<s>" || tx[k] ==
"</s>" 231 || tx[k] ==
"<sep>")
continue;
233 if ( vcb.find ( tx[k] ) == vcb.end() )
return false;
std::size_t order
absolute index
GrammarData()
GrammarData constructor. Initializes GrammarData with empty information.
bool isTerminal(const std::string &word)
Determine if the element is a terminal (i.e. a word, represented by a number) or a non-terminal (i...
Contains structures and classes for GrammarData.
grammar_categories_t categories
Ordered list of non-terminals (listed in hierarchical order according to identity rules) ...
Struct containing rule positions and offsets.
unordered_map< uint, std::string > grammar_inversecategories_t
const std::string getLHS(std::size_t idx) const
Gets left-hand-side of the rule indexed by idx.
const uint getRHSTranslationSize(std::size_t idx) const
Returns the number of elements in translation for a given rule.
const bool isPhrase(std::size_t idx) const
Checks whether the rule is a phrase, i.e. not hierarchical (its RHS source contains no uppercase non-terminal symbols).
CompareTool * ct
Pointer to a Comparison object, assumed no ownership.
const uint getRHSSourceSize(std::size_t idx) const
Gets number of elements in the RHS source.
unordered_set< std::string > patterns
Patterns in these rules.
Struct containing grammar rules.
const std::vector< std::string > getRHSSplitSource(std::size_t idx) const
Gets a split version of the RHS (source).
posindex * vpos
Sorted Indices.
const std::size_t getIdx(std::size_t idx) const
Gets the real position (line) in the (potentially unsorted) file.
const std::string getRHSSource(std::size_t idx) const
Gets right-hand-side source for a rule using rule index idx.
const std::string getRHSSource(std::size_t idx, uint rulepos) const
Gets element at position rulepos from the right-hand-side source for a rule indexed by idx...
~GrammarData()
Destructor.
const float getWeight(std::size_t idx) const
Returns weight of a rule accessed by index idx.
const bool isAcceptedByVocabulary(const std::size_t idx, const unordered_set< std::string > &vcb) const
Determines whether a particular rule is allowed within a vocabulary, i.e. all target words of the rul...
std::string filecontents
The whole grammar.
const std::string getRule(std::size_t idx) const
Gets a rule indexed by idx. Rule format: LHS RHSSource RHSTarget weight.
uint count_needles(const std::string &haystack, const char needle, std::size_t start, std::size_t end)
Convenience function that counts the number of times a needle appears.
const std::vector< std::string > getRHSSplitTranslation(std::size_t idx) const
Returns the translation as a vector of elements.
void getLinks(std::size_t idx, std::vector< unsigned > &links) const
const std::string getRHSTranslation(std::size_t idx) const
Returns RHS translation part of a rule accessed by index idx.
void getMappings(std::size_t idx, unordered_map< uint, uint > *mappings) const
Returns the non-terminal mappings. For more details see getRuleMappings function. ...
grammar_inversecategories_t vcat
std::size_t sizeofvpos
Number of rules.
void reset()
Reset object.
unordered_map< std::string, uint > grammar_categories_t
void getRuleMappings(const std::vector< std::string > &source, const std::vector< std::string > &translation, unordered_map< uint, uint > *mappings)
Given a source and translation of the same rule, sharing the same non-terminals in RHS...