tutorial/data_8grammar_8hpp_source.html

 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use these files except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //    http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 // Copyright 2012 - Gonzalo Iglesias, Adrià de Gispert, William Byrne

 #ifndef DATA_GRAMMAR_HPP
 #define DATA_GRAMMAR_HPP

 #include "data.grammar.utilities.hpp"
 #include "data.grammar.comparetool.hpp"

 namespace ucam {
 namespace hifst {
 struct GrammarData {

   GrammarData() :
     vpos ( NULL ),
     sizeofvpos ( 0 ),
     ct ( NULL ) {
   };

   ~GrammarData() {
     if ( vpos != NULL ) delete [] vpos;
   }

   std::string filecontents;
   posindex *vpos;
   std::size_t sizeofvpos;
   unordered_set<std::string> patterns;
   CompareTool *ct;

   grammar_categories_t categories;
   grammar_inversecategories_t vcat;

   inline void reset() {
     filecontents = "";
     if ( vpos != NULL ) delete [] vpos;
     patterns.clear();
     categories.clear();
     vcat.clear();
     sizeofvpos = 0;
     ct = NULL;
   }

   inline const std::string getRule ( std::size_t idx ) const {
     std::size_t rpos = vpos[idx].p - vpos[idx].o;
     std::size_t pos = filecontents.find_first_of ( "\n", rpos );
     return filecontents.substr ( rpos, pos - rpos );
   }

   inline const std::string getLHS ( std::size_t idx ) const {
     std::size_t rpos = vpos[idx].p - vpos[idx].o;
     return filecontents.substr ( rpos, vpos[idx].p - rpos - 1 );
   }

   inline const std::string getRHSSource ( std::size_t idx ) const {
     std::size_t pos = filecontents.find_first_of ( " ", vpos[idx].p );
     return filecontents.substr ( vpos[idx].p, pos - vpos[idx].p );
   }

   inline const std::string getRHSSource ( std::size_t idx, uint rulepos ) const {
     std::size_t pos = filecontents.find_first_of ( " ", vpos[idx].p );
     std::size_t j = vpos[idx].p - 1, jold;
     for ( uint k = 0; k <= rulepos; ++k ) {
       jold = j;
       j = filecontents.find_first_of ( "_ ", jold + 1 );
       if ( j == std::string::npos )
         if ( rulepos ) return "";
     }
     return filecontents.substr ( jold + 1, j - jold - 1 );
   }

   inline const std::vector<std::string> getRHSSplitSource (std::size_t idx ) const {
     std::vector<std::string> splitsource;
     boost::algorithm::split ( splitsource, getRHSSource ( idx )
                               , boost::algorithm::is_any_of ( "_" ) );
     return splitsource;
   }

   inline const uint getRHSSourceSize ( std::size_t idx ) const {
     std::size_t pos = filecontents.find_first_of ( " ", vpos[idx].p );
     return ucam::util::count_needles ( filecontents, '_', vpos[idx].p, pos ) + 1;
   }

   inline const std::string getRHSTranslation ( std::size_t idx ) const {
     std::size_t pos = filecontents.find_first_of ( " ", vpos[idx].p ) + 1;
     std::size_t pos2 = filecontents.find_first_of ( " ", pos );
     return filecontents.substr ( pos, pos2 - pos );
   }

   inline const std::vector<std::string> getRHSSplitTranslation (
     std::size_t idx ) const {
     std::vector<std::string> splittranslation;
     boost::algorithm::split ( splittranslation, getRHSTranslation ( idx ),
                               boost::algorithm::is_any_of ( "_" ) );
     return splittranslation;
   }

   inline const uint getRHSTranslationSize ( std::size_t idx ) const {
     std::size_t pos = filecontents.find_first_of ( " ", vpos[idx].p ) + 1;
     std::size_t pos2 = filecontents.find_first_of ( " ", pos );
     return ucam::util::count_needles ( filecontents, '_', pos, pos2 ) + 1;
   }

   inline const float getWeight ( std::size_t idx ) const {
     std::size_t pos1 = filecontents.find_first_of ( " ", vpos[idx].p );
     std::size_t pos2 = filecontents.find_first_of ( " ", pos1 + 1 );
     std::size_t pos3 = filecontents.find_first_of ( " \t\n\0", pos2 + 1 );
     return ucam::util::toNumber<float> ( filecontents.substr ( pos2,
                                          pos3 - pos2 ) );
   }

   // Affiliation or alignments go physically after the weight, so that
   // it is an optional field.
   void getLinks(std::size_t idx
                 , std::vector<unsigned> &links ) const {
     using namespace std;
     using namespace boost::algorithm;
     size_t pos1 = filecontents.find_first_of ( " ", vpos[idx].p );
     size_t pos2 = filecontents.find_first_of ( " ", pos1 + 1 );
     size_t pos3 = filecontents.find_first_of ( "\t\n\0", pos2 + 1 );
     if (filecontents[pos3] == '\t') {
       size_t pos4 = filecontents.find_first_of ( " \t\n\0", pos3 + 1 );
       string y = filecontents.substr ( pos3 + 1, pos4 - pos3 - 1);
       LDEBUG("Links=[" << y << "]");
       vector<string> x;
       split(x, y, is_any_of("_"));
       if (links.size() != x.size()) {
         LERROR("Houston! " << idx << "=>" << y << ",x.size=" << x.size() << ",links.size=" << links.size() );
         exit(EXIT_FAILURE);
       }
       for (unsigned k = 0; k < x.size(); ++k) {
         LDEBUG("x at " << k << "=" << x[k] << ";");
         ucam::util::toNumber<unsigned>("0");
         ucam::util::toNumber<unsigned>("1");
         links[k] = ucam::util::toNumber<unsigned>(x[k]);
       }
     }
   }

   inline const bool isPhrase ( std::size_t idx ) const {
     std::size_t pos = filecontents.find_first_of ( " ", vpos[idx].p );
     for ( const char *c = filecontents.c_str() + vpos[idx].p;
           c <= filecontents.c_str() + pos; ++c )
       if ( *c >= 'A' && *c <= 'Z' ) return false; //has non-terminals.
     return true; //pure phrase.
   }

   inline const std::size_t getIdx ( std::size_t idx ) const {
     return vpos[idx].order;
   }

   void getMappings ( std::size_t idx,
                      unordered_map<uint, uint> *mappings ) const {
     if ( isPhrase ( idx ) ) return;
     const std::vector<std::string> source = getRHSSplitSource ( idx );
     const std::vector<std::string> translation = getRHSSplitTranslation ( idx );
     getRuleMappings ( source, translation, mappings );
     return;
   }

   inline const bool isAcceptedByVocabulary ( const std::size_t idx,
       const unordered_set<std::string>& vcb ) const {
     if ( !vcb.size() ) return true;
     std::vector<std::string> tx = getRHSSplitTranslation ( idx );
     for ( uint k = 0; k < tx.size(); ++k ) {
       if ( tx[k] == "<dr>" || tx[k] == "<oov>" || tx[k] == "<s>" || tx[k] == "</s>"
            || tx[k] == "<sep>") continue;
       if ( !isTerminal ( tx[k] ) ) continue;
       if ( vcb.find ( tx[k] ) == vcb.end() ) return false;
     }
     return true;
   };

 };

 }
 } // end namespaces

 #endif
ucam::hifst::posindex::order
std::size_t order
absolute index
Definition: data.grammar.comparetool.hpp:42

ucam::hifst::GrammarData::GrammarData
GrammarData()
GrammarData constructor. Initializes GrammarData with empty information.
Definition: data.grammar.hpp:45

ucam::hifst::isTerminal
bool isTerminal(const std::string &word)
Determine if the element is a terminal (i.e. a word, represented by a number) or a non-terminal (i...
Definition: data.grammar.utilities.hpp:44

data.grammar.utilities.hpp
Contains structures and classes for GrammarData.

ucam::hifst::GrammarData::categories
grammar_categories_t categories
Ordered list of non-terminals (listed in hierarchical order according to identity rules) ...
Definition: data.grammar.hpp:68

ucam::hifst::posindex
Struct containing rule positions and offsets.
Definition: data.grammar.comparetool.hpp:36

ucam::hifst::grammar_inversecategories_t
unordered_map< uint, std::string > grammar_inversecategories_t
Definition: defs.grammar.hpp:28

ucam::hifst::GrammarData::getLHS
const std::string getLHS(std::size_t idx) const
Gets left-hand-side of the rule indexed by idx.
Definition: data.grammar.hpp:90

ucam::hifst::GrammarData::getRHSTranslationSize
const uint getRHSTranslationSize(std::size_t idx) const
Returns the number of elements in translation for a given rule.
Definition: data.grammar.hpp:145

LDEBUG
#define LDEBUG(msg)
Definition: logger.boost_log.hpp:107

ucam::hifst::GrammarData::isPhrase
const bool isPhrase(std::size_t idx) const
Checks whether the rule is a phrase (its source side has no non-terminals) or not (i.e. it is hierarchical).
Definition: data.grammar.hpp:189

ucam::hifst::GrammarData::ct
CompareTool * ct
Pointer to a Comparison object, assumed no ownership.
Definition: data.grammar.hpp:65

ucam::hifst::GrammarData::getRHSSourceSize
const uint getRHSSourceSize(std::size_t idx) const
Gets number of elements in the RHS source.
Definition: data.grammar.hpp:123

ucam::hifst::GrammarData::patterns
unordered_set< std::string > patterns
Patterns in these rules.
Definition: data.grammar.hpp:63

ucam::hifst::GrammarData
Struct containing grammar rules.
Definition: data.grammar.hpp:42

ucam::hifst::GrammarData::getRHSSplitSource
const std::vector< std::string > getRHSSplitSource(std::size_t idx) const
Gets a split version of the RHS (source).
Definition: data.grammar.hpp:115

ucam::hifst::GrammarData::vpos
posindex * vpos
Sorted Indices.
Definition: data.grammar.hpp:59

ucam::hifst::GrammarData::getIdx
const std::size_t getIdx(std::size_t idx) const
Gets the real position (line) in the (potentially unsorted) file.
Definition: data.grammar.hpp:198

ucam::hifst::GrammarData::getRHSSource
const std::string getRHSSource(std::size_t idx) const
Gets right-hand-side source for a rule using rule index idx.
Definition: data.grammar.hpp:96

ucam::hifst::GrammarData::getRHSSource
const std::string getRHSSource(std::size_t idx, uint rulepos) const
Gets element at position rulepos from the right-hand-side source for a rule indexed by idx...
Definition: data.grammar.hpp:102

ucam::hifst::GrammarData::~GrammarData
~GrammarData()
Destructor.
Definition: data.grammar.hpp:52

ucam::hifst::GrammarData::getWeight
const float getWeight(std::size_t idx) const
Returns weight of a rule accessed by index idx.
Definition: data.grammar.hpp:152

ucam::hifst::posindex::o
short o
offset
Definition: data.grammar.comparetool.hpp:40

ucam::hifst::GrammarData::isAcceptedByVocabulary
const bool isAcceptedByVocabulary(const std::size_t idx, const unordered_set< std::string > &vcb) const
Determines whether a particular rule is allowed within a vocabulary, i.e. all target words of the rul...
Definition: data.grammar.hpp:225

ucam::hifst::GrammarData::filecontents
std::string filecontents
The whole grammar.
Definition: data.grammar.hpp:57

ucam::hifst::GrammarData::getRule
const std::string getRule(std::size_t idx) const
Gets a rule indexed by idx. Rule format: LHS RHSSource RHSTarget weight.
Definition: data.grammar.hpp:83

ucam::util::count_needles
uint count_needles(const std::string &haystack, const char needle, std::size_t start, std::size_t end)
Convenience function that counts the number of times a needle appears.
Definition: global_funcs.hpp:107

ucam::hifst::GrammarData::getRHSSplitTranslation
const std::vector< std::string > getRHSSplitTranslation(std::size_t idx) const
Returns the translation as a vector of elements.
Definition: data.grammar.hpp:136

ucam::hifst::posindex::p
std::size_t p
position
Definition: data.grammar.comparetool.hpp:38

ucam::hifst::GrammarData::getLinks
void getLinks(std::size_t idx, std::vector< unsigned > &links) const
Definition: data.grammar.hpp:162

LERROR
#define LERROR(msg)
Definition: logger.boost_log.hpp:119

ucam::hifst::GrammarData::getRHSTranslation
const std::string getRHSTranslation(std::size_t idx) const
Returns RHS translation part of a rule accessed by index idx.
Definition: data.grammar.hpp:129

ucam::hifst::GrammarData::getMappings
void getMappings(std::size_t idx, unordered_map< uint, uint > *mappings) const
Returns the non-terminal mappings. For more details see getRuleMappings function. ...
Definition: data.grammar.hpp:207

ucam::hifst::GrammarData::vcat
grammar_inversecategories_t vcat
Definition: data.grammar.hpp:69

data.grammar.comparetool.hpp
Contains structures and classes for GrammarData.

ucam::hifst::GrammarData::sizeofvpos
std::size_t sizeofvpos
Number of rules.
Definition: data.grammar.hpp:61

ucam::hifst::CompareTool
Class that provides basic string comparison between two const char *.
Definition: data.grammar.comparetool.hpp:50

ucam::hifst::GrammarData::reset
void reset()
Reset object.
Definition: data.grammar.hpp:72

ucam
Definition: bleu.hpp:14

ucam::hifst::grammar_categories_t
unordered_map< std::string, uint > grammar_categories_t
Definition: defs.grammar.hpp:27

ucam::hifst::getRuleMappings
void getRuleMappings(const std::vector< std::string > &source, const std::vector< std::string > &translation, unordered_map< uint, uint > *mappings)
Given a source and translation of the same rule, sharing the same non-terminals in RHS...
Definition: data.grammar.utilities.hpp:73