tutorial/task_8grammar_8hpp_source.html

 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use these files except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //    http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 // Copyright 2012 - Gonzalo Iglesias, Adrià de Gispert, William Byrne

 #ifndef RULEFILETASK_HPP
 #define RULEFILETASK_HPP

 #include "task.grammar.nonterminalhierarchy.hpp"

 namespace ucam {
 namespace hifst {

 /**
  * \brief Task class that loads a hierarchical grammar into memory.
  *
  * Each rule line read from the grammar has its feature weights collapsed into
  * a single weight (dot product with the grammar scales), is appended to
  * GrammarData::filecontents and is indexed by the position of its second
  * field. Indices are sorted through a priority queue and finally copied into
  * GrammarData::vpos. While loading, the left-hand sides and identity rules
  * are handed to a NonTerminalHierarchy object, which is then asked for the
  * non-terminal order.
  */
 template <class Data>
 class GrammarTask: public ucam::util::TaskInterface<Data> {

   //Private variables are shown here. Private methods go after public methods
  private:

   /// Grammar file and pattern file addresses; both are evaluated with d.sidx in run().
   ucam::util::IntegerPatternAddress grammarfile_, patternfile_;
   /// Name of the grammar file loaded by the last run(); used to skip reloading.
   std::string previous_;
   /// Comparison object handed to gd_ (gd_.ct points to it).
   PatternCompareTool pct_;
   /// The grammar itself; a pointer to it is stored in d.grammar by run().
   GrammarData gd_;
   /// Offset within gd_.filecontents at which the next parsed rule starts.
   uint pos_;
   /// Queue that orders rule indices during a load. Null outside load_init()/load_sort().
   std::priority_queue<posindex, std::vector<posindex>, PosIndexCompare> *vpq_;

   /// Collects left-hand sides and identity rules to produce the non-terminal order.
   NonTerminalHierarchy nth_;

   /// Scales applied (dot product) to the weights of every rule.
   std::vector<float> grammarscales_;
   /// If not empty, file to which the non-terminal order is written.
   std::string ntorderfile_;

  public:
   /**
    * \brief Constructor.
    * \param rg                 Registry from which file names and feature weights are read.
    * \param featureweightskey  Registry key holding the feature weights string.
    * \param featureoffset      Number of leading feature weights to discard.
    *
    * Ends the program through USER_CHECK if featureoffset exceeds the number of
    * weights or if no weight is left after discarding.
    */
   GrammarTask ( ucam::util::RegistryPO const& rg
                 , std::string const& featureweightskey = HifstConstants::kGrammarFeatureweights
                     , unsigned featureoffset = 0) :
     // Initializers follow the declaration order of the members.
     grammarfile_ ( rg.get<std::string> ( HifstConstants::kGrammarLoad ) ),
     patternfile_ ( rg.get<std::string> ( HifstConstants::kGrammarStorepatterns ) ),
     previous_ ( "" ),
     pos_ ( 0 ),
     vpq_ ( 0 ),
     grammarscales_ ( ucam::util::ParseParamString<float> ( rg.get<std::string>
                      ( featureweightskey ) ) ),
     ntorderfile_ ( rg.get<std::string> ( HifstConstants::kGrammarStorentorder ) ) {
     gd_.ct = &pct_;
     if ( featureoffset ) {
       // Without this check size() - featureoffset would wrap around.
       USER_CHECK ( featureoffset <= grammarscales_.size(),
                    "Feature offset is larger than the number of feature weights." );
       grammarscales_.erase ( grammarscales_.begin(),
                              grammarscales_.begin() + featureoffset );
     }
     USER_CHECK ( grammarscales_.size(),
                  "0 feature weights. So the grammar is not a probabilistic model? Not my cup of tea." );
   }

   /**
    * \brief Constructor used for unit testing.
    * \param grammarfilekey  Passed as is to the grammar file address.
    * \param patternfilekey  Passed as is to the pattern file address.
    *
    * A single grammar scale of 1 is used and no non-terminal order file is written.
    */
   GrammarTask ( const std::string& grammarfilekey = HifstConstants::kGrammarLoad,
                 const std::string& patternfilekey = HifstConstants::kGrammarStorepatterns ) :
     grammarfile_ ( grammarfilekey ),
     patternfile_ ( patternfilekey ),
     previous_ ( "" ),
     pos_ ( 0 ),
     vpq_ ( 0 ),
     grammarscales_ ( ucam::util::ParseParamString<float> ( "1" ) ) {
     // Same wiring as the registry constructor; load_init() sets it again.
     gd_.ct = &pct_;
   }

   /// Returns a pointer to the internal GrammarData (owned by this task).
   inline GrammarData *getGrammarData() {
     return &gd_;
   }

   /**
    * \brief ucam::util::TaskInterface method implementation.
    *
    * Loads the grammar file that corresponds to d.sidx unless it is the one
    * loaded by the previous call. After a load, the set of patterns is written
    * to the pattern file if its name is not empty. In every case d.grammar is
    * pointed to the internal GrammarData.
    *
    * \param d  Data object; sidx and stats are read, grammar is written.
    * \return   Always false.
    */
   bool run ( Data& d ) {
     const std::string thisgrammarfile = grammarfile_ ( d.sidx );
     if ( thisgrammarfile == previous_ ) {
       LINFO ( "Skipping grammar loading..." );
     } else {
       FORCELINFO ( "Loading hierarchical grammar: " << thisgrammarfile );
       USER_CHECK ( ucam::util::fileExists ( thisgrammarfile ),
                    "This grammar does not exist" );
       d.stats->setTimeStart ( "load-grammar-patterns" );
       load ( thisgrammarfile );
       d.stats->setTimeEnd ( "load-grammar-patterns" );
       const std::string patternfile = patternfile_ ( d.sidx );
       if ( patternfile != "" ) {
         ucam::util::oszfstream o ( patternfile );
         for ( unordered_set<std::string>::iterator itx = gd_.patterns.begin();
               itx != gd_.patterns.end(); ++itx ) o << *itx << endl;
         o.close();
       }
       previous_ = thisgrammarfile;
     }
     d.grammar = &gd_;
     return false;
   }

   /**
    * \brief Loads rules from a grammar file.
    * \param file  Name of the file; each line is passed to parse() by readtextfile.
    */
   inline void load ( const std::string& file ) {
     load_init();
     LINFO ( "=> Loading..." << file );
     ucam::util::readtextfile<GrammarTask> ( file, *this );
     load_sort();
     LINFO ( "Done! ****" );
     generate_ntorder();
   }

   /**
    * \brief Loads rules from a stringstream.
    * \param s  Stream read line by line; each line is passed to parse().
    */
   inline void load ( std::stringstream& s ) {
     load_init();
     std::string myline;
     while ( getline ( s, myline ) ) {
       parse ( myline );
     }
     load_sort();
     LINFO ( "Done!" );
     generate_ntorder();
   }

   /// Releases the sorting queue if a load was interrupted before load_sort().
   virtual ~GrammarTask() {
     delete vpq_;
   }

  private:

   /**
    * \brief Asks nth_ for the non-terminal order and stores it in gd_.
    *
    * The order string is split on spaces and commas. The k-th item (0-based)
    * is mapped to index k+1 in gd_.vcat and gd_.categories. If ntorderfile_ is
    * not empty the mapping is also written to that file, one
    * "non-terminal<TAB>index" pair per line.
    */
   void generate_ntorder() {
     std::string ntorder;
     nth_ ( ntorder );
     LINFO ( "ntorder=" << ntorder );
     std::vector<std::string> aux;
     boost::algorithm::split ( aux, ntorder, boost::algorithm::is_any_of ( " ," ) );
     for ( uint k = 0; k < aux.size(); ++k ) {
       gd_.vcat[k + 1] = aux[k]; //Note that mapped indices always start from 1
       gd_.categories[aux[k]] = k + 1;
     }
     if (ntorderfile_ != "") {
       ucam::util::oszfstream o ( ntorderfile_ );
       for ( uint k = 0; k < gd_.vcat.size(); ++k )
         o << gd_.vcat[k + 1] << "\t" << k + 1 << std::endl;
     }
   }

   /**
    * \brief Prepares the task for a new load.
    *
    * Resets the position counter and gd_, and creates a fresh priority queue
    * whose comparator works on gd_.filecontents.
    */
   inline void load_init() {
     // NOTE(review): nth_ is not reset here; confirm that NonTerminalHierarchy
     // does not carry rules over from a previously loaded grammar.
     pos_ = 0;
     gd_.reset();
     gd_.ct = &pct_;
     // A queue can only be pending here if a previous load was interrupted
     // before load_sort(); deleting a null pointer is a no-op.
     delete vpq_;
     vpq_ = new
     std::priority_queue<posindex, std::vector<posindex>, PosIndexCompare>
     ( PosIndexCompare ( &gd_.filecontents, gd_.ct ) );
   }

   /**
    * \brief Moves the indices from the priority queue into gd_.vpos, in queue order.
    *
    * The queue is destroyed afterwards and vpq_ is set back to null.
    */
   inline void load_sort() {
     LINFO ( "Sorting indices..." );
     uint newidx = 0;
     gd_.sizeofvpos = vpq_->size();
     // Peak memory footprint here: queue and array coexist. We could avoid this
     // by enforcing sorted grammar input (although it would have to meet the
     // same pattern sorting criterion...)
     gd_.vpos = new posindex[gd_.sizeofvpos];
     LINFO ( gd_.sizeofvpos << " indices" );
     while ( !vpq_->empty() ) {
       gd_.vpos[newidx++] = vpq_->top();
       vpq_->pop();
       LDEBUG2 ( gd_.getRule ( newidx - 1 ) << " at " << gd_.vpos[newidx - 1].order );
     }
     delete vpq_;
     vpq_ = 0; // never leave a dangling pointer behind
   }

   /**
    * \brief Parses one grammar line and adds the rule to gd_.
    *
    * The line is trimmed; empty lines are ignored. A rule needs at least four
    * space-separated fields, the fourth one holding the weights; an optional
    * tab-separated tail may follow the weights. The weights are replaced by
    * their dot product with grammarscales_ and the resulting line is appended
    * to gd_.filecontents. The pattern of the second field is registered, the
    * rule index is pushed into the sorting queue, and the rule is reported to
    * nth_ (as identity rule if its pattern is "X", otherwise only the text
    * before the second field is passed).
    *
    * The program is terminated if the line has fewer than four fields.
    *
    * \param line  Rule line; it is modified in place.
    */
   __always_inline void parse ( std::string& line ) {
     using namespace std;
     using namespace ucam::util;

     boost::algorithm::trim ( line );
     if ( line == "" ) return;
     // Each search only starts if the previous separator was found. Otherwise
     // npos + 1 wraps around to 0 and a line with too few fields would
     // wrongly pass the check below.
     const size_t pos1 = line.find ( ' ' ); // src
     const size_t pos2 = ( pos1 == string::npos )
         ? string::npos : line.find ( ' ', pos1 + 1 ); // trg
     const size_t pos3 = ( pos2 == string::npos )
         ? string::npos : line.find ( ' ', pos2 + 1 ); // weight

     if (pos3 == std::string::npos) {
       LERROR("Grammar not valid. At least one weight is needed: \n=>\t" << line);
       exit(EXIT_FAILURE);
     }
     // Optional alignments. Only a tab located after the weights separator
     // counts, so that pos4 - pos3 - 1 cannot underflow.
     size_t pos4 = line.find ( '\t', pos3 + 1 );
     if (pos4 == std::string::npos) pos4 = line.size();
     LDEBUG("pos1=" << pos1 << ",pos2=" << pos2 << ",pos3=" << pos3 << ",pos4=" << pos4);

     // Collapse the feature weights into a single scaled weight.
     vector<float> weights;
     ParseParamString<float> ( line, weights, pos3 + 1 , pos4 - pos3 - 1 );
     string sweight = toString<float>
         ( dotproduct (weights, grammarscales_ ), numeric_limits<unsigned>::max() );
     trim_trailing_zeros ( sweight );
     line = ( pos4 <line.size() )
         ? line.substr ( 0, pos3 + 1 ) + sweight + line.substr(pos4)
         : line.substr ( 0, pos3 + 1 ) + sweight;


     LDEBUG("Adding line=[" << line  << "]");
     gd_.filecontents += line + '\n';

     // Locate the first character of the second field.
     posindex pi;
     unsigned cf = 2; //Second field
     char previous = ' ';
     for ( unsigned k = 0; k < line.size(); ++k ) {
       if ( previous == ' ' && line[k] != ' ' ) --cf;
       if ( !cf ) {
         pi.o = k;
         break;
       }
       previous = line[k];
     }
     pi.p = pos_ + pi.o;

     // Build the pattern of the second field: a run of digits becomes 'w', a
     // run starting with an uppercase letter becomes 'X' (digits following it
     // belong to the same run), any other character is copied as is.
     string pattern;
     bool word = false;
     bool nt = false;
     for ( unsigned k = pi.o; k < line.size(); ++k ) {
       const char c = line[k];
       if ( c == ' ' ) break;
       if ( c >= '0' && c <= '9' ) {
         if ( !word && !nt ) {
           pattern += 'w';
           word = true;
         }
       } else if ( c >= 'A' && c <= 'Z' ) {
         if ( !nt ) {
           pattern += 'X';
           nt = true;
           word = false;
         }
       } else {
         pattern += c;
         nt = word = false;
       }
     }
     gd_.patterns.insert ( pattern ); // no effect if the pattern is already there

     pi.order = vpq_->size();
     vpq_->push ( pi );
     pos_ += line.size() + 1; // + 1 for the '\n' appended to filecontents
     LDEBUG2 ( "reading rule " << line << ", at line " << pi.order << ", pattern=" <<
               pattern );
     if ( pattern == "X" ) {
       LINFO ( "Identity rule detected:" << line << "===" );
       nth_.insertIdentityRule ( line );
     } else {
       nth_.insertLHS ( line.substr ( 0, pi.o - 1 ) );
     }
   }

   // readtextfile needs access to the private parse() method.
   template <typename FM>
   friend inline void ucam::util::readtextfile ( const std::string& filename,
       FM& fm );

   ZDISALLOW_COPY_AND_ASSIGN ( GrammarTask );

 };

 }
 }  // end namespaces
 #endif
ucam::util::oszfstream
Wrapper stream class that writes to pipes, text files or gzipped files.
Definition: szfstream.hpp:200

ucam::hifst::posindex::order
std::size_t order
absolute index
Definition: data.grammar.comparetool.hpp:42

ZDISALLOW_COPY_AND_ASSIGN
#define ZDISALLOW_COPY_AND_ASSIGN(TypeName)
Definition: global_decls.hpp:60

ucam::hifst::GrammarTask::GrammarTask
GrammarTask(const std::string &grammarfilekey=HifstConstants::kGrammarLoad, const std::string &patternfilekey=HifstConstants::kGrammarStorepatterns)
Constructor used for unit testing.
Definition: task.grammar.hpp:94

ucam::hifst::PatternCompareTool
Class that provides "pattern" comparison between two const char *. The "patterns" are an abstraction ...
Definition: data.grammar.comparetool.hpp:93

ucam::util::PatternAddress
class that expands a wildcard into its actual value. This is useful e.g. for filenames ranging severa...
Definition: addresshandler.hpp:33

ucam::util::RegistryPO
Definition: registrypo.hpp:95

ucam::hifst::GrammarData::categories
grammar_categories_t categories
Ordered list of non-terminals (listed in hierarchical order according to identity rules) ...
Definition: data.grammar.hpp:68

LDEBUG2
#define LDEBUG2(msg)
Definition: logger.boost_log.hpp:97

ucam::hifst::posindex
Struct containing rule positions and offsets.
Definition: data.grammar.comparetool.hpp:36

LINFO
#define LINFO(msg)
Definition: logger.boost_log.hpp:78

HifstConstants::kGrammarFeatureweights
const std::string kGrammarFeatureweights
Definition: constants-hifst.hpp:15

FORCELINFO
#define FORCELINFO(msg)
Definition: logger.boost_log.hpp:79

ucam::util
Definition: main.applylm.hpp:25

LDEBUG
#define LDEBUG(msg)
Definition: logger.boost_log.hpp:107

ucam::hifst::GrammarData::ct
CompareTool * ct
Pointer to a Comparison object, assumed no ownership.
Definition: data.grammar.hpp:65

ucam::util::ParseParamString
std::vector< T > ParseParamString(const std::string &stringparams, size_t pos=0)
Function to parse string of parameters, e.g. separated by commas.
Definition: params.hpp:51

ucam::hifst::GrammarData::patterns
unordered_set< std::string > patterns
Patterns in these rules.
Definition: data.grammar.hpp:63

ucam::hifst::GrammarData
Struct containing grammar rules.
Definition: data.grammar.hpp:42

ucam::hifst::GrammarData::vpos
posindex * vpos
Sorted Indices.
Definition: data.grammar.hpp:59

ucam::hifst::GrammarTask
Task class that loads a grammar into memory.
Definition: task.grammar.hpp:38

HifstConstants
Definition: constants-fsttools.hpp:4

ucam::util::TaskInterface
Templated (hybrid) Interface for Task classes.
Definition: taskinterface.hpp:37

ucam::util::getline
iszfstream & getline(iszfstream &izs, std::string &line)
Definition: szfstream.hpp:178

ucam::util::trim_trailing_zeros
void trim_trailing_zeros(std::string &snumber)
Definition: global_funcs.hpp:128

ucam::hifst::GrammarTask::~GrammarTask
virtual ~GrammarTask()
Definition: task.grammar.hpp:175

ucam::hifst::posindex::o
short o
offset
Definition: data.grammar.comparetool.hpp:40

ucam::hifst::GrammarData::filecontents
std::string filecontents
The whole grammar.
Definition: data.grammar.hpp:57

ucam::hifst::GrammarData::getRule
const std::string getRule(std::size_t idx) const
Gets a rule indexed by idx. Rule format: LHS RHSSource RHSTarget weight.
Definition: data.grammar.hpp:83

ucam::hifst::GrammarTask::load
void load(const std::string &file)
Loads rules from a grammar file.
Definition: task.grammar.hpp:148

ucam::util::dotproduct
float dotproduct(std::vector< float > &v1, std::vector< float > &v2)
Implements dot product.
Definition: global_funcs.hpp:183

ucam::util::readtextfile
void readtextfile(const std::string &filename, FM &fm)
Function that reads from a file. Templated on any external class with a parse method.
Definition: szfstream.hpp:359

ucam::util::fileExists
bool fileExists(const std::string &fileName)
Definition: global_funcs.hpp:171

task.grammar.nonterminalhierarchy.hpp
This class automatically decides the hierarchy of non-terminals.

ucam::hifst::PosIndexCompare
Functor Class that provides comparison across the posindex structure. This is typically used e...
Definition: data.grammar.comparetool.hpp:68

USER_CHECK
#define USER_CHECK(exp, comment)
Tests whether exp is true. If not, comment is printed and program ends.
Definition: custom_assert.hpp:41

ucam::hifst::posindex::p
std::size_t p
position
Definition: data.grammar.comparetool.hpp:38

ucam::hifst::GrammarTask::getGrammarData
GrammarData * getGrammarData()
Returns GrammarData.
Definition: task.grammar.hpp:106

HifstConstants::kGrammarStorepatterns
const std::string kGrammarStorepatterns
Definition: constants-hifst.hpp:16

LERROR
#define LERROR(msg)
Definition: logger.boost_log.hpp:119

ucam::hifst::GrammarTask::run
bool run(Data &d)
ucam::util::TaskInterface mandatory method implementation. This method loads the hierarchical grammar...
Definition: task.grammar.hpp:117

ucam::hifst::NonTerminalHierarchy::insertIdentityRule
void insertIdentityRule(const std::string &identityrule)
Method to store identity rules, i.e. S -> X X , etc.
Definition: task.grammar.nonterminalhierarchy.hpp:47

ucam::hifst::NonTerminalHierarchy
This is a functor with additional methods to include relevant rules (i.e. identify SCFG rules...
Definition: task.grammar.nonterminalhierarchy.hpp:33

HifstConstants::kGrammarStorentorder
const std::string kGrammarStorentorder
Definition: constants-hifst.hpp:17

ucam::hifst::GrammarData::vcat
grammar_inversecategories_t vcat
Definition: data.grammar.hpp:69

ucam::hifst::GrammarTask::GrammarTask
GrammarTask(ucam::util::RegistryPO const &rg, std::string const &featureweightskey=HifstConstants::kGrammarFeatureweights, unsigned featureoffset=0)
Constructor.
Definition: task.grammar.hpp:67

ucam::hifst::NonTerminalHierarchy::insertLHS
void insertLHS(const std::string &nt)
Definition: task.grammar.nonterminalhierarchy.hpp:53

ucam::hifst::GrammarData::sizeofvpos
std::size_t sizeofvpos
Number of rules.
Definition: data.grammar.hpp:61

ucam::hifst::GrammarData::reset
void reset()
Reset object.
Definition: data.grammar.hpp:72

ucam
Definition: bleu.hpp:14

HifstConstants::kGrammarLoad
const std::string kGrammarLoad
Definition: constants-hifst.hpp:14

ucam::hifst::GrammarTask::load
void load(std::stringstream &s)
Loads rules from a stringstream.
Definition: task.grammar.hpp:164

ucam::util::oszfstream::close
void close()
Closes the file.
Definition: szfstream.hpp:323