Cambridge SMT System
task.patternstoinstances.hpp
Go to the documentation of this file.
1 // Licensed under the Apache License, Version 2.0 (the "License");
2 // you may not use these files except in compliance with the License.
3 // You may obtain a copy of the License at
4 //
5 // http://www.apache.org/licenses/LICENSE-2.0
6 //
7 // Unless required by applicable law or agreed to in writing, software
8 // distributed under the License is distributed on an "AS IS" BASIS,
9 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 // See the License for the specific language governing permissions and
11 // limitations under the License.
12 
13 // Copyright 2012 - Gonzalo Iglesias, Adrià de Gispert, William Byrne
14 
15 #ifndef PATTERNTOINSTANCESTASK_HPP
16 #define PATTERNTOINSTANCESTASK_HPP
17 
26 namespace ucam {
27 namespace hifst {
28 
35 template <class Data>
37 
38  //Private variables are shown here. Private methods go after public methods
39  private:
41  unsigned maxspan_;
42 
44  unsigned gapmaxspan_;
47 
48  public:
54  maxspan_ ( rg.get<unsigned> ( "patternstoinstances.maxspan" ) ),
55  gapmaxspan_ ( rg.get<unsigned> ( "patternstoinstances.gapmaxspan" ) ),
56  instancefile_ ( rg.get<std::string> ( "patternstoinstances.store" ) ) {
57  LDEBUG ( "Ready!" );
58  };
59 
65  bool run ( Data& d ) {
66  LINFO ( "instancing " << d.grammar->patterns.size() <<
67  " patterns over this sentence:" << d.sentence );
68  d.stats->setTimeStart ("instantiate-patterns");
69  instantiatePatternsHash ( d );
70  d.stats->setTimeEnd ("instantiate-patterns");
71  writeHashToFile ( d );
72  LINFO ( "Finished!" );
73  return false;
74  };
75 
78 
79 #ifndef TESTING
80  private:
81 #endif
82 
88  void instantiatePatternsHash ( Data& d ) {
89  d.hpinstances.clear();
90  LINFO ( "maxspan_=" << maxspan_ << ",gapmaxspan=" << gapmaxspan_ );
91  std::vector<std::string> ss;
92  boost::algorithm::split ( ss, d.sentence, boost::algorithm::is_any_of ( " " ) );
93  const unordered_set<std::string>& patterns = d.grammar->patterns;
94  for ( unordered_set<std::string>::const_iterator itx = patterns.begin();
95  itx != patterns.end(); ++itx ) {
96  LDEBUG ( "pattern:" << *itx );
97  std::vector<std::string> spattern;
98  boost::algorithm::split ( spattern, *itx, boost::algorithm::is_any_of ( "_" ) );
99  for ( unsigned j = 0; j < ss.size(); ++j ) { // for each word in the sentence
100  std::vector< std::vector<std::string> > pinstances;
101  LDEBUG ( "starting word:" << ss[j] );
102  //Map each pattern into words.
103  //If there are gaps, then expand them from 1 to given threshold gapmaxspan
104  if ( spattern.size() <= maxspan_ && j + spattern.size() - 1 < ss.size() ) {
105  std::vector<std::string> empty;
106  pinstances.push_back ( empty ); //add empty one.
107  //Create all instances that apply to this particular pattern.
108  buildNextElementFromPattern ( spattern, ss, pinstances, j, 0 );
109  }
110  for ( unsigned k = 0; k < pinstances.size(); ++k ) {
111  LDEBUG ( "pattern:" << *itx << ":" << "Inserting in " <<
112  boost::algorithm::join ( pinstances[k],
113  "_" ) << "values=(" << j << "," << spattern.size() - 1 );
114  d.hpinstances[boost::algorithm::join ( pinstances[k],
115  "_" )].push_back ( pair<unsigned, unsigned> ( j, spattern.size() - 1 ) );
116  }
117  }
118  }
119  }
120 
131  void buildNextElementFromPattern ( std::vector<std::string>& spattern,
132  std::vector<std::string>& ss,
133  std::vector< std::vector<std::string> >& pinstances,
134  unsigned ps,
135  unsigned pp,
136  unsigned gaphistory = 0 ) {
137  LDEBUG ( "startingword:" << ss[ps] << ",thisword:" << ss[ps + pp +
138  gaphistory ] << ",thiselement:" << spattern[pp] << ",ps=" << ps << ",pp=" << pp
139  << ",spatternsize=" << spattern.size() << ",gaphistory=" << gaphistory );
140  if ( spattern[pp] == "w" ) {
141  pinstances[pinstances.size() - 1].push_back ( ucam::util::toString (
142  ss[ps + pp + gaphistory] ) );
143  if ( ( pp + 1 < spattern.size() )
144  && ( ps + spattern.size() + gaphistory <= ss.size() ) )
145  buildNextElementFromPattern ( spattern, ss, pinstances, ps, pp + 1,
146  gaphistory );
147  } else if ( spattern[pp] == "X" ) {
148  LDEBUG ( "X,with gapmaxspan=" << gapmaxspan_ );
149  pinstances[pinstances.size() - 1].push_back ( "X" );
150  std::vector<std::string> replicate = pinstances[pinstances.size() - 1];
151  for ( unsigned k = 1;
152  ( k <= gapmaxspan_ )
153  && ( pp + 1 < spattern.size() )
154  && ( ps + spattern.size() - 1 + gaphistory + k - 1 < ss.size() )
155  && ( spattern.size() + gaphistory + k - 1 <= maxspan_ );
156  ++k ) {
157  LDEBUG ( "GAPSPAN=" << k );
158  if ( k > 1 ) pinstances.push_back (
159  replicate ); //clone previous one and run recursively.
160  buildNextElementFromPattern ( spattern, ss, pinstances, ps, pp + 1,
161  gaphistory + k - 1 );
162  }
163  } else {
165  USER_CHECK ( spattern[pp] == "X" || spattern[pp] == "w", "Incorrect pattern!" );
166  }
167  };
168 
174  void writeHashToFile ( Data& d ) {
175  std::string file = instancefile_ ( d.sidx );
176  if ( file != "" ) {
177  LINFO ( "file to output:" << file );
178  ucam::util::oszfstream o ( file + ".hash" );
179  for ( unordered_map<std::string, std::vector <pair <unsigned, unsigned> > >::iterator
180  itx = d.hpinstances.begin(); itx != d.hpinstances.end(); ++itx ) {
181  o << itx->first << ":" ;
182  for ( unsigned k = 0; k < itx->second.size(); ++k )
183  o << itx->second[k].first << "," << itx->second[k].second << ";";
184  o << endl;
185  }
186  o.close();
187  }
188  };
189 
191  PatternsToInstancesTask ( unsigned maxspan_
192  , unsigned gapmaxspan_
193  , const std::string& instancefile_ ) :
194  maxspan_ ( maxspan_ ),
195  gapmaxspan_ ( gapmaxspan_ ),
196  instancefile_ ( instancefile_ ) {
197  LDEBUG ( "Ready!" );
198  };
199 
200  ZDISALLOW_COPY_AND_ASSIGN ( PatternsToInstancesTask );
201 };
202 
203 }
204 } // End namespaces
205 
206 #endif
Wrapper stream class that writes to pipes, text files or gzipped files.
Definition: szfstream.hpp:200
class that expands a wildcard into its actual value. This is useful e.g. for filenames ranging severa...
std::string toString(const T &x, uint pr=2)
Converts an arbitrary type to string Converts to string integers, floats, doubles Quits execution if ...
#define LINFO(msg)
Converts patterns to instanced patterns.
#define LDEBUG(msg)
PatternsToInstancesTask(const ucam::util::RegistryPO &rg)
Constructor.
Templated (hybrid) Interface for Task classes.
#define USER_CHECK(exp, comment)
Tests whether exp is true. If not, comment is printed and program ends.
bool run(Data &d)
Runs this task and modifies Data object inserting the instanced sentence-specific source patterns we ...
Definition: bleu.hpp:14
void close()
Closes the file.
Definition: szfstream.hpp:323