Cambridge SMT System
wordmapper.hpp
Go to the documentation of this file.
1 // Licensed under the Apache License, Version 2.0 (the "License");
2 // you may not use these files except in compliance with the License.
3 // You may obtain a copy of the License at
4 //
5 // http://www.apache.org/licenses/LICENSE-2.0
6 //
7 // Unless required by applicable law or agreed to in writing, software
8 // distributed under the License is distributed on an "AS IS" BASIS,
9 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 // See the License for the specific language governing permissions and
11 // limitations under the License.
12 
13 // Copyright 2012 - Gonzalo Iglesias, Adrià de Gispert, William Byrne
14 
15 #ifndef WORDMAPPER_HPP
16 #define WORDMAPPER_HPP
17 
24 namespace ucam {
25 namespace util {
26 
/**
 * \brief Comparison functor for queue sorting.
 *
 * Orders two word indices by the C strings that start at their offsets inside
 * a shared character buffer. Used as the comparator of a std::priority_queue,
 * so that the queue's top() is the index whose string compares lowest.
 */
class PQwmapcompare {
 private:
  std::string *pdata_;  // character buffer the offsets point into (not owned)
  unsigned *pt_;        // pt_[i]: offset in *pdata_ where string i starts (not owned)

 public:
  /**
   * \brief Constructor.
   * \param pd  Buffer containing the strings to compare.
   * \param pt  Table of start offsets into *pd, indexed by the values compared.
   */
  PQwmapcompare ( std::string *pd, unsigned *pt ) : pdata_ ( pd ), pt_ ( pt ) {}

  /**
   * \brief String comparison between two positions i1 and i2.
   * \return true if the string at pt_[i1] compares strictly greater than the
   *         one at pt_[i2].
   *
   * The comparison must be strict: a comparator that returns true for (a, a)
   * is not a strict weak ordering, which std::priority_queue requires.
   */
  bool operator() ( const unsigned& i1, const unsigned& i2 ) const {
    const char *base = pdata_->c_str();
    return strcmp ( base + pt_[i1], base + pt_[i2] ) > 0;
  }
};
46 
63 class WordMapper {
64 
65  private:
67  bool reverse_; // true when load() must also build the sorted index ptr_ used for word-to-integer lookups
69  unsigned *pt_; // pt_[i]: offset in data_ where the i-th word starts; NULL until load() allocates it
71  unsigned *ptr_; // word indices ordered by their strings; allocated by load() only when reverse_ is set
73  unsigned size_; // number of wordmap lines read by load()
75  std::string data_; // every word of the map, in file order, each followed by '\n'
76 
78  unsigned oovid_; // next id mapstr2i() assigns to a word missing from the map; reset_oov_id() sets it to OOVID
79  unordered_map<std::size_t, std::string>
80  oovwmap_; // OOV id -> word. Pass this one to target and we will effectively have oov passthru.
81  unordered_map<std::string, std::size_t> oovrwmap_; // word -> OOV id, filled by mapstr2i() alongside oovwmap_
82 
83  public:
91  WordMapper ( const std::string& wordmapfile, bool reverse = false ) :
92  size_ ( 0 ),
93  pt_ ( NULL ),
94  reverse_ ( reverse ) {
95  if ( wordmapfile == "" ) {
96  LINFO ( "No word/integer map file!" );
97  return;
98  }
99  FORCELINFO ( "Loading word mapper " << wordmapfile );
100  iszfstream aux ( wordmapfile );
101  load ( aux );
102  };
103 
104  WordMapper ( iszfstream& wordmapstream, bool reverse = false ) :
105  size_ ( 0 ),
106  pt_ ( NULL ),
107  reverse_ ( reverse ) {
108  load ( wordmapstream );
109  }
110 
118  inline void operator () ( const std::string& is, std::string *os,
119  bool reverse = false ) {
120  LDEBUG ("Searching " << is);
121  reverse ? mapstr2i ( is, os ) : mapi2str ( is, os );
122  boost::algorithm::trim ( *os );
123  return;
124  }
125 
131  inline unsigned operator () ( const std::string& is) {
132  int result = this->bs ( data_, is + "\n" , 0, size_ - 1 );
133  if ( result < 0 ) return std::numeric_limits<unsigned>::max();
134  return ptr_[result];
135  };
136 
139  if ( pt_ == NULL ) return;
140  delete [] pt_;
141  if ( reverse_ ) delete [] ptr_;
142  }
143 
145  inline unordered_map<std::size_t, std::string>& get_oovwmap() {
146  return oovwmap_;
147  };
148 
149  //Reset oovwmap with an external hash
150  inline void set_oovwmap ( unordered_map<std::size_t, std::string>& oovmap ) {
151  oovwmap_ = oovmap;
152  };
154  inline void reset_oov_id() {
155  oovid_ = OOVID;
156  };
157 
158  //Returns actual oovid_...
159  inline std::size_t get_oov_id() {
160  return oovid_;
161  };
162 
163  private:
165  void load ( iszfstream& aux ) {
166  std::string line;
167  while ( getline ( aux, line ) ) {
168  size_++;
169  std::stringstream x ( line );
170  std::string aux;
171  x >> aux;
172  data_ += aux + "\n";
173  unsigned s;
174  x >> s; // Integer ids discarded, but must agree with file position.
175  if (s != size_ - 1 ) {
176  LERROR ("Wrong wordmap file format. Wordmap with sequential ids is required! =>"
177  << aux << "<->" << s);
178  exit (EXIT_FAILURE);
179  }
180  }
181  aux.close();
182  LINFO ( "number of lines: " << size_ );
183  pt_ = new unsigned[size_];
184  unsigned ci = 0;
185  pt_[ci++] = 0;
186  for ( unsigned k = 0; k < data_.size(); ++k ) {
187  if ( data_[k] == '\n' && ci < size_ ) {
188  pt_[ci++] = k + 1;
189  }
190  }
191  if ( !reverse_ ) return;
192  LINFO ( "generating sorted indices for reverse mapping of " << size_ <<
193  " elements" );
194  std::priority_queue<unsigned, std::vector<unsigned>, PQwmapcompare> *vpq = new
195  std::priority_queue<unsigned, std::vector<unsigned>, PQwmapcompare>
196  ( PQwmapcompare ( &data_, pt_ ) );
197  for ( unsigned k = 0; k < size_; ++k ) vpq->push ( k );
198  unsigned k = 0;
199  ptr_ = new unsigned[size_];
200  while ( !vpq->empty() ) {
201  ptr_[k++] = vpq->top();
202  vpq->pop();
203  }
204  delete vpq;
205  LINFO ( "Done" );
206  };
207 
216  int bs ( std::string& sn, std::string needle, unsigned low, unsigned high ) {
217  if ( high < low ) return -1; // not found
218  int mid = low + ( high - low ) / 2;
219  if ( strncmp ( sn.c_str() + pt_[ptr_[mid]], needle.c_str(),
220  needle.size() ) > 0 ) return bs ( sn, needle, low, mid - 1 );
221  else if ( strncmp ( sn.c_str() + pt_[ptr_[mid]], needle.c_str(),
222  needle.size() ) < 0 ) return bs ( sn, needle, mid + 1, high );
223  return mid; // found
224  }
225 
232  void mapi2str ( std::string is, std::string *os ) {
233  boost::algorithm::trim ( is );
234  *os = "";
235  if ( is == "" ) return;
236  std::vector<std::string> words;
237  boost::algorithm::split ( words, is, boost::algorithm::is_any_of ( " " ) );
238  for ( unsigned k = 0; k < words.size(); ++k ) {
239  unsigned index = toNumber<unsigned> ( words[k] );
240  if ( index >= size_ ) {
241  if ( index == OOV || index == DR ) continue;
242  LINFO ( "idx OOV detected:" << index );
243  USER_CHECK ( oovwmap_.find ( index ) != oovwmap_.end(),
244  "OOV index does not exist in the word map!" );
245  *os += oovwmap_[ index ] + " ";
246  continue;
247  }
248  if ( index < size_ - 1 )
249  *os += data_.substr ( pt_[index], pt_[index + 1] - pt_[index] - 1 ) + " ";
250  else
251  *os += data_.substr ( pt_[index], data_.size() - 1 ) + " ";
252  }
253  };
254 
261  void mapstr2i ( std::string is, std::string *os ) {
262  USER_CHECK ( reverse_, "Reverse search not implemented for this object." );
263  boost::algorithm::trim ( is );
264  *os = "";
265  if ( is == "" ) return;
266  std::vector<std::string> words;
267  boost::algorithm::split ( words, is, boost::algorithm::is_any_of ( " " ) );
268  for ( unsigned k = 0; k < words.size(); ++k ) {
269  int result = this->bs ( data_, words[k] + "\n" , 0, size_ - 1 );
270  if ( result < 0 ) { // OOVs handler.
271  if ( oovrwmap_.find ( words[k] ) == oovrwmap_.end() ) {
272  oovwmap_[ oovid_ ] = words[k];
273  oovrwmap_[words[k]] = oovid_++ ;
274  }
275  *os += toString<std::size_t> ( oovrwmap_[words[k]] ) + " ";
276  continue;
277  }
278  *os += toString<unsigned> ( ptr_[result] ) + " ";
279  }
280  };
281  DISALLOW_COPY_AND_ASSIGN ( WordMapper );
282 
283 };
284 
285 }
286 } // end namespaces
287 
288 #endif
~WordMapper()
Destructor.
Definition: wordmapper.hpp:138
void reset_oov_id()
Resets oovid to lowest value.
Definition: wordmapper.hpp:154
#define LINFO(msg)
bool operator()(const unsigned &i1, const unsigned &i2) const
string comparison between two positions i1 and i2.
Definition: wordmapper.hpp:42
WordMapper(const std::string &wordmapfile, bool reverse=false)
Constructor.
Definition: wordmapper.hpp:91
void set_oovwmap(unordered_map< std::size_t, std::string > &oovmap)
Definition: wordmapper.hpp:150
#define FORCELINFO(msg)
#define LDEBUG(msg)
#define DR
WordMapper(iszfstream &wordmapstream, bool reverse=false)
Definition: wordmapper.hpp:104
iszfstream & getline(iszfstream &izs, std::string &line)
Definition: szfstream.hpp:178
unordered_map< std::size_t, std::string > & get_oovwmap()
Return oovwmap.
Definition: wordmapper.hpp:145
std::size_t get_oov_id()
Definition: wordmapper.hpp:159
#define OOVID
PQwmapcompare(std::string *pd, unsigned *pt)
constructor
Definition: wordmapper.hpp:39
void close()
Closes file.
Definition: szfstream.hpp:147
#define USER_CHECK(exp, comment)
Tests whether exp is true. If not, comment is printed and program ends.
Efficiently loads a wordmap file and provides methods to map word-to-integer or integer-to-word. Hashing the wordmap entries is avoided in order to keep the memory footprint low.
Definition: wordmapper.hpp:63
#define OOV
#define LERROR(msg)
comparison functor for queue sorting.
Definition: wordmapper.hpp:32
Wrapper stream class that reads pipes, text files or gzipped files.
Definition: szfstream.hpp:34
Definition: bleu.hpp:14