15 #ifndef WORDMAPPER_HPP 16 #define WORDMAPPER_HPP 42 inline bool operator() (
const unsigned& i1,
const unsigned& i2 )
const {
43 return strcmp ( pdata_->c_str() + pt_[i1], pdata_->c_str() + pt_[i2] ) >= 0;
79 unordered_map<std::size_t, std::string>
81 unordered_map<std::string, std::size_t> oovrwmap_;
91 WordMapper (
const std::string& wordmapfile,
bool reverse =
false ) :
94 reverse_ ( reverse ) {
95 if ( wordmapfile ==
"" ) {
96 LINFO (
"No word/integer map file!" );
99 FORCELINFO (
"Loading word mapper " << wordmapfile );
107 reverse_ ( reverse ) {
108 load ( wordmapstream );
119 bool reverse =
false ) {
120 LDEBUG (
"Searching " << is);
121 reverse ? mapstr2i ( is, os ) : mapi2str ( is, os );
122 boost::algorithm::trim ( *os );
132 int result = this->bs ( data_, is +
"\n" , 0, size_ - 1 );
133 if ( result < 0 )
return std::numeric_limits<unsigned>::max();
139 if ( pt_ == NULL )
return;
141 if ( reverse_ )
delete [] ptr_;
150 inline void set_oovwmap ( unordered_map<std::size_t, std::string>& oovmap ) {
167 while (
getline ( aux, line ) ) {
169 std::stringstream x ( line );
175 if (s != size_ - 1 ) {
176 LERROR (
"Wrong wordmap file format. Wordmap with sequential ids is required! =>" 177 << aux <<
"<->" << s);
182 LINFO (
"number of lines: " << size_ );
183 pt_ =
new unsigned[size_];
186 for (
unsigned k = 0; k < data_.size(); ++k ) {
187 if ( data_[k] ==
'\n' && ci < size_ ) {
191 if ( !reverse_ )
return;
192 LINFO (
"generating sorted indices for reverse mapping of " << size_ <<
194 std::priority_queue<unsigned, std::vector<unsigned>,
PQwmapcompare> *vpq =
new 195 std::priority_queue<unsigned, std::vector<unsigned>,
PQwmapcompare>
197 for (
unsigned k = 0; k < size_; ++k ) vpq->push ( k );
199 ptr_ =
new unsigned[size_];
200 while ( !vpq->empty() ) {
201 ptr_[k++] = vpq->top();
216 int bs ( std::string& sn, std::string needle,
unsigned low,
unsigned high ) {
217 if ( high < low )
return -1;
218 int mid = low + ( high - low ) / 2;
219 if ( strncmp ( sn.c_str() + pt_[ptr_[mid]], needle.c_str(),
220 needle.size() ) > 0 )
return bs ( sn, needle, low, mid - 1 );
221 else if ( strncmp ( sn.c_str() + pt_[ptr_[mid]], needle.c_str(),
222 needle.size() ) < 0 )
return bs ( sn, needle, mid + 1, high );
232 void mapi2str ( std::string is, std::string *os ) {
233 boost::algorithm::trim ( is );
235 if ( is ==
"" )
return;
236 std::vector<std::string> words;
237 boost::algorithm::split ( words, is, boost::algorithm::is_any_of (
" " ) );
238 for (
unsigned k = 0; k < words.size(); ++k ) {
239 unsigned index = toNumber<unsigned> ( words[k] );
240 if ( index >= size_ ) {
241 if ( index ==
OOV || index ==
DR )
continue;
242 LINFO (
"idx OOV detected:" << index );
243 USER_CHECK ( oovwmap_.find ( index ) != oovwmap_.end(),
244 "OOV index does not exist in the word map!" );
245 *os += oovwmap_[ index ] +
" ";
248 if ( index < size_ - 1 )
249 *os += data_.substr ( pt_[index], pt_[index + 1] - pt_[index] - 1 ) +
" ";
251 *os += data_.substr ( pt_[index], data_.size() - 1 ) +
" ";
261 void mapstr2i ( std::string is, std::string *os ) {
262 USER_CHECK ( reverse_,
"Reverse search not implemented for this object." );
263 boost::algorithm::trim ( is );
265 if ( is ==
"" )
return;
266 std::vector<std::string> words;
267 boost::algorithm::split ( words, is, boost::algorithm::is_any_of (
" " ) );
268 for (
unsigned k = 0; k < words.size(); ++k ) {
269 int result = this->bs ( data_, words[k] +
"\n" , 0, size_ - 1 );
271 if ( oovrwmap_.find ( words[k] ) == oovrwmap_.end() ) {
272 oovwmap_[ oovid_ ] = words[k];
273 oovrwmap_[words[k]] = oovid_++ ;
275 *os += toString<std::size_t> ( oovrwmap_[words[k]] ) +
" ";
278 *os += toString<unsigned> ( ptr_[result] ) +
" ";
void reset_oov_id()
Resets oovid to lowest value.
bool operator()(const unsigned &i1, const unsigned &i2) const
String comparison between the entries at positions i1 and i2.
WordMapper(const std::string &wordmapfile, bool reverse=false)
Constructor.
void set_oovwmap(unordered_map< std::size_t, std::string > &oovmap)
WordMapper(iszfstream &wordmapstream, bool reverse=false)
iszfstream & getline(iszfstream &izs, std::string &line)
unordered_map< std::size_t, std::string > & get_oovwmap()
Return oovwmap.
PQwmapcompare(std::string *pd, unsigned *pt)
Constructor.
#define USER_CHECK(exp, comment)
Tests whether exp is true. If not, comment is printed and program ends.
Efficiently loads a wordmap file and provides methods to map word-to-integer or integer-to-word. Hashing the wordmap entries is avoided in order to keep the memory footprint low.
Comparison functor for queue sorting.
Wrapper stream class that reads pipes, text files or gzipped files.