15 #ifndef TASK_PREPRO_HPP 16 #define TASK_PREPRO_HPP 42 bool addsentencemarkers_;
48 const std::string wordmapkey_;
51 std::string tokenizelanguage_;
63 wordmapkey_ ( wordmapkey ),
65 src2idxwmap_ ( NULL ),
81 bool run ( Data& d ) {
82 LINFO (
"Reading sentence #" << d.sidx );
83 d.stats->setTimeStart (
"sent-dec" );
85 "Empty source sentence?" ) )
return true;
87 if ( tokenizeinput_ ) {
90 LINFO (
"Tokenized :" << d.tokenizedsentence );
91 }
else d.tokenizedsentence = d.originalsentence;
92 if ( addsentencemarkers_ )
94 if ( d.wm.find ( wordmapkey_ ) != d.wm.end() ) src2idxwmap_ = d.wm[wordmapkey_];
95 else src2idxwmap_ = NULL;
98 ( *src2idxwmap_ ) ( d.tokenizedsentence, &d.sentence , true );
100 LINFO (
"mapped:" << d.sentence );
101 }
else d.sentence = d.tokenizedsentence;
104 "Wrong sentence format, should be a sequence of numbers at this point!" ) ) {
void trim_spaces(const std::string &input, std::string *output)
Trims spaces at the edges (leaving none) and also between words (leaving only one space).
void reset_oov_id()
Resets oovid to lowest value.
Reads text file, performs tokenization and integer-mapping.
const std::string kPreproTokenizeEnable
const std::string kPreproWordmapLoad
void tokenize(const std::string &is, std::string *os, const std::string languagespecific="")
Not implemented; just passes the input through.
PreProTask(const ucam::util::RegistryPO &rg, const std::string &wordmapkey=HifstConstants::kPreproWordmapLoad)
Constructor.
void setTokenize(bool tok)
Templated (hybrid) Interface for Task classes.
unordered_map< std::size_t, std::string > & get_oovwmap()
Return oovwmap.
bool exists(const std::string &source, const std::string &needle)
Convenience function to find out whether a needle exists in a text.
const std::string kPreproAddsentencemarkers
void addSentenceMarkers(std::string &sentence)
Adds sentence markers <s>, </s> to a sentence.
#define USER_CHECK(exp, comment)
Tests whether exp is true. If not, comment is printed and program ends.
Efficiently loads a wordmap file and provides methods to map word-to-integer or integer-to-word. To avoid memory footprint issues, the wordmap entries are not hashed.
bool run(Data &d)
Reads an input sentence, tokenizes and integer-maps.
const std::string kPreproTokenizeLanguage
bool validate_source_sentence(const std::string &s)
Checks whether the sentence is in format ^\d+( \d+)*$.