Cambridge SMT System
task.prepro.hpp
Go to the documentation of this file.
1 // Licensed under the Apache License, Version 2.0 (the "License");
2 // you may not use these files except in compliance with the License.
3 // You may obtain a copy of the License at
4 //
5 // http://www.apache.org/licenses/LICENSE-2.0
6 //
7 // Unless required by applicable law or agreed to in writing, software
8 // distributed under the License is distributed on an "AS IS" BASIS,
9 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 // See the License for the specific language governing permissions and
11 // limitations under the License.
12 
13 // Copyright 2012 - Gonzalo Iglesias, Adrià de Gispert, William Byrne
14 
15 #ifndef TASK_PREPRO_HPP
16 #define TASK_PREPRO_HPP
17 
25 namespace ucam {
26 namespace hifst {
27 
32 template <class Data>
34 
35  //Private variables are shown here. Private methods go after public methods
36  private:
37 
38  // Tokenize the input or not
39  bool tokenizeinput_;
40 
41  // Add sentence markers (if missing) or not
42  bool addsentencemarkers_;
43 
44  // Pointer to a wordmap object
45  ucam::util::WordMapper *src2idxwmap_;
46 
47  // Key to find wordmap in the data object
48  const std::string wordmapkey_;
49 
50  // Tokenization language. Only French is specified; there is currently a common scheme for English|Spanish
51  std::string tokenizelanguage_;
52 
53  public:
54 
61  const std::string& wordmapkey = HifstConstants::kPreproWordmapLoad
62  ) :
63  wordmapkey_ ( wordmapkey ),
64  addsentencemarkers_ ( rg.exists ( HifstConstants::kPreproAddsentencemarkers ) ),
65  src2idxwmap_ ( NULL ),
66  tokenizeinput_ ( rg.getBool ( HifstConstants::kPreproTokenizeEnable ) ),
67  tokenizelanguage_ (rg.exists (HifstConstants::kPreproTokenizeLanguage)
68  ? rg.get<std::string> (HifstConstants::kPreproTokenizeLanguage) : "") {
69  };
70 
72  }
73 
74  inline void setTokenize ( bool tok ) {
75  tokenizeinput_ = tok;
76  };
77 
81  bool run ( Data& d ) {
82  LINFO ( "Reading sentence #" << d.sidx );
83  d.stats->setTimeStart ( "sent-dec" );
84  if ( !USER_CHECK ( d.originalsentence != "",
85  "Empty source sentence?" ) ) return true;
86  ucam::util::trim_spaces ( d.originalsentence, &d.originalsentence );
87  if ( tokenizeinput_ ) {
88  ucam::util::tokenize ( d.originalsentence, &d.tokenizedsentence ,
89  tokenizelanguage_ );
90  LINFO ( "Tokenized :" << d.tokenizedsentence );
91  } else d.tokenizedsentence = d.originalsentence;
92  if ( addsentencemarkers_ )
93  ucam::util::addSentenceMarkers ( d.tokenizedsentence );
94  if ( d.wm.find ( wordmapkey_ ) != d.wm.end() ) src2idxwmap_ = d.wm[wordmapkey_];
95  else src2idxwmap_ = NULL;
96  if ( src2idxwmap_ ) {
97  src2idxwmap_->reset_oov_id();
98  ( *src2idxwmap_ ) ( d.tokenizedsentence, &d.sentence , true );
99  d.oovwmap = src2idxwmap_->get_oovwmap();
100  LINFO ( "mapped:" << d.sentence );
101  } else d.sentence = d.tokenizedsentence;
102  ucam::util::trim_spaces ( d.sentence, &d.sentence );
103  if ( !USER_CHECK ( ucam::util::validate_source_sentence ( d.sentence ),
104  "Wrong sentence format, should be a sequence of numbers at this point!" ) ) {
105  FORCELINFO ( "Bad Sentence:" << d.sentence );
106  return true;
107  }
108  return false;
109  };
110 
111  private:
112 
113  ZDISALLOW_COPY_AND_ASSIGN ( PreProTask );
114 
115 };
116 
117 }
118 } // End namespaces
119 
120 #endif
void trim_spaces(const std::string &input, std::string *output)
Trims spaces at the edges (leaving none) and between words (leaving exactly one).
void reset_oov_id()
Resets oovid to lowest value.
Definition: wordmapper.hpp:154
Reads text file, performs tokenization and integer-mapping.
Definition: task.prepro.hpp:33
const std::string kPreproTokenizeEnable
#define LINFO(msg)
const std::string kPreproWordmapLoad
#define FORCELINFO(msg)
void tokenize(const std::string &is, std::string *os, const std::string languagespecific="")
Not implemented, just pass through.
PreProTask(const ucam::util::RegistryPO &rg, const std::string &wordmapkey=HifstConstants::kPreproWordmapLoad)
Constructor.
Definition: task.prepro.hpp:60
void setTokenize(bool tok)
Definition: task.prepro.hpp:74
Templated (hybrid) Interface for Task classes.
unordered_map< std::size_t, std::string > & get_oovwmap()
Return oovwmap.
Definition: wordmapper.hpp:145
bool exists(const std::string &source, const std::string &needle)
Convenience function to find out whether a needle exists in a text.
const std::string kPreproAddsentencemarkers
void addSentenceMarkers(std::string &sentence)
Adds sentence markers <s>, </s> to a sentence.
#define USER_CHECK(exp, comment)
Tests whether exp is true. If not, comment is printed and program ends.
Efficiently loads a wordmap file and provides methods to map word-to-integer or integer-to-word. To avoid memory footprint issues, the wordmap entries are not hashed.
Definition: wordmapper.hpp:63
bool run(Data &d)
Reads an input sentence, tokenizes and integer-maps.
Definition: task.prepro.hpp:81
const std::string kPreproTokenizeLanguage
Definition: bleu.hpp:14
bool validate_source_sentence(const std::string &s)
Checks whether the sentence is in format ^\d+( \d+)*$.