Cambridge SMT System
task.postpro.hpp
Go to the documentation of this file.
1 // Licensed under the Apache License, Version 2.0 (the "License");
2 // you may not use these files except in compliance with the License.
3 // You may obtain a copy of the License at
4 //
5 // http://www.apache.org/licenses/LICENSE-2.0
6 //
7 // Unless required by applicable law or agreed to in writing, software
8 // distributed under the License is distributed on an "AS IS" BASIS,
9 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 // See the License for the specific language governing permissions and
11 // limitations under the License.
12 
13 // Copyright 2012 - Gonzalo Iglesias, Adrià de Gispert, William Byrne
14 
15 #ifndef TEXTOUTPUTTASK_HPP
16 #define TEXTOUTPUTTASK_HPP
17 
24 namespace ucam {
25 namespace hifst {
26 
31 template <class Data, class Arc = fst::StdArc>
33 
34  //Private variables are shown here. Private methods go after public methods
35  private:
36 
38  ucam::util::WordMapper *trgidx2wmap_;
39 
40  const std::string wordmapkey_;
41 
43  bool usewordmap_;
44 
46  bool detokenize_;
47 
49  std::string detokenizationlanguage_;
50 
51  const std::string inputkey_;
52 
53  //Systematically capitalize the first word.
54  bool capitalizeFirstWord_;
55 
56  public:
59  const std::string& inputkey = HifstConstants::kPostproInput,
60  const std::string& wordmapkey = HifstConstants::kPostproWordmapLoad
61  )
62  : inputkey_ ( inputkey )
63  , wordmapkey_ ( wordmapkey )
64  , trgidx2wmap_ ( NULL )
65  , detokenize_ ( rg.getBool ( HifstConstants::kPostproDetokenizeEnable ) )
66  , detokenizationlanguage_ ( rg.exists (
67  HifstConstants::kPostproDetokenizeLanguage) ? rg.get<std::string>
69  , capitalizeFirstWord_ (rg.getBool (
71 
72  {
73  LDEBUG ( "Constructor ready..." );
74  };
75 
// Overwrites the detokenize_ flag with `detok`. run() consults this flag to
// decide whether the word-mapped 1-best is passed through
// ucam::util::detokenize.
77  inline void setDetokenize ( bool detok ) {
78  detokenize_ = detok;
79  };
80 
// Post-processes the lattice stored under inputkey_ in d.fsts and stores the
// resulting string in *d.translation.
//
// Steps visible in this listing:
//   1. Check that d.fsts[inputkey_] and d.translation are non-NULL.
//   2. If d.wm holds a wordmap under wordmapkey_, apply it to the 1-best
//      string, then optionally detokenize and capitalize the result.
//   3. Otherwise use the 1-best string as is.
//
// Returns true when either USER_CHECK evaluates to false, and false once
// *d.translation has been assigned.
// NOTE(review): what the true/false return value means to the caller is not
// visible here -- confirm against the task interface.
86  bool run ( Data& d ) {
87  if ( !USER_CHECK ( d.fsts[inputkey_] != NULL,
88  "translation lattice not initialized?" ) ) return true;
89  if ( !USER_CHECK ( d.translation != NULL,
90  "d.translation not initialized?" ) ) return true;
// Build a local VectorFst<Arc> from the lattice pointer stored in d.fsts.
91  fst::VectorFst<Arc> ofst ( * (static_cast< fst::VectorFst<Arc> *>
92  (d.fsts[inputkey_]) ) );
93  std::string text;
// NOTE(review): no visible statement assigns to `text` or reads `ofst`, and
// the listing numbering jumps from 93 to 95. A statement appears to be missing
// here -- presumably the FstGetBestStringHypothesis call that extracts the
// 1-best from ofst into text. Confirm against the original header.
95  LINFO ( "1best is " << text );
96  std::string detokutext;
// Use the wordmap registered under wordmapkey_ if there is one; otherwise
// reset the cached pointer so the unmapped string is used below.
97  if ( d.wm.find ( wordmapkey_ ) != d.wm.end() )
98  trgidx2wmap_ = d.wm[wordmapkey_];
99  else trgidx2wmap_ = NULL;
100  if ( trgidx2wmap_ ) {
101  std::string utext;
102  trgidx2wmap_->set_oovwmap ( d.oovwmap );
// Apply the mapper to `text`, writing the mapped string into utext.
// NOTE(review): the meaning of the trailing `false` argument is not visible
// in this file -- confirm against WordMapper.
103  ( *trgidx2wmap_ ) ( text, &utext , false );
104  LINFO ( "(unmapped) 1best is:" << utext );
105  //Take out 1 and 2 if they exist
// NOTE(review): no statement follows the comment above in this listing
// (numbering jumps from 105 to 107). Presumably a deleteSentenceMarkers call
// on utext belongs here -- confirm against the original header.
107  if ( detokenize_ ) {
108  ucam::util::detokenize ( utext, &detokutext , detokenizationlanguage_ );
109  LINFO ( "1best (detok) is:" << detokutext );
110  } else detokutext = utext;
111  if ( capitalizeFirstWord_ ) {
112  ucam::util::capitalizeFirstWord ( detokutext );
113  }
// No wordmap available: the 1-best string is used without mapping,
// detokenization or capitalization.
114  } else detokutext = text;
115  FORCELINFO ( "Translation 1best is: " << detokutext );
116  *d.translation = detokutext;
117  return false;
118  };
119 
120  private:
121 
122  ZDISALLOW_COPY_AND_ASSIGN ( PostProTask );
123 
124 };
125 
126 }
127 } // end namespaces
128 
129 #endif
void FstGetBestStringHypothesis(const fst::VectorFst< Arc > &latfst, std::string &hyp)
Definition: fstutils.hpp:229
const std::string kPostproDetokenizeLanguage
#define LINFO(msg)
void capitalizeFirstWord(std::vector< std::string > &words)
Simple function that capitalizes the first word and the first word of a sentence.
void setDetokenize(bool detok)
Turn on/off detokenization.
void set_oovwmap(unordered_map< std::size_t, std::string > &oovmap)
Definition: wordmapper.hpp:150
void detokenize(const std::string &is, std::string *os, std::string languagespecific="")
Not implemented, just pass through.
#define FORCELINFO(msg)
Task that writes translation to a text file. This translation might be recased, wordmapped and tokeni...
#define LDEBUG(msg)
const std::string kPostproWordmapLoad
const std::string kPostproInput
PostProTask(const ucam::util::RegistryPO &rg, const std::string &inputkey=HifstConstants::kPostproInput, const std::string &wordmapkey=HifstConstants::kPostproWordmapLoad)
Constructor with ucam::util::RegistryPO object and keys to access lattice and wordmap.
Templated (hybrid) Interface for Task classes.
void deleteSentenceMarkers(std::string &sentence)
Deletes sentence markers 1/2 or <s>/</s> for a sentence.
bool exists(const std::string &source, const std::string &needle)
Convenience function to find out whether a needle exists in a text.
bool run(Data &d)
Writes 1-best to file. Optionally, recases, maps back to words, and detokenizes.
const std::string kPostproDetokenizeEnable
const std::string kPostproCapitalizefirstwordEnable
#define USER_CHECK(exp, comment)
Tests whether exp is true. If not, comment is printed and program ends.
Efficiently loads a wordmap file and provides methods to map word-to-integer or integer-to-word. To avoid memory footprint issues, hashing the wordmap entries is avoided.
Definition: wordmapper.hpp:63
Definition: bleu.hpp:14