Cambridge SMT System
hifst.task.postpro.gtest.cpp
Go to the documentation of this file.
1 // Licensed under the Apache License, Version 2.0 (the "License");
2 // you may not use these files except in compliance with the License.
3 // You may obtain a copy of the License at
4 //
5 // http://www.apache.org/licenses/LICENSE-2.0
6 //
7 // Unless required by applicable law or agreed to in writing, software
8 // distributed under the License is distributed on an "AS IS" BASIS,
9 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 // See the License for the specific language governing permissions and
11 // limitations under the License.
12 
13 // Copyright 2012 - Gonzalo Iglesias, Adrià de Gispert, William Byrne
14 
21 #include <openfst.h>
22 #include <googletesting.h>
23 
24 #ifndef GMAINTEST
25 #include "main.custom_assert.hpp"
26 #include "main.logger.hpp"
27 #endif
28 
29 #include "addresshandler.hpp"
30 #include "taskinterface.hpp"
31 
32 #include "fstutils.hpp"
33 
34 #include "tokenizer.osr.hpp"
35 #include "wordmapper.hpp"
36 
37 #include "task.postpro.hpp"
38 
39 using boost::any_cast;
40 namespace bfs = boost::filesystem;
41 namespace uh = ucam::hifst;
42 
43 namespace googletesting {
44 
// Data object for the post-processing tests; the test below fills fields with
// these names (translation, oovwmap, fsts, wm) before calling run().
// NOTE(review): the listing's numbering jumps from 44 to 47, so the struct and
// constructor header lines are not visible here -- confirm the type name
// against the original file.
// The constructor initializer list zeroes sidx and nulls translation.
47  sidx ( 0 ),
48  translation ( NULL ) {
49  }
// presumably a sentence index -- TODO confirm; initialized to 0 above.
50  uint sidx;
// Integer id -> word string; the test registers entries at OOVID and OOVID + 1.
51  unordered_map<std::size_t, std::string> oovwmap;
// Non-owning lattice pointers looked up by string key; the test stores the
// address of a stack-allocated fst under "postpro.input".
52  unordered_map<std::string, fst::VectorFst<fst::StdArc> *> fsts;
// Pointer to the translation string; NULL until the caller assigns one. The
// test allocates it before run() and compares its contents afterwards.
53  std::string *translation;
54 
// Word mapper pointers looked up by string key; the test registers the address
// of a local WordMapper under kPostproWordmapLoad.
56  unordered_map<std::string, uu::WordMapper *> wm;
57 
58 };
59 
60 #ifndef OSR
// End-to-end check of the post-processing task: a single-path lattice of word
// ids (including out-of-vocabulary ids) must come out as a detokenized,
// capitalized sentence.
61 TEST ( HifstPostPro, basic_test ) {
// Build a linear acceptor with 9 states (0..8): start state 0, final state 8.
64  fst::VectorFst<fst::StdArc> aux;
65  aux.AddState();
66  aux.AddState();
67  aux.AddState();
68  aux.AddState();
69  aux.AddState();
70  aux.AddState();
71  aux.AddState();
72  aux.AddState();
73  aux.AddState();
74  aux.SetStart ( 0 );
75  aux.SetFinal ( 8, fst::StdArc::Weight::One() );
// One arc per state, each with identical input/output label, weight 0, and
// the next state as last argument. The path spells the label sequence
// 1 2 3 OOVID OOVID OOVID+1 4 5.
76  aux.AddArc ( 0, fst::StdArc ( 1, 1, fst::StdArc::Weight ( 0 ), 1 ) );
77  aux.AddArc ( 1, fst::StdArc ( 2, 2, fst::StdArc::Weight ( 0 ), 2 ) );
78  aux.AddArc ( 2, fst::StdArc ( 3, 3, fst::StdArc::Weight ( 0 ), 3 ) );
79  aux.AddArc ( 3, fst::StdArc ( OOVID, OOVID, fst::StdArc::Weight ( 0 ), 4 ) );
80  aux.AddArc ( 4, fst::StdArc ( OOVID, OOVID, fst::StdArc::Weight ( 0 ), 5 ) );
81  aux.AddArc ( 5, fst::StdArc ( OOVID + 1, OOVID + 1, fst::StdArc::Weight ( 0 ),
82  6 ) );
83  aux.AddArc ( 6, fst::StdArc ( 4, 4, fst::StdArc::Weight ( 0 ), 7 ) );
84  aux.AddArc ( 7, fst::StdArc ( 5, 5, fst::StdArc::Weight ( 0 ), 8 ) );
// In-memory wordmap, one "word<TAB>id" entry per line, covering ids 0..5.
85  std::stringstream ss;
86  ss << "epsilon\t0\n";
87  ss << "he\t1\n";
88  ss << "'s\t2\n";
89  ss << "eating\t3\n";
90  ss << "potatoes\t4\n";
91  ss << ".\t5\n";
// Wrap the string stream so the WordMapper below can read from it.
92  uu::iszfstream x ( ss );
93  //Prepare RegistryPO object.
// NOTE(review): this key is kPostproWordmapperLoad, while the word mapper is
// registered under kPostproWordmapLoad further down -- confirm both constants
// exist and that the difference is intended.
94  unordered_map<std::string, boost::any> v;
95  v[kPostproWordmapperLoad] = std::string ( "" );
96  v[kPostproDetokenizeEnable] = std::string ("yes");
97  v[kPostproDetokenizeLanguage] = std::string ( "" );
98  const uu::RegistryPO rg ( v );
// Task data: output string, the two OOV id -> word entries, the input lattice
// and the word mapper. The fst and mapper are stack objects passed by address.
99  uh::PostProTaskData d;
100  d.translation = new std::string;
101  d.oovwmap[OOVID] = "creamy";
102  d.oovwmap[OOVID + 1] = "lovely";
103  d.fsts["postpro.input"] = &aux;
104  uu::WordMapper wm ( x );
105  d.wm[kPostproWordmapLoad] = &wm;
// NOTE(review): the listing's numbering jumps from 106 to 108, so the
// declaration of `t` is not visible -- presumably the task object built from
// `rg`; confirm against the original file.
106  {
108  t.setDetokenize ( true );
109  t.run ( d );
110  }
// Ids 1 2 3 map through the wordmap, the OOV ids through d.oovwmap, then 4 5;
// the expected text is capitalized and has "'s" and "." attached.
111  EXPECT_EQ ( *d.translation, "He's eating creamy creamy lovely potatoes." );
// The string was allocated with new above, so release it here.
112  delete d.translation;
// NOTE(review): the ';' after the closing brace is an empty declaration.
113 };
114 
115 #endif
116 
// Checks sentence-marker removal for both the integer form (1 ... 2) and the
// textual form (<s> ... </s>), including inputs that hold only the markers.
// NOTE(review): the listing's numbering skips 120, 123, 126 and 129, one line
// between each assignment and its EXPECT_EQ. Presumably each is a call to
// deleteSentenceMarkers on `s`; confirm against the original file, since as
// listed the assertions compare `s` with a value it was never given.
118 TEST ( hifstpostpro, deletesentencemarkers ) {
// Integer markers around one token: only "3" should remain.
119  std::string s = "1 3 2";
121  EXPECT_EQ ( s, "3" );
// Textual markers around one word: only "hola" should remain.
122  s = "<s> hola </s>";
124  EXPECT_EQ ( s, "hola" );
// Textual markers with nothing between them: expect the empty string.
125  s = "<s> </s>";
127  EXPECT_EQ ( s, "" );
// Integer markers with nothing between them: expect the empty string.
128  s = "1 2";
130  EXPECT_EQ ( s, "" );
131 }
132 
133 };
134 
135 #ifndef GMAINTEST
136 
// Stand-alone test entry point, compiled only when GMAINTEST is not defined.
// Passes the command line to GoogleTest, then returns the value of
// RUN_ALL_TESTS() as the process exit code.
137 int main ( int argc, char **argv ) {
// argc is passed by address so GoogleTest can modify it.
138  ::testing::InitGoogleTest ( &argc, argv );
139  return RUN_ALL_TESTS();
140 }
141 #endif
Handles simple wildcard expansion for strings.
Unit testing: google testing common header.
unordered_map< std::size_t, std::string > oovwmap
unordered_map< std::string, fst::VectorFst< fst::StdArc > * > fsts
const std::string kPostproDetokenizeLanguage
void setDetokenize(bool detok)
Turn on/off tokenization.
Task that writes translation to a text file. This translation might be recased, wordmapped and tokeni...
Lower casing/Tokenization/Detokenization not available for open source release.
const std::string kPostproWordmapLoad
Interfaces with basic methods for iteration.
TEST(FstIo, basic_test)
Definition: fstio.gtest.cpp:38
test-specific classes and functions
Definition: fstio.gtest.cpp:34
void deleteSentenceMarkers(std::string &sentence)
Deletes sentence markers 1/2 or <s>/</s> for a sentence.
Static variables for logger. Include only once from main file.
Utilities to extract vocabulary, pseudo-determinize lattices and build substring transducers.
bool run(Data &d)
Writes 1-best to file. Optionally, recases, maps back to words, and detokenizes.
unordered_map< std::string, uu::WordMapper * > wm
Wordmap/Integer map objects.
#define OOVID
int main(int argc, char **argv)
Task that writes translation to a text file. This translation might be wordmapped and tokenized...
const std::string kPostproDetokenizeEnable
Loads efficiently a wordmap file and provides methods to map word-to-integer or integer-to-word. To avoid memory footprint issues, hashing the wordmap entries is avoided.
Definition: wordmapper.hpp:63
class WordMapper
Wrapper stream class that reads pipes, text files or gzipped files.
Definition: szfstream.hpp:34
Unit testing: google testing common header.
Static variable for custom_assert. Include only once from main file.