Cambridge SMT System
hifst.task.prepro.gtest.cpp
Go to the documentation of this file.
1 // Licensed under the Apache License, Version 2.0 (the "License");
2 // you may not use these files except in compliance with the License.
3 // You may obtain a copy of the License at
4 //
5 // http://www.apache.org/licenses/LICENSE-2.0
6 //
7 // Unless required by applicable law or agreed to in writing, software
8 // distributed under the License is distributed on an "AS IS" BASIS,
9 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 // See the License for the specific language governing permissions and
11 // limitations under the License.
12 
13 // Copyright 2012 - Gonzalo Iglesias, Adrià de Gispert, William Byrne
14 
21 #include <openfst.h>
22 #include <googletesting.h>
23 
24 #ifndef GMAINTEST
25 #include "main.custom_assert.hpp"
26 #include "main.logger.hpp"
27 #endif
28 
29 #include "addresshandler.hpp"
30 #include "taskinterface.hpp"
31 
32 #include "fstutils.hpp"
33 
34 #include "tokenizer.osr.hpp"
35 #include "wordmapper.hpp"
36 
37 #include "task.prepro.hpp"
38 #include "data.stats.hpp"
39 
40 using boost::any_cast;
41 namespace bfs = boost::filesystem;
42 namespace uh = ucam::hifst;
43 namespace uf = ucam::fsttools;
44 
45 namespace googletesting {
46 
// Test-local data object handed to PreProTask::run(); the task reads the raw
// sentence from it and writes the tokenized / integer-mapped results back.
// NOTE(review): listing lines 47-48 (struct header and constructor name) were
// missing from the extracted listing. They are restored here from the usage
// `PreProTask<PreProTaskData>`, the visible initializer list and the direct
// member access in the tests (hence `struct`) -- confirm against upstream.
47 struct PreProTaskData {
48  PreProTaskData() :
49  sidx ( 0 ),
50  stats ( new uf::StatsData ) {
51  }
  // Sentence index; the constructor initialises it to 0.
52  uint sidx;
  // Raw input text; the tests assign it before running the task.
53  std::string originalsentence;
  // Tokenized form of the input; the tests read it after the task has run.
54  std::string tokenizedsentence;
  // Space-separated integer ids for the tokens; read after the task has run.
55  std::string sentence;
  // Id -> word for tokens that were not found in the wordmap (ids start at OOVID in the tests).
56  unordered_map<std::size_t, std::string> oovwmap;
  // StatsData instance allocated in the constructor and owned by this object.
57  boost::scoped_ptr<uf::StatsData> stats;
58 
  // Wordmap/Integer map objects, keyed by name. The tests store the address of a
  // stack-allocated WordMapper here, so these pointers are not owned by this struct.
60  unordered_map<std::string, uu::WordMapper *> wm;
61 
62 };
63 
64 #ifndef OSR
// Runs PreProTask once over a single English sentence, using a five-entry
// in-memory wordmap, and checks three results: the tokenized text, the
// integer-mapped sentence, and the ids assigned to out-of-vocabulary words.
// NOTE(review): listing lines 66-67 are absent from the extracted listing and
// their content cannot be determined from here -- confirm against upstream.
65 TEST ( HifstPrePro, basic_test ) {
  // Registry options: empty wordmap path and tokenizer language, tokenization "no".
68  unordered_map<std::string, boost::any> v;
69  v[kPreproWordmapLoad] = std::string ( "" );
70  v[kPreproTokenizeLanguage] = std::string ( "" );
71  v[kPreproTokenizeEnable] = std::string ("no");
72  const uu::RegistryPO rg ( v );
  // NOTE(review): `d` is declared as uh::PreProTaskData while the task below is
  // templated on the unqualified PreProTaskData -- verify both name the same type.
73  uh::PreProTaskData d;
74  d.originalsentence = "He's eating creamy creamy lovely potatoes.";
  // Wordmap fed from memory: one "word<TAB>id" pair per line. "creamy" and
  // "lovely" are deliberately left out so that they come back as OOVs.
75  stringstream ss;
76  ss << "he\t0\n";
77  ss << "'s\t1\n";
78  ss << "eating\t2\n";
79  ss << "potatoes\t3\n";
80  ss << ".\t4\n";
81  uu::iszfstream x ( ss );
82  uu::WordMapper wm ( x, true );
  // Register the wordmap by address; `wm` outlives the task, which is scoped below.
83  d.wm[kPreproWordmapLoad] = &wm;
  // The inner scope destroys the task before any result is inspected.
84  {
85  uu::PreProTask<PreProTaskData> t ( rg );
  // Tokenization is switched on explicitly, although the registry value above is "no".
86  t.setTokenize ( true );
87  t.run ( d );
88  }
  // Expected tokenization: lower-cased, "'s" and the final "." split off.
89  EXPECT_EQ ( d.tokenizedsentence,
90  "he 's eating creamy creamy lovely potatoes ." );
  // Both occurrences of "creamy" map to OOVID; "lovely" gets the next id, OOVID + 1.
91  EXPECT_EQ ( d.sentence, "0 1 2 " + toString ( OOVID ) + " " + toString (
92  OOVID ) + " " + toString ( OOVID + 1 ) + " 3 4" );
  // The OOV table must map each assigned id back to its surface word.
93  EXPECT_EQ ( d.oovwmap[OOVID], "creamy" );
94  EXPECT_EQ ( d.oovwmap[OOVID + 1], "lovely" );
95 };
96 
97 #endif
98 
// Checks that addSentenceMarkers wraps a sentence in "<s> ... </s>" in place:
// a single word, the empty string (markers separated by one space), and a
// sentence with leading/trailing blanks (expected without the extra blanks).
// NOTE(review): listing lines 102, 105 and 108 were dropped by the extraction,
// which left every EXPECT_EQ comparing against an unmodified string. They are
// restored as calls to addSentenceMarkers (void addSentenceMarkers(std::string &));
// the `uu::` qualifier is assumed from the file's other uu:: uses -- confirm.
100 TEST ( stringutil, addsentencemarkers ) {
101  std::string x = "a";
102  uu::addSentenceMarkers ( x );
103  EXPECT_EQ ( x, "<s> a </s>" );
104  x = "";
105  uu::addSentenceMarkers ( x );
106  EXPECT_EQ ( x, "<s> </s>" );
107  x = " it is time to fly ";
108  uu::addSentenceMarkers ( x );
109  EXPECT_EQ ( x, "<s> it is time to fly </s>" );
110 }
111 };
112 
113 #ifndef GMAINTEST
114 
// Stand-alone test entry point; it sits inside the surrounding
// `#ifndef GMAINTEST` guard, so it is compiled only when GMAINTEST is not defined.
115 int main ( int argc, char **argv ) {
  // GoogleTest receives argc/argv by address/value so it can process its own flags.
116  ::testing::InitGoogleTest ( &argc, argv );
  // The value returned by RUN_ALL_TESTS() becomes the process exit status.
117  return RUN_ALL_TESTS();
118 }
119 #endif
Handles simple wildcard expansion for strings.
Unit testing: google testing common header.
Relative to Stats across the pipeline.
std::string toString(const T &x, uint pr=2)
Converts an arbitrary type to string. Converts to string integers, floats, doubles. Quits execution if ...
const std::string kPreproTokenizeEnable
const std::string kPreproWordmapLoad
Lower casing/Tokenization/Detokenization not available for open source release.
Interfaces with basic methods for iteration.
TEST(FstIo, basic_test)
Definition: fstio.gtest.cpp:38
test-specific classes and functions
Definition: fstio.gtest.cpp:34
Static variables for logger. Include only once from main file.
Utilities to extract vocabulary, pseudo-determinize lattices and build substring transducers.
boost::scoped_ptr< uf::StatsData > stats
#define OOVID
void addSentenceMarkers(std::string &sentence)
Adds sentence markers <s>, </s> to a sentence.
unordered_map< std::size_t, std::string > oovwmap
Loads efficiently a wordmap file and provides methods to map word-to-integer or integer-to-word. To avoid memory footprint issues, hashing the wordmap entries is avoided.
Definition: wordmapper.hpp:63
class WordMapper
Describes class PreProTask, which preprocesses (tokenizes and maps to integers with WordMapper) sourc...
int main(int argc, char **argv)
unordered_map< std::string, uu::WordMapper * > wm
Wordmap/Integer map objects.
const std::string kPreproTokenizeLanguage
Wrapper stream class that reads pipes, text files or gzipped files.
Definition: szfstream.hpp:34
Unit testing: google testing common header.
Static variable for custom_assert. Include only once from main file.