40 using boost::any_cast;
41 namespace bfs = boost::filesystem;
56 unordered_map<std::size_t, std::string>
oovwmap;
57 boost::scoped_ptr<uf::StatsData>
stats;
60 unordered_map<std::string, uu::WordMapper *>
wm;
65 TEST ( HifstPrePro, basic_test ) {
68 unordered_map<std::string, boost::any> v;
74 d.originalsentence =
"He's eating creamy creamy lovely potatoes.";
79 ss <<
"potatoes\t3\n";
85 uu::PreProTask<PreProTaskData> t ( rg );
86 t.setTokenize (
true );
89 EXPECT_EQ ( d.tokenizedsentence,
90 "he 's eating creamy creamy lovely potatoes ." );
93 EXPECT_EQ ( d.oovwmap[
OOVID],
"creamy" );
94 EXPECT_EQ ( d.oovwmap[
OOVID + 1],
"lovely" );
100 TEST ( stringutil, addsentencemarkers ) {
103 EXPECT_EQ ( x,
"<s> a </s>" );
106 EXPECT_EQ ( x,
"<s> </s>" );
107 x =
" it is time to fly ";
109 EXPECT_EQ ( x,
"<s> it is time to fly </s>" );
115 int main (
int argc,
char **argv ) {
116 ::testing::InitGoogleTest ( &argc, argv );
117 return RUN_ALL_TESTS();
Handles simple wildcard expansion for strings.
Unit testing: google testing common header.
Relative to Stats across the pipeline.
std::string toString(const T &x, uint pr=2)
Converts an arbitrary type to string. Converts to string integers, floats, doubles. Quits execution if ...
const std::string kPreproTokenizeEnable
std::string tokenizedsentence
const std::string kPreproWordmapLoad
Lower casing/Tokenization/Detokenization not available for open source release.
Interfaces with basic methods for iteration.
test-specific classes and functions
Static variables for logger. Include only once from main file.
Utilities to extract vocabulary, pseudo-determinize lattices and build substring transducers.
boost::scoped_ptr< uf::StatsData > stats
void addSentenceMarkers(std::string &sentence)
Adds sentence markers <s>, </s> to a sentence.
std::string originalsentence
unordered_map< std::size_t, std::string > oovwmap
Efficiently loads a wordmap file and provides methods to map word-to-integer or integer-to-word. To avoid memory footprint issues, hashing the wordmap entries is avoided.
Describes class PreProTask, which preprocesses (tokenizes and maps to integers with WordMapper) sourc...
int main(int argc, char **argv)
unordered_map< std::string, uu::WordMapper * > wm
Wordmap/Integer map objects.
const std::string kPreproTokenizeLanguage
Wrapper stream class that reads pipes, text files or gzipped files.
Unit testing: google testing common header.
Static variable for custom_assert. Include only once from main file.