29 inline void tokenize (
const std::string& is, std::string *os,
30 const std::string languagespecific =
"" ) {
35 inline void detokenize (
const std::string& is, std::string *os,
36 std::string languagespecific =
"" ) {
46 if ( sentence.size() < 3 ) {
47 sentence =
"<s> " + sentence +
" </s>";
50 }
else if ( sentence.substr ( 0, 3 ) !=
"<s>" )
51 sentence =
"<s> " + sentence;
52 if ( sentence.substr ( sentence.size() - 5, 4 ) !=
"</s>" )
63 boost::regex pattern1 (
"^\\s*1\\s|^\\s*<s>\\s+|^\\s*1\\s*$",
64 boost::regex_constants::icase | boost::regex_constants::perl );
65 boost::regex pattern2 (
"\\s+2\\s*$|\\s+</s>\\s*$|^\\s*2\\s*$|^\\s*</s>\\s*$",
66 boost::regex_constants::icase | boost::regex_constants::perl );
67 std::string replace (
"" );
68 sentence = boost::regex_replace ( sentence, pattern1, replace );
69 sentence = boost::regex_replace ( sentence, pattern2, replace );
78 "This function assumes non empty sequence of words!" );
79 words[0][0] = toupper ( words[0][0] );
80 for ( uint k = 1; k < words.size(); ++k ) {
81 if ( words[k - 1] ==
"\"" || words[k - 1] ==
"." ) {
82 words[k][0] = toupper ( words[k][0] );
89 std::vector<std::string> w;
90 boost::algorithm::split ( w, words, boost::algorithm::is_any_of (
"_" ) );
92 words = boost::algorithm::join ( w,
" " );
void trim_spaces(const std::string &input, std::string *output)
Trims spaces at the edges (leaving none) and also between words (leaving only one space).
void capitalizeFirstWord(std::vector< std::string > &words)
Simple function that capitalizes the first word and the first word of each sentence (any word that follows a "." or a double-quote token).
void detokenize(const std::string &is, std::string *os, std::string languagespecific="")
Not implemented, just pass through.
void tokenize(const std::string &is, std::string *os, const std::string languagespecific="")
Not implemented, just pass through.
void deleteSentenceMarkers(std::string &sentence)
Deletes sentence markers 1/2 or <s>/</s> for a sentence.
void addSentenceMarkers(std::string &sentence)
Adds sentence markers <s>, </s> to a sentence.
#define USER_CHECK(exp, comment)
Tests whether exp is true. If not, comment is printed and program ends.