Cambridge SMT System
tokenizer.osr.hpp
Go to the documentation of this file.
1 // Licensed under the Apache License, Version 2.0 (the "License");
2 // you may not use these files except in compliance with the License.
3 // You may obtain a copy of the License at
4 //
5 // http://www.apache.org/licenses/LICENSE-2.0
6 //
7 // Unless required by applicable law or agreed to in writing, software
8 // distributed under the License is distributed on an "AS IS" BASIS,
9 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 // See the License for the specific language governing permissions and
11 // limitations under the License.
12 
13 // Copyright 2012 - Gonzalo Iglesias, Adrià de Gispert, William Byrne
22 #ifndef TOKENIZER_HPP
23 #define TOKENIZER_HPP
24 
25 namespace ucam {
26 namespace util {
27 
/// Placeholder tokenizer: no tokenization is performed, the input is copied
/// verbatim into the output string.
/// \param is                Text to "tokenize".
/// \param os                Destination string; overwritten with a copy of \p is.
/// \param languagespecific  Accepted for interface compatibility; not used.
inline void tokenize ( const std::string& is, std::string *os,
                       const std::string languagespecific = "" ) {
  ( void ) languagespecific;  // intentionally unused by this pass-through
  os->assign ( is );
}
33 
/// Placeholder detokenizer: no detokenization is performed, the input is
/// copied verbatim into the output string.
/// \param is                Text to "detokenize".
/// \param os                Destination string; overwritten with a copy of \p is.
/// \param languagespecific  Accepted for interface compatibility; not used.
inline void detokenize ( const std::string& is, std::string *os,
                         std::string languagespecific = "" ) {
  ( void ) languagespecific;  // intentionally unused by this pass-through
  std::string& target = *os;
  target = is;
}
39 
44 inline void addSentenceMarkers ( std::string& sentence ) {
45  trim_spaces ( sentence, &sentence );
46  if ( sentence.size() < 3 ) {
47  sentence = "<s> " + sentence + " </s>";
48  trim_spaces ( sentence, &sentence );
49  return;
50  } else if ( sentence.substr ( 0, 3 ) != "<s>" )
51  sentence = "<s> " + sentence;
52  if ( sentence.substr ( sentence.size() - 5, 4 ) != "</s>" )
53  sentence += " </s>";
54  trim_spaces ( sentence, &sentence );
55 };
56 
62 inline void deleteSentenceMarkers ( std::string& sentence ) {
63  boost::regex pattern1 ( "^\\s*1\\s|^\\s*<s>\\s+|^\\s*1\\s*$",
64  boost::regex_constants::icase | boost::regex_constants::perl );
65  boost::regex pattern2 ( "\\s+2\\s*$|\\s+</s>\\s*$|^\\s*2\\s*$|^\\s*</s>\\s*$",
66  boost::regex_constants::icase | boost::regex_constants::perl );
67  std::string replace ( "" );
68  sentence = boost::regex_replace ( sentence, pattern1, replace );
69  sentence = boost::regex_replace ( sentence, pattern2, replace );
70  trim_spaces ( sentence, &sentence );
71 }
72 
74 inline void capitalizeFirstWord ( std::vector<std::string>& words ) {
75  // Always capitalize first word, or second if first is "...
76  // Capitalize if previous is . or "
77  USER_CHECK ( words.size(),
78  "This function assumes non empty sequence of words!" );
79  words[0][0] = toupper ( words[0][0] );
80  for ( uint k = 1; k < words.size(); ++k ) {
81  if ( words[k - 1] == "\"" || words[k - 1] == "." ) {
82  words[k][0] = toupper ( words[k][0] );
83  }
84  }
85 };
86 
88 inline void capitalizeFirstWord ( std::string& words ) {
89  std::vector<std::string> w;
90  boost::algorithm::split ( w, words, boost::algorithm::is_any_of ( "_" ) );
91  capitalizeFirstWord ( w );
92  words = boost::algorithm::join ( w, " " );
93 };
94 
95 }
96 } // end namespaces
97 
98 #endif
void trim_spaces(const std::string &input, std::string *output)
Trims spaces at the edges (no spaces) and also between words (only one space)
void capitalizeFirstWord(std::vector< std::string > &words)
Simple function that capitalizes the first word, and the first word following a period or a double quote.
void detokenize(const std::string &is, std::string *os, std::string languagespecific="")
Not implemented, just pass through.
void tokenize(const std::string &is, std::string *os, const std::string languagespecific="")
Not implemented, just pass through.
void deleteSentenceMarkers(std::string &sentence)
Deletes sentence markers 1/2 or <s>/</s> for a sentence.
void addSentenceMarkers(std::string &sentence)
Adds sentence markers <s>, </s> to a sentence.
#define USER_CHECK(exp, comment)
Tests whether exp is true. If not, comment is printed and program ends.
Definition: bleu.hpp:14