Cambridge SMT System
bleu.hpp
Go to the documentation of this file.
1 #ifndef FSTTOOLS_BLEU_HPP
2 #define FSTTOOLS_BLEU_HPP
3 
4 #include <iostream>
5 #include <boost/iostreams/device/file_descriptor.hpp>
6 #include <boost/iostreams/stream_buffer.hpp>
7 #include <boost/thread/mutex.hpp>
8 #include <unordered_map>
9 #include <boost/functional/hash.hpp>
10 
11 typedef boost::iostreams::stream_buffer<boost::iostreams::file_descriptor_sink> pipe_out;
12 typedef boost::iostreams::stream_buffer<boost::iostreams::file_descriptor_source> pipe_in;
13 
14 namespace ucam {
15 namespace fsttools {
16 
17 // parameter vector
18 typedef std::vector<float> PARAMS32;
19 typedef unsigned Sid;
20 // word and sentence typedefs
21 typedef long long Wid;
22 typedef std::vector<Wid> SentenceIdx;
23 
24 
25 class Bleu {
26  public:
27  Bleu ( const double bleu = -std::numeric_limits<double>::infinity(),
28  const double brev = 0 ) : m_bleu ( bleu ), m_brev ( brev ) {}
29  inline double GetError() const {
30  return m_bleu;
31  }
32 
33  double m_bleu;
34  double m_brev;
35 };
36 
37 std::ostream& operator<< ( std::ostream& o, const Bleu& b ) {
38  o << b.m_bleu << " (" << b.m_brev << ")";
39  return o;
40 }
41 
42 bool operator> ( Bleu& b1, Bleu& b2 ) {
43  return b1.m_bleu > b2.m_bleu;
44 }
45 
46 class BleuStats {
47  public:
48  static const unsigned int MAX_BLEU_ORDER = 4;
49  std::vector<int> tots_; // order-specific ngram totals
50  std::vector<int> hits_; // order-specific ngram hits
51  long refLength_; // effective reference length
52 
53  BleuStats () : refLength_ ( 0 ) {
54  tots_.resize ( MAX_BLEU_ORDER, 0 );
55  hits_.resize ( MAX_BLEU_ORDER, 0 );
56  }
57 
58  BleuStats ( std::vector<int> const &tots
59  , std::vector<int> const &hits
60  , long const refLength ) :
61  tots_ ( tots ),
62  hits_ ( hits ),
63  refLength_ ( refLength )
64  {}
65 };
66 
67 std::ostream& operator<< ( std::ostream& o, const BleuStats& b ) {
68  for ( unsigned int n = 0; n < BleuStats::MAX_BLEU_ORDER; ++n ) {
69  if ( n > 0 ) {
70  o << "; ";
71  }
72  o << b.hits_[n] << "/" << b.tots_[n];
73  }
74  o << "; " << b.refLength_;
75  return o;
76 }
77 
78 BleuStats operator+ ( const BleuStats& bs1, const BleuStats& bs2 ) {
79  BleuStats bs;
80  bs.refLength_ = bs1.refLength_ + bs2.refLength_;
81 
82  for ( unsigned int n = 0; n < BleuStats::MAX_BLEU_ORDER; ++n ) {
83  bs.hits_[n] = bs1.hits_[n] + bs2.hits_[n];
84  bs.tots_[n] = bs1.tots_[n] + bs2.tots_[n];
85  }
86  return bs;
87 }
88 
89 BleuStats operator- ( const BleuStats& bs1, const BleuStats& bs2 ) {
90  BleuStats bs;
91  bs.refLength_ = bs1.refLength_ - bs2.refLength_;
92 
93  for ( unsigned int n = 0; n < BleuStats::MAX_BLEU_ORDER; ++n ) {
94  bs.tots_[n] = bs1.tots_[n] - bs2.tots_[n];
95  bs.hits_[n] = bs1.hits_[n] - bs2.hits_[n];
96  }
97  return bs;
98 }
99 
100 class LRUCache {
101  typedef std::pair< SentenceIdx, BleuStats > EntryPair;
102  typedef std::list< EntryPair > CacheList;
103  typedef unordered_map< SentenceIdx, CacheList::iterator, boost::hash<SentenceIdx> > CacheMap;
104 
105 public:
106  LRUCache(unsigned int cacheSize=10000): cacheSize_(cacheSize), entries(0){};
107 
108  void insert(const SentenceIdx& hyp, const BleuStats& bs) {
109  // push it to the front;
110  cacheList.push_front(std::make_pair(hyp, bs));
111  // add it to the cache map
112  cacheMap[hyp] = cacheList.begin();
113  // increase count of entries
114  entries++;
115  // check if it's time to remove the last element
116  if (entries > cacheSize_) {
117  // erase from the map the last cache list element
118  cacheMap.erase(cacheList.back().first);
119  // erase it from the list
120  cacheList.pop_back();
121  // decrease count
122  entries--;
123  }
124  }
125 
126  bool contains(const SentenceIdx& hyp) const {
127  CacheMap::const_iterator it = cacheMap.find(hyp);
128  return it != cacheMap.end();
129  }
130 
131  bool get(const SentenceIdx& hyp, BleuStats& bs) {
132  CacheMap::const_iterator it = cacheMap.find(hyp);
133  if (it == cacheMap.end()) {
134  return false;
135  }
136  EntryPair entry = *(it->second);
137  cacheList.erase(it->second);
138  cacheList.push_front(entry);
139  cacheMap[hyp] = cacheList.begin();
140  bs = entry.second;
141  return true;
142  }
143 
144  BleuStats get(const SentenceIdx& hyp) {
145  CacheMap::const_iterator it = cacheMap.find(hyp);
146  if (it == cacheMap.end()) {
147  BleuStats zero;
148  return zero;
149  }
150  EntryPair entry = *(it->second);
151  cacheList.erase(it->second);
152  cacheList.push_front(entry);
153  cacheMap[hyp] = cacheList.begin();
154  return entry.second;
155  }
156 
157 private:
158  unsigned int cacheSize_;
159  unsigned int entries;
160  CacheList cacheList;
161  CacheMap cacheMap;
162 };
163 
164 
165 class BleuScorer {
166 public:
167  typedef std::vector<Wid> NGram;
168  typedef unordered_map<NGram, unsigned int,
171  std::vector<NGramToCountMap> refCounts;
172  std::vector< std::vector<unsigned int> > refLengths;
173 
174  BleuScorer(std::string const & refFiles, std::string const & extTokCmd,
175  unsigned int const & cacheSize, bool const & intRefs,
176  std::string const & wordMapFile) :
177  chits_(0), cmisses_(0), intRefs_(intRefs){
178  useCache_ = (cacheSize > 0);
179  externalTokenizer_ = (extTokCmd.size() > 0);
180  useWidMap_ = false;
181  if (refFiles == "") {
182  FORCELINFO("Must provide reference files(s)");
183  exit(EXIT_FAILURE);
184  }
185  if (wordMapFile != "" ) {
186  FORCELINFO ("Loading word map file...");
187  ucam::util::iszfstream f (wordMapFile);
188  unsigned id;
189  std::string word;
190  oovId_ = 0;
191  while (f >> word >> id) {
192  widMap_[id] = word;
193  refWordMap_[word] = id;
194  if (id > oovId_)
195  oovId_ = id + 100;
196  }
197  FORCELINFO ("Loaded " << widMap_.size() << " symbols");
198  useWidMap_=true;
199  }
200  if (!useWidMap_ && !intRefs_) {
201  FORCELINFO("Must provide a wordmap for use with word references");
202  exit(1);
203  }
204  // -- pipez
205  if (externalTokenizer_) {
206  FORCELINFO("Starting external tokenizer");
207  int fd[2];
208  OpenPipe(fd, extTokCmd);
209  pipe_in *pIn = new pipe_in();
210  pipe_out *pOut = new pipe_out();
211  pOut->open( boost::iostreams::file_descriptor_sink(fd[0],
212  boost::iostreams::close_handle));
213  pIn->open( boost::iostreams::file_descriptor_source(fd[1],
214  boost::iostreams::close_handle));
215  normalIn = new std::istream(pIn);
216  intOut = new std::ostream(pOut);
217  }
218  LoadReferences(refFiles);
219  if (useCache_) {
220  bleuStatsCache.resize( refCounts.size() );
221  FORCELINFO("bleu stats cache size: " << cacheSize << " x " << refCounts.size());
222  }
223  }
224 
225  void LoadReferences(std::string const & refFiles) {
226  FORCELINFO("Processing reference(s) " << refFiles );
227  std::vector<std::string> fv;
228  boost::split(fv, refFiles, boost::is_any_of(","));
229  int nr;
230  for (int k = 0; k < fv.size(); k++) {
231  std::string refFile = fv[k];
232  FORCELINFO("Processing reference " << refFile );
233  std::ifstream ifs ( refFile.c_str() );
234  std::string line;
235  int nrk = 0;
236  while ( getline ( ifs, line ) ) {
237  SentenceIdx sidx;
238  sidx = intRefs_ ? LoadIntRef(line) : LoadWordRef(line);
239  NGramToCountMap ngc = ScanNGrams ( sidx );
240  if ( k == 0 ) {
241  std::vector<unsigned int> l;
242  l.push_back(sidx.size());
243  refLengths.push_back(l);
244  refCounts.push_back(ngc);
245  } else {
246  refLengths[nrk].push_back( sidx.size() );
247  for (NGramToCountMap::const_iterator it = ngc.begin(); it != ngc.end(); it++) {
248  NGramToCountMap::const_iterator it2 = refCounts[nrk].find(it->first);
249  if (it2 == refCounts[k].end()) {
250  refCounts[nrk][it->first] = it->second;
251  } else {
252  refCounts[nrk][it->first] = std::max(refCounts[nrk][it->first], it->second);
253  }
254  }
255  }
256  nrk++;
257  }
258  ifs.close();
259  if (k==0)
260  nr = nrk;
261  if (nr != nrk) {
262  FORCELINFO("unequal number of reference sentences");
263  exit ( EXIT_FAILURE );
264  }
265  }
266  if (nr == 0) {
267  FORCELINFO("Unable to load any references.");
268  exit(EXIT_FAILURE);
269  }
270  FORCELINFO("refLengths.size() " << refLengths.size());
271  FORCELINFO("refCounts.size() " << refCounts.size() );
272  }
273 
274  string CacheStats() {
275  std::ostringstream os;
276  os << "BleuStats Cache Stats: Cache Hits=" << chits_ << "; Cache Misses=" << cmisses_ << "; Rate=";
277  os.precision(3);
278  os << (float) chits_ / (float) (chits_ + cmisses_);
279  return os.str();
280  }
281 
282  unsigned int ClosestReferenceLength ( Sid sid,
283  const unsigned int hypLength ) const {
284  unsigned int rD = std::numeric_limits<unsigned int>::max();
285  unsigned int rL;
286  for (unsigned int k=0; k<refLengths[sid].size(); k++) {
287  unsigned int d = abs( (int) refLengths[sid][k] - (int) hypLength);
288  if (d < rD || d == rD && rL > refLengths[sid][k]) {
289  rD = d;
290  rL = refLengths[sid][k];
291  }
292  }
293  return rL;
294  }
295 
296  BleuStats SentenceBleuStats ( const Sid sid, const SentenceIdx& hypIdx ) {
297  BleuStats bs;
298  if (useCache_ && bleuStatsCache[sid].get(hypIdx, bs)) {
299  chits_++;
300  return bs;
301  }
302  cmisses_++;
303  SentenceIdx hyp = (externalTokenizer_) ? HypExternalTokenizer(hypIdx) : hypIdx;
304  bs.refLength_ = ClosestReferenceLength ( sid, hyp.size() );
305  for ( unsigned int n = 0; n < BleuStats::MAX_BLEU_ORDER
306  && n < hyp.size(); ++n ) {
307  NGramToCountMap hypCounts;
308  if ( hyp.size() > n ) {
309  bs.tots_[n] = hyp.size() - n;
310  }
311  for ( unsigned int i = 0; i < hyp.size() - n; ++i ) {
312  hypCounts[SubStr ( hyp, i, n )]++;
313  }
314  for ( NGramToCountMap::const_iterator hit = hypCounts.begin();
315  hit != hypCounts.end(); ++hit ) {
316  NGramToCountMap::const_iterator rit = refCounts[sid].find ( hit->first );
317  bs.hits_[n] += std::min ( rit == refCounts[sid].end() ? 0 : rit->second,
318  hit->second ); // Sum clipped counts
319  }
320  }
321  if (useCache_)
322  bleuStatsCache[sid].insert(hypIdx, bs);
323  return bs;
324  }
325 
326  Bleu ComputeBleu ( const BleuStats& bs ) {
327  double logBleu = 0.0;
328  double logBrev = 0.0;
329 
330  for ( unsigned int n = 0; n < BleuStats::MAX_BLEU_ORDER; ++n ) {
331  logBleu += std::log ( ( double ) bs.hits_[n] / ( double ) bs.tots_[n] );
332  }
333  logBleu *= 1 / ( double ) BleuStats::MAX_BLEU_ORDER;
334  logBrev = std::min ( 0.0, 1 - bs.refLength_ / ( double ) ( bs.tots_[0] ) );
335  return Bleu ( exp ( logBleu + logBrev ), exp ( logBrev ) );
336  }
337 
338  Bleu ComputeSBleu ( const BleuStats& bs ) {
339  double logBleu = 0.0;
340  double logBrev = 0.0;
341 
342  for ( unsigned int n = 0; n < BleuStats::MAX_BLEU_ORDER; ++n ) {
343  logBleu += std::log ( ( double ) (bs.hits_[n] + 1.0) / ( double ) (bs.tots_[n] + 1.0) );
344  }
345  logBleu *= 1 / ( double ) BleuStats::MAX_BLEU_ORDER;
346  logBrev = std::min ( 0.0, 1 - (bs.refLength_ + 1.0)/ ( double ) ( bs.tots_[0] + 1.0 ) );
347  return Bleu ( exp ( logBleu + logBrev ), exp ( logBrev ) );
348  }
349 
350 
351  NGram SubStr ( const SentenceIdx& s, const unsigned int n,
352  const unsigned int l ) const {
353  return NGram ( s.begin() + n, s.begin() + n + l + 1 );
354  }
355 
356  NGramToCountMap ScanNGrams ( SentenceIdx const &ref ) const {
357  NGramToCountMap m;
358 
359  for ( unsigned int n = 0; n < BleuStats::MAX_BLEU_ORDER; ++n ) {
360  // needs to be an int, not a unsigned int in case it is negative
361  int refssize_minus_n = ref.size() - n;
362  for ( int i = 0; i < refssize_minus_n; ++i ) {
363  NGram u = SubStr ( ref, i, n );
364  ++m[u];
365  }
366  }
367  return m;
368  }
369 
370 
371 private:
372  unordered_map<Wid, std::string> widMap_;
373  unordered_map<std::string, Wid> refWordMap_;
374  Wid oovId_;
375  bool externalTokenizer_;
376  bool useWidMap_;
377  bool intRefs_;
378  boost::mutex mutex;
379  vector< LRUCache > bleuStatsCache;
380  unsigned int chits_;
381  unsigned int cmisses_;
382  bool useCache_;
383 
384  // n.b. not to be multithreaded - references are loaded only once by main
385  // load in references in integer format.
386  SentenceIdx LoadIntRef(std::string const & s) {
387  SentenceIdx rv;
388  std::istringstream is(s);
389  Wid w;
390  while (is >> w)
391  rv.push_back(w);
392  return rv;
393  }
394 
395  // VESTIGIAL. References in integer format are piped through the external
396  // tokenizer prior and then mapped to integers.
397  // Rather than include this here, references should be processed prior
398  // to running lmert, etc.
399  // n.b. not to be multithreaded - references are loaded only once by main
400  SentenceIdx LoadIntRefExternalTokenizer(std::string const &s) {
401  string si;
402  (*intOut) << s << endl;
403  getline(*normalIn, si);
404  std::istringstream is(si);
405  SentenceIdx rs;
406  string w;
407  while (is >> w) {
408  unordered_map<std::string, Wid>::iterator it = refWordMap_.find(w);
409  if (it == refWordMap_.end()) {
410  rs.push_back(oovId_);
411  refWordMap_[w] = oovId_++;
412  } else {
413  rs.push_back(it->second);
414  }
415  }
416  return rs;
417  }
418 
419  // read in ascii refs that are already correctly tokenized
420  // for use with external tokenization of hyps
421  // n.b. not to be multithreaded - references are loaded only once by main
422  SentenceIdx LoadWordRef(std::string const &s) {
423  std::istringstream is(s);
424  SentenceIdx rs;
425  string w;
426  while (is >> w) {
427  unordered_map<std::string, Wid>::iterator it = refWordMap_.find(w);
428  if (it == refWordMap_.end()) {
429  rs.push_back(oovId_);
430  widMap_[oovId_] = w;
431  refWordMap_[w] = oovId_;
432  oovId_++;
433  } else {
434  rs.push_back(it->second);
435  }
436  }
437  return rs;
438  }
439 
440  // returns wordid if word is in references; oov otherwise
441  // ok for threading
442  SentenceIdx HypExternalTokenizer(SentenceIdx const &s) {
443  if (s.size() == 0)
444  return s;
445  std::ostringstream os;
446  if (useWidMap_) {
447  os << ((widMap_.find(s[0])==widMap_.end())?"#OOV#":widMap_[s[0]]);
448  for (int i=1; i<s.size(); i++)
449  os << " " << ((widMap_.find(s[i])==widMap_.end())?"#OOV#":widMap_[s[i]]);
450  } else {
451  os << s[0];
452  for (int i=1; i<s.size(); i++)
453  os << " " << s[i];
454  }
455  string si;
456  mutex.lock();
457  (*intOut) << os.str() << endl;
458  getline(*normalIn, si);
459  mutex.unlock();
460  std::istringstream is(si);
461  SentenceIdx rs;
462  if (useWidMap_) {
463  string w;
464  while (is >> w) {
465  unordered_map<std::string, Wid>::iterator it = refWordMap_.find(w);
466  if (it == refWordMap_.end()) {
467  rs.push_back(oovId_);
468  } else {
469  rs.push_back(it->second);
470  }
471  }
472  } else {
473  Wid w;
474  while (is >> w) {
475  rs.push_back(w);
476  }
477  }
478  return rs;
479  }
480 
481  void OpenPipe(int *fd, string command) {
482  int outfd[2];
483  int infd[2];
484  int oldstdin, oldstdout;
485  int rs1 = pipe(outfd); // Where the parent is going to write to
486  int rs2 = pipe(infd); // From where parent is going to read
487  oldstdin = dup(0); // Save current stdin
488  oldstdout = dup(1); // Save stdout
489  close(0);
490  close(1);
491  dup2(outfd[0], 0); // Make the read end of outfd pipe as stdin
492  dup2(infd[1], 1); // Make the write end of infd as stdout
493  if (!fork()) {
494  close(outfd[0]); // Not required for the child
495  close(outfd[1]);
496  close(infd[0]);
497  close(infd[1]);
498  char * cstr = new char[command.size() + 1];
499  strcpy(cstr, command.c_str());
500  char * const argv[] = { (char *) "/bin/bash", (char *) "-c", cstr, (char *) 0 };
501  int err = execv(argv[0], argv);
502  dup2(oldstdout, 1);
503  FORCELINFO("Cannot run command: " << command);
504  exit(1);
505  } else {
506  close(0); // Restore the original std fds of parent
507  close(1);
508  dup2(oldstdin, 0);
509  dup2(oldstdout, 1);
510  close(outfd[0]); // These are being used by the child
511  close(infd[1]);
512  fd[0] = outfd[1];
513  fd[1] = infd[0];
514  }
515  }
516 
517  std::istream* normalIn;
518  std::ostream* intOut;
519 };
520 
521 }
522 } // end namespaces
523 #endif
std::vector< NGramToCountMap > refCounts
Definition: bleu.hpp:171
NGramToCountMap ScanNGrams(SentenceIdx const &ref) const
Definition: bleu.hpp:356
std::vector< Wid > SentenceIdx
Definition: bleu.hpp:22
BleuScorer(std::string const &refFiles, std::string const &extTokCmd, unsigned int const &cacheSize, bool const &intRefs, std::string const &wordMapFile)
Definition: bleu.hpp:174
bool contains(const SentenceIdx &hyp) const
Definition: bleu.hpp:126
HashFVec< std::vector< long long > > hashfvecint64
boost::iostreams::stream_buffer< boost::iostreams::file_descriptor_source > pipe_in
Definition: bleu.hpp:12
void insert(const SentenceIdx &hyp, const BleuStats &bs)
Definition: bleu.hpp:108
boost::iostreams::stream_buffer< boost::iostreams::file_descriptor_sink > pipe_out
Definition: bleu.hpp:11
#define FORCELINFO(msg)
unordered_map< NGram, unsigned int, ucam::util::hashfvecint64, ucam::util::hasheqvecint64 > NGramToCountMap
Definition: bleu.hpp:170
std::vector< int > hits_
Definition: bleu.hpp:50
BleuStats SentenceBleuStats(const Sid sid, const SentenceIdx &hypIdx)
Definition: bleu.hpp:296
std::vector< Wid > NGram
Definition: bleu.hpp:167
double GetError() const
Definition: bleu.hpp:29
bool operator>(Bleu &b1, Bleu &b2)
Definition: bleu.hpp:42
BleuStats operator-(const BleuStats &bs1, const BleuStats &bs2)
Definition: bleu.hpp:89
iszfstream & getline(iszfstream &izs, std::string &line)
Definition: szfstream.hpp:178
Bleu ComputeSBleu(const BleuStats &bs)
Definition: bleu.hpp:338
long long Wid
Definition: bleu.hpp:21
std::vector< int > tots_
Definition: bleu.hpp:49
std::vector< float > PARAMS32
Definition: bleu.hpp:18
unsigned Sid
Definition: bleu.hpp:19
LRUCache(unsigned int cacheSize=10000)
Definition: bleu.hpp:106
NGram SubStr(const SentenceIdx &s, const unsigned int n, const unsigned int l) const
Definition: bleu.hpp:351
BleuStats(std::vector< int > const &tots, std::vector< int > const &hits, long const refLength)
Definition: bleu.hpp:58
std::vector< std::vector< unsigned int > > refLengths
Definition: bleu.hpp:172
Bleu(const double bleu=-std::numeric_limits< double >::infinity(), const double brev=0)
Definition: bleu.hpp:27
void LoadReferences(std::string const &refFiles)
Definition: bleu.hpp:225
std::vector< Wid > NGram
Definition: MertCommon.h:47
unsigned int ClosestReferenceLength(Sid sid, const unsigned int hypLength) const
Definition: bleu.hpp:282
Bleu ComputeBleu(const BleuStats &bs)
Definition: bleu.hpp:326
static const unsigned int MAX_BLEU_ORDER
Definition: bleu.hpp:48
BleuStats operator+(const BleuStats &bs1, const BleuStats &bs2)
Definition: bleu.hpp:78
Wrapper stream class that reads pipes, text files or gzipped files.
Definition: szfstream.hpp:34
std::ostream & operator<<(std::ostream &o, const Bleu &b)
Definition: bleu.hpp:37
Definition: bleu.hpp:14