#ifndef FSTTOOLS_BLEU_HPP
#define FSTTOOLS_BLEU_HPP

#include <boost/iostreams/device/file_descriptor.hpp>
#include <boost/iostreams/stream_buffer.hpp>
#include <boost/thread/mutex.hpp>
#include <unordered_map>
#include <boost/functional/hash.hpp>

typedef boost::iostreams::stream_buffer<boost::iostreams::file_descriptor_sink>
    pipe_out;
typedef boost::iostreams::stream_buffer<boost::iostreams::file_descriptor_source>
    pipe_in;

typedef long long Wid;
  Bleu ( const double bleu = -std::numeric_limits<double>::infinity(),
         const double brev = 0 ) :
    m_bleu ( bleu ),
    m_brev ( brev ) {}
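  // Note (illustrative, not from the original header): a default-constructed
  // Bleu carries -infinity, i.e. "no score yet", so any real score compares
  // higher. For example:
  //   Bleu unset;              // m_bleu == -inf, m_brev == 0
  //   Bleu b ( 0.31, 0.97 );   // hypothetical BLEU / brevity-penalty pair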
  // Per-order n-gram statistics: tots_[n] counts hypothesis (n+1)-grams,
  // hits_[n] counts those matched (clipped) against the reference.
  static const unsigned int MAX_BLEU_ORDER = 4;

    tots_.resize ( MAX_BLEU_ORDER, 0 );
    hits_.resize ( MAX_BLEU_ORDER, 0 );
  // Constructor from precomputed n-gram statistics and the matched reference length.
              , std::vector<int> const &hits
              , long const refLength ) :
    refLength_ ( refLength )
  typedef std::pair< SentenceIdx, BleuStats > EntryPair;
  typedef std::list< EntryPair > CacheList;
  typedef unordered_map< SentenceIdx, CacheList::iterator,
                         boost::hash<SentenceIdx> > CacheMap;
  LRUCache ( unsigned int cacheSize = 10000 ) : cacheSize_(cacheSize), entries(0) {}

  // Insertion path: new entries go to the front of the LRU list; once the
  // cache holds more than cacheSize_ entries, the least recently used entry
  // (the back of the list) is evicted.
    cacheList.push_front(std::make_pair(hyp, bs));
    cacheMap[hyp] = cacheList.begin();
    if (entries > cacheSize_) {
      cacheMap.erase(cacheList.back().first);
      cacheList.pop_back();
  // Membership check: true iff the hypothesis is currently cached.
    CacheMap::const_iterator it = cacheMap.find(hyp);
    return it != cacheMap.end();
  // Two lookup variants follow; both refresh the matched entry by moving it
  // to the front of the LRU list (the caller below uses get(hypIdx, bs)).
    CacheMap::const_iterator it = cacheMap.find(hyp);
    if (it == cacheMap.end()) {
    EntryPair entry = *(it->second);
    cacheList.erase(it->second);
    cacheList.push_front(entry);
    cacheMap[hyp] = cacheList.begin();

    CacheMap::const_iterator it = cacheMap.find(hyp);
    if (it == cacheMap.end()) {
    EntryPair entry = *(it->second);
    cacheList.erase(it->second);
    cacheList.push_front(entry);
    cacheMap[hyp] = cacheList.begin();
  unsigned int cacheSize_;
  unsigned int entries;
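  // Usage sketch (illustrative only; mirrors how BleuScorer drives the cache
  // below, with computeStats standing in for the real n-gram counting):
  //   LRUCache cache ( 10000 );
  //   BleuStats bs;
  //   if ( !cache.get ( hyp, bs ) ) {   // miss: compute and remember
  //     bs = computeStats ( hyp );
  //     cache.insert ( hyp, bs );
  //   }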
typedef unordered_map< NGram,
                       unsigned int,
  BleuScorer ( std::string const & refFiles, std::string const & extTokCmd,
               unsigned int const & cacheSize, bool const & intRefs,
               std::string const & wordMapFile ) :
    chits_(0), cmisses_(0), intRefs_(intRefs) {
    useCache_ = (cacheSize > 0);
    externalTokenizer_ = (extTokCmd.size() > 0);
    if (refFiles == "") {
      FORCELINFO("Must provide reference file(s)");
    if (wordMapFile != "" ) {
      while (f >> word >> id) {
        widMap_[id] = word;      // id -> word (assumed here: the reverse map whose size is logged below)
        refWordMap_[word] = id;  // word -> id, used to map references and hypotheses
      }
      FORCELINFO ("Loaded " << widMap_.size() << " symbols");
    if (!useWidMap_ && !intRefs_) {
      FORCELINFO("Must provide a wordmap for use with word references");
    if (externalTokenizer_) {
      OpenPipe(fd, extTokCmd);
      pOut->open( boost::iostreams::file_descriptor_sink(fd[0],
                  boost::iostreams::close_handle));
      pIn->open( boost::iostreams::file_descriptor_source(fd[1],
                 boost::iostreams::close_handle));
      normalIn = new std::istream(pIn);
      intOut = new std::ostream(pOut);
    LoadReferences(refFiles);
    bleuStatsCache.resize( refCounts.size() );
    FORCELINFO("bleu stats cache size: " << cacheSize << " x " << refCounts.size());
    FORCELINFO("Processing reference(s) " << refFiles );
    std::vector<std::string> fv;
    boost::split(fv, refFiles, boost::is_any_of(","));
    for (int k = 0; k < fv.size(); k++) {
      std::string refFile = fv[k];
      FORCELINFO("Processing reference " << refFile );
      std::ifstream ifs ( refFile.c_str() );
      while ( getline ( ifs, line ) ) {
        sidx = intRefs_ ? LoadIntRef(line) : LoadWordRef(line);
        NGramToCountMap ngc = ScanNGrams ( sidx );
        // first reference file: start new length/count entries for this sentence
        std::vector<unsigned int> l;
        l.push_back(sidx.size());
        refLengths.push_back(l);
        refCounts.push_back(ngc);
        // additional reference file: extend the length list and keep, for each
        // n-gram, the maximum count seen across references (clipping counts)
        refLengths[nrk].push_back( sidx.size() );
        for (NGramToCountMap::const_iterator it = ngc.begin(); it != ngc.end(); it++) {
          NGramToCountMap::const_iterator it2 = refCounts[nrk].find(it->first);
          if (it2 == refCounts[nrk].end()) {
            refCounts[nrk][it->first] = it->second;
          } else {
            refCounts[nrk][it->first] = std::max(refCounts[nrk][it->first], it->second);
          }
        FORCELINFO("unequal number of reference sentences");
        exit ( EXIT_FAILURE );
    FORCELINFO("refLengths.size() " << refLengths.size());
    FORCELINFO("refCounts.size() " << refCounts.size() );
    std::ostringstream os;
    os << "BleuStats Cache Stats: Cache Hits=" << chits_
       << "; Cache Misses=" << cmisses_ << "; Rate=";
    os << (float) chits_ / (float) (chits_ + cmisses_);
  // ClosestReferenceLength: the reference length for sentence sid that is
  // closest to hypLength, with ties resolved in favour of the shorter reference.
                              const unsigned int hypLength ) const {
    unsigned int rD = std::numeric_limits<unsigned int>::max();
    for ( unsigned int k = 0; k < refLengths[sid].size(); k++ ) {
      unsigned int d = abs ( (int) refLengths[sid][k] - (int) hypLength );
      if ( d < rD || ( d == rD && rL > refLengths[sid][k] ) ) {
        rL = refLengths[sid][k];
    // On a cache hit, reuse the previously computed statistics.
    if (useCache_ && bleuStatsCache[sid].get(hypIdx, bs)) {

    SentenceIdx hyp = (externalTokenizer_) ? HypExternalTokenizer(hypIdx) : hypIdx;
    bs.refLength_ = ClosestReferenceLength ( sid, hyp.size() );
    for ( unsigned int n = 0; n < BleuStats::MAX_BLEU_ORDER && n < hyp.size(); ++n ) {
      NGramToCountMap hypCounts;
      if ( hyp.size() > n ) {
        bs.tots_[n] = hyp.size() - n;   // number of (n+1)-grams in the hypothesis
        for ( unsigned int i = 0; i < hyp.size() - n; ++i ) {
          hypCounts[SubStr ( hyp, i, n )]++;
        }
        // clipped matches: each hypothesis n-gram counts at most as often as
        // it appears in the reference
        for ( NGramToCountMap::const_iterator hit = hypCounts.begin();
              hit != hypCounts.end(); ++hit ) {
          NGramToCountMap::const_iterator rit = refCounts[sid].find ( hit->first );
          bs.hits_[n] += std::min ( rit == refCounts[sid].end() ? 0 : rit->second,
                                    hit->second );
        }
      }
    }
    bleuStatsCache[sid].insert(hypIdx, bs);
    double logBleu = 0.0;
    double logBrev = 0.0;
    for ( unsigned int n = 0; n < BleuStats::MAX_BLEU_ORDER; ++n ) {
      logBleu += std::log ( (double) bs.hits_[n] / (double) bs.tots_[n] );
    }
    logBleu *= 1 / (double) BleuStats::MAX_BLEU_ORDER;
    logBrev = std::min ( 0.0, 1 - bs.refLength_ / (double) ( bs.tots_[0] ) );
    return Bleu ( exp ( logBleu + logBrev ), exp ( logBrev ) );
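    // In formula form (illustrative summary of the code above), with
    // N = MAX_BLEU_ORDER = 4:
    //   BLEU = BP * exp( (1/N) * sum_{n=1..N} log( hits_n / tots_n ) )
    //   BP   = exp( min( 0, 1 - refLength / hypLength ) )
    // where tots_[0], the unigram total, doubles as the hypothesis length.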
    double logBleu = 0.0;
    double logBrev = 0.0;
    for ( unsigned int n = 0; n < BleuStats::MAX_BLEU_ORDER; ++n ) {
      logBleu += std::log ( (double) (bs.hits_[n] + 1.0) / (double) (bs.tots_[n] + 1.0) );
    }
    logBleu *= 1 / (double) BleuStats::MAX_BLEU_ORDER;
    logBrev = std::min ( 0.0, 1 - (bs.refLength_ + 1.0) / (double) ( bs.tots_[0] + 1.0 ) );
    return Bleu ( exp ( logBleu + logBrev ), exp ( logBrev ) );
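    // The smoothed variant above adds one to every hit and total (and to the
    // lengths in the brevity penalty), so hypotheses with zero higher-order
    // matches still receive a finite, non-zero score instead of log(0).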
  // Return the (l+1)-gram of s starting at position n.
  NGram SubStr ( const SentenceIdx& s, const unsigned int n,
                 const unsigned int l ) const {
    return NGram ( s.begin() + n, s.begin() + n + l + 1 );
  }
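  // Example (illustrative): for s = {7, 8, 9, 10}, SubStr(s, 1, 1) returns the
  // bigram {8, 9}; ComputeBleuStats and ScanNGrams call SubStr(x, i, n) for
  // every start position i at each order n.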
  // Counts the n-grams of a reference sentence for each order n up to
  // BleuStats::MAX_BLEU_ORDER.
  NGramToCountMap ScanNGrams ( SentenceIdx const &ref ) const {
      int refssize_minus_n = ref.size() - n;
      for ( int i = 0; i < refssize_minus_n; ++i ) {
        NGram u = SubStr ( ref, i, n );
  unordered_map<Wid, std::string> widMap_;
  unordered_map<std::string, Wid> refWordMap_;
  bool externalTokenizer_;
  std::vector< LRUCache > bleuStatsCache;
  unsigned int cmisses_;
  SentenceIdx LoadIntRef(std::string const & s) {
    std::istringstream is(s);
  SentenceIdx LoadIntRefExternalTokenizer(std::string const &s) {
    (*intOut) << s << std::endl;
    std::istringstream is(si);
      unordered_map<std::string, Wid>::iterator it = refWordMap_.find(w);
      if (it == refWordMap_.end()) {
        rs.push_back(oovId_);
        refWordMap_[w] = oovId_++;
      } else {
        rs.push_back(it->second);
  SentenceIdx LoadWordRef(std::string const &s) {
    std::istringstream is(s);
      unordered_map<std::string, Wid>::iterator it = refWordMap_.find(w);
      if (it == refWordMap_.end()) {
        rs.push_back(oovId_);
        refWordMap_[w] = oovId_;
      } else {
        rs.push_back(it->second);
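  // HypExternalTokenizer below round-trips a hypothesis through the external
  // tokenizer: ids are turned back into words via widMap_ (unknown ids become
  // "#OOV#"), the text is written to the child process via intOut, and the
  // tokenized result is re-mapped to ids through refWordMap_.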
  SentenceIdx HypExternalTokenizer(SentenceIdx const &s) {
    std::ostringstream os;
      os << ((widMap_.find(s[0]) == widMap_.end()) ? "#OOV#" : widMap_[s[0]]);
      for (int i = 1; i < s.size(); i++)
        os << " " << ((widMap_.find(s[i]) == widMap_.end()) ? "#OOV#" : widMap_[s[i]]);
      for (int i = 1; i < s.size(); i++)
    (*intOut) << os.str() << std::endl;
    std::istringstream is(si);
      unordered_map<std::string, Wid>::iterator it = refWordMap_.find(w);
      if (it == refWordMap_.end()) {
        rs.push_back(oovId_);
      } else {
        rs.push_back(it->second);
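  // OpenPipe below connects fd to a child process running
  // "/bin/bash -c <command>": fd[0] becomes the descriptor the parent writes
  // to (the child's stdin) and fd[1] the descriptor it reads from (the
  // child's stdout). A minimal sketch of this classic pipe/fork/exec pattern
  // (illustrative only, not the original implementation; error handling and
  // closing of unused descriptors omitted):
  //
  //   int outfd[2], infd[2];
  //   pipe(outfd);                        // parent -> child
  //   pipe(infd);                         // child  -> parent
  //   if (fork() == 0) {                  // child
  //     dup2(outfd[0], 0);                // read commands on stdin
  //     dup2(infd[1], 1);                 // write results to stdout
  //     execv("/bin/bash", argv);         // argv = {"/bin/bash", "-c", cmd, 0}
  //   }
  //   fd[0] = outfd[1];                   // parent's write end
  //   fd[1] = infd[0];                    // parent's read end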
  void OpenPipe(int *fd, std::string command) {
    int oldstdin, oldstdout;
    int rs1 = pipe(outfd);
    int rs2 = pipe(infd);
    char * cstr = new char[command.size() + 1];
    strcpy(cstr, command.c_str());
    char * const argv[] = { (char *) "/bin/bash", (char *) "-c", cstr, (char *) 0 };
    // execv only returns on failure, so reaching the next line means the
    // command could not be started.
    int err = execv(argv[0], argv);
    FORCELINFO("Cannot run command: " << command);
  std::istream* normalIn;
  std::ostream* intOut;
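  // normalIn / intOut wrap the pipe buffers opened in the constructor (pIn and
  // pOut): intOut carries hypotheses to the external tokenizer, normalIn is
  // the matching read side for its output.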