// Default constructor: the body is empty, so nothing is initialised explicitly here.
// NOTE(review): the leading integer looks like a line number left over from a
// source listing, and the ';' after the body is a redundant empty declaration.
14 RefsData::RefsData() {};
// Constructs the reference statistics from the reference sentences `refs`.
// NOTE(review): the leading integers look like line numbers left over from a
// source listing and several original lines are absent from this excerpt, so
// the comments below describe only the statements that are visible.
18 RefsData::RefsData (
const std::vector<Sentence>& refs) {
// One n-gram count table per reference sentence.
20 std::vector<NGramToCountMap> tmpCounts (refs.size() );
// Pass 1: walk every reference and extract its n-grams.
21 for (
unsigned int k = 0; k < refs.size(); ++k) {
// Stored in a signed int, presumably so that a reference shorter than the
// n-gram order yields a negative bound and the inner loop runs zero times
// (the right-hand operand of the subtraction is cut off in this excerpt).
22 int refssize_minus_n = refs[k].size() -
24 for (
int i = 0; i < refssize_minus_n; ++i) {
// n-gram of reference k starting at position i; `n` is declared on a line not shown.
25 NGram u = SubStr (refs[k], i, n);
// Pass 2: for every n-gram already present in refCounts, keep the largest
// count found in any single reference. How refCounts is first populated is
// not visible in this excerpt.
30 for (
unsigned int k = 0; k < refs.size(); ++k) {
31 for (NGramToCountMap::iterator it = refCounts.begin();
32 it != refCounts.end(); ++it) {
// tmpCounts[k][...] goes through operator[]; if NGramToCountMap is an
// unordered_map this inserts a zero entry whenever reference k lacks the
// n-gram - TODO confirm that the extra insertions are acceptable.
33 it->second = max (it->second, tmpCounts[k][it->first]);
// Pass 3: record the size of every reference, then sort the sizes
// (no comparator is passed; presumably std::sort, i.e. ascending order).
37 for (
unsigned int k = 0; k < refs.size(); ++k) {
38 refLengths.push_back (refs[k].size() );
40 sort (refLengths.begin(), refLengths.end() );
// Fragment of a routine that fills the statistics object `bs` for the
// hypothesis `hyp`; presumably the body of RefsData::ComputeBleuStats, whose
// signature line is not part of this excerpt - TODO confirm.
// NOTE(review): the leading integers look like listing line numbers and some
// original lines (including the declarations of bs and hypCounts) are missing.
// Reference length reported for this hypothesis: the one ClosestReferenceLength
// selects for hyp.size().
45 bs.
refLength = ClosestReferenceLength (hyp.size() );
// n is a zero-based order index: SubStr (hyp, i, n) is called with n as the
// "length minus one" argument. The loop also stops once n reaches hyp.size().
46 for (
unsigned int n = 0; n < BleuStats::MAX_BLEU_ORDER && n < hyp.size();
// Number of n-gram positions in the hypothesis for this order. The loop
// condition guarantees n < hyp.size(), so the unsigned subtraction cannot wrap.
50 bs.
tots[n] = hyp.size() - n;
// Count every n-gram of this order occurring in the hypothesis.
52 for (
unsigned int i = 0; i < hyp.size() - n; ++i) {
53 hypCounts[SubStr (hyp, i, n)]++;
// Accumulate matches: each hypothesis n-gram contributes the minimum of the
// reference count (0 when the n-gram is absent from refCounts) and a second
// operand that is cut off in this excerpt.
55 for (NGramToCountMap::const_iterator hit = hypCounts.begin();
56 hit != hypCounts.end(); ++hit) {
57 NGramToCountMap::const_iterator rit = refCounts.find (hit->first);
58 bs.
hits[n] += min (rit == refCounts.end() ? 0 : rit->second,
// Returns the slice of `s` that starts at index `n` and contains `l + 1`
// elements, i.e. the iterator range [s.begin() + n, s.begin() + n + l + 1).
//   s - sentence to slice
//   n - index of the first element of the slice
//   l - slice length minus one (l == 0 yields a single element)
// No bounds check is visible here, so callers must ensure n + l < s.size().
// NOTE(review): the leading integers look like listing line numbers, and the
// closing brace of the function is on a line missing from this excerpt.
65 NGram RefsData::SubStr (
const Sentence& s,
const unsigned int n,
66 const unsigned int l)
const {
67 return NGram (s.begin() + n, s.begin() + n + l + 1);
// Scans refLengths for the entry whose absolute difference from `hypLength`
// is smallest and remembers it in the local refLength.
//   hypLength - length of the hypothesis to compare against
// NOTE(review): the leading integers look like listing line numbers; the
// return statement and closing braces are on lines missing from this excerpt
// (presumably the local refLength is returned - TODO confirm).
70 unsigned int RefsData::ClosestReferenceLength (
71 const unsigned int hypLength)
const {
// Stays 0 if refLengths is empty.
72 unsigned int refLength = 0;
// Start from the largest possible distance so the first entry always wins.
73 unsigned int refDistance = numeric_limits<unsigned int>::max();
74 for (
unsigned int k = 0; k < refLengths.size(); ++k) {
// Both lengths are cast to int before subtracting so the difference can be
// negative before abs() is applied.
75 unsigned int distance = abs ( (
int) refLengths[k] - (
int) hypLength);
// Strict '<': on a tie the entry that comes first in refLengths is kept.
76 if (distance < refDistance) {
77 refDistance = distance;
78 refLength = refLengths[k];
// Fragment of a routine that serialises refCounts to the stream `oss`;
// presumably the body of ToString (const char separator), whose signature
// line is not part of this excerpt - TODO confirm.
// NOTE(review): the leading integers look like listing line numbers; the
// declarations of `oss` and `u` and the body of the inner loop are missing.
// Visit every (n-gram, count) entry of refCounts.
86 for (NGramToCountMap::const_iterator it = refCounts.begin();
87 it != refCounts.end(); ++it) {
89 unsigned int count = it->second;
// Loop over the elements of `u`. NOTE(review): the signed index `w` is
// compared against u.size(); if that is an unsigned size type this is a
// signed/unsigned comparison.
90 for (
int w = 0; w < u.size(); ++w) {
// After the entry's elements: a tab, the count, then the caller's separator.
96 oss <<
'\t' << count << separator;
// Reads reference files into memory: the result holds one inner vector of
// strings per entry of `files`.
// NOTE(review): the leading integers look like listing line numbers; the end
// of the parameter list, the open-failure test and the line-reading loop are
// on lines missing from this excerpt.
101 std::vector<std::vector<std::string> >
LoadRefFiles (std::vector<std::string>
103 tracer <<
"loading and initializing reference ngrams...\n";
// refsStr[r] will hold the lines read from files[r].
104 std::vector<std::vector<std::string> > refsStr (files.size() );
105 for (
unsigned int r = 0; r < files.size(); ++r) {
106 std::ifstream ifs (files[r].c_str() );
// Diagnostic for a file that cannot be opened; the condition guarding it and
// whatever follows the message are not visible here.
108 cerr <<
"ERROR: unable to open file " << files[r] <<
'\n';
// An empty string is stored before any line of the file, presumably so that
// the first real line lands at index 1 - TODO confirm.
112 refsStr[r].push_back (
"");
114 refsStr[r].push_back (line);
// Trace the number of entries minus one (the leading empty string is not
// counted); the statement continues beyond the end of this fragment.
117 tracer <<
"loaded " << refsStr[r].size() - 1 <<
" translations from " 126 std::vector<std::vector<std::string> > refsStr =
LoadRefFiles (files);
// The line above first carries the tail of a trace statement that belongs to
// the preceding routine; from "126" on it starts a routine that fills refsData
// from the loaded reference files, presumably LoadRefData, whose signature
// line is not part of this excerpt - TODO confirm.
// NOTE(review): the leading integers look like listing line numbers, and the
// lines that turn each string into the sentence `s` are missing.
// Slot 0 of refsData is taken by RefsData::dummy, so entries built in the
// loop below are appended starting at index 1.
127 refsData.push_back (RefsData::dummy);
// One iteration per reference sentence, using the first file's entry count
// minus one as the bound.
// NOTE(review): refsStr[0] is read unconditionally in the visible code; with
// an empty `files` this indexes an empty vector - verify against callers.
128 for (
unsigned int sid = 0; sid < refsStr[0].size() - 1 ; ++sid) {
// Collects the references for this sentence, one per file.
129 std::vector<Sentence> refsIdx;
130 for (
unsigned int r = 0; r < files.size(); ++r) {
// Entry sid + 1 of file r is read. This assumes every file has at least as
// many entries as the first one - TODO confirm.
131 std::istringstream iss (refsStr[r][sid + 1]);
137 refsIdx.push_back (s);
// Build the statistics for this sentence from all of its references.
139 refsData.push_back (
RefsData (refsIdx) );
// Fragment of a routine that returns the statistics of hypothesis `h` against
// the references stored for sentence id `sid`; presumably the body of
// ComputeErrorStats, whose signature line is not part of this excerpt - TODO confirm.
// NOTE(review): the leading integers look like listing line numbers, and the
// statements that follow the error message are missing.
// True exactly when sid is not a valid index into refsData.
145 if (sid + 1 > refsData.size() ) {
146 cerr <<
"ERROR: no references loaded for sentence s=" << sid <<
'\n';
// Delegates to the RefsData entry stored at index sid.
149 return refsData[sid].ComputeBleuStats (h);
unordered_map< NGram, unsigned int, hashfvecint64, hasheqvecint64 > NGramToCountMap
void LoadRefData(vector< string >)
std::vector< Wid > Sentence
iszfstream & getline(iszfstream &izs, std::string &line)
static const unsigned int MAX_BLEU_ORDER
BleuStats ComputeBleuStats(const Sentence &hyp) const
ErrorStats ComputeErrorStats(Sid, Sentence) const
std::vector< std::vector< std::string > > LoadRefFiles(std::vector< std::string > files)
string ToString(const char separator) const