Cambridge SMT System
ErrorSurface.h
Go to the documentation of this file.
1 //Copyright (c) 2012, University of Cambridge
2 //All rights reserved.
3 //
4 //Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met://
5 //
6 // * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
7 // * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
8 // * Neither the name of the University of Cambridge nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
9 //
10 //THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
11 
12 #ifndef ERRORSURFACE_H_
13 #define ERRORSURFACE_H_
14 
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <limits>
#include <sstream>
#include <string>
#include <vector>
#include <boost/thread/mutex.hpp>

#include "MertCommon.h"
#include "BleuStats.h"
#include "IntervalData.h"
27 
/*
 * Ordering predicate for interval boundaries: returns true when b1 lies
 * strictly to the left of b2 on the gamma axis.
 *
 * The only requirement on IntervalBoundary is a public member `gamma` whose
 * type supports operator<. Equal gammas compare as "not less" in both
 * directions, so std::sort may place them in either order.
 */
template<typename IntervalBoundary>
bool IntervalBoundarySortPredicate (const IntervalBoundary& b1,
                                    const IntervalBoundary& b2) {
  return b1.gamma < b2.gamma;
}
33 
34 template<typename RD>
35 class ErrorSurface {
36  public:
37  typedef RD RefData;
38  typedef typename RefData::ErrorStats ErrorStats;
40  typedef typename ErrorStats::Error Error;
41 
42  private:
43 
44  ErrorStats MergeInitials() const {
45  ErrorStats es;
46  for (typename std::vector<IntervalBoundary>::const_iterator it =
47  ++ (initials.begin() ); it != initials.end(); ++it) {
48  es = es + it->errorStats;
49  }
50  return es;
51  }
52 
53  /*
54  * Slightly hacky way of updating both the translation and expected translation score. First element of
55  * referenced array is the translation score and the second is the expected score.
56  */
57  void MergeInitialScores (double scores[]) const {
58  scores[0] = 0.0;
59  scores[1] = 0.0;
60  for (typename std::vector<IntervalBoundary>::const_iterator it =
61  ++ (initials.begin() ); it != initials.end(); ++it) {
62  scores[0] += it->deltaScore;
63  scores[1] += it->deltaExpScore;
64  }
65  }
66 
67  double optimalGamma;
68  Error optimalError;
69  std::vector<ErrorStats> prev;
70  RefData* refs;
71  bool unbounded;
72  std::vector<IntervalBoundary>
73  initials; // Interval boundaries for left-most line segments (gamma = -infinity)
74  std::vector<IntervalBoundary> prevBestIBs;
75  boost::mutex mutex;
76 
77  public:
78 
79  ErrorSurface (unsigned int noOfSentences, RefData* refs) :
80  refs (refs) {
81  initials.resize (noOfSentences + 1);
82  prevBestIBs.resize (noOfSentences + 1);
83  prev.resize (noOfSentences + 1);
84  }
85 
86  ErrorSurface (const ErrorSurface& other) :
87  optimalGamma (other.optimalGamma), optimalError (other.optimalError), prev (
88  other.prev), refs (other.refs), unbounded (other.unbounded), initials (
89  other.initials), prevBestIBs (other.prevBestIBs) {
90  }
91 
93  if (this == &rhs) {
94  return *this;
95  }
96  optimalGamma = rhs.optimalGamma;
97  optimalError = rhs.optimalError;
98  prev = rhs.prev;
99  refs = rhs.refs;
100  unbounded = rhs.unbounded;
101  initials = rhs.initials;
102  prevBestIBs = rhs.prevBestIBs;
103  return *this;
104  }
105 
106  double GetOptimalGamma() {
107  return optimalGamma;
108  }
109 
110  Error GetOptimalError() {
111  return optimalError;
112  }
113 
114  bool GetUnbounded() {
115  return unbounded;
116  }
117 
118  void ComputeSurface() {
119  std::vector<IntervalBoundary> currentIBs (initials);
120  double scores[2];
121  MergeInitialScores (scores);
122  ErrorStats aggregateErrorStats = MergeInitials();
123  sort (boundaries.begin(), boundaries.end(),
124  IntervalBoundarySortPredicate<IntervalBoundary>);
125  optimalGamma = boundaries.front().gamma - 1;
126  optimalError = aggregateErrorStats.ComputeError();
127  typename std::vector<IntervalBoundary>::iterator itNext =
128  ++ (boundaries.begin() );
129  unbounded = true;
130  for (typename std::vector<IntervalBoundary>::iterator it =
131  boundaries.begin(); it != boundaries.end(); ++it) {
132  scores[0] = it->score + it->deltaScore;
133  scores[1] = it->expScore + it->deltaExpScore;
134  it->score = scores[0];
135  it->expScore = scores[1];
136  aggregateErrorStats = aggregateErrorStats + it->errorStats;
137  currentIBs[it->sentence].errorStats =
138  currentIBs[it->sentence].errorStats + it->errorStats;
139  Error current = aggregateErrorStats.ComputeError();
140  if (current > optimalError) {
141  copy (++ (currentIBs.begin() ), currentIBs.end(),
142  ++ (prevBestIBs.begin() ) );
143  optimalError = current;
144  if (itNext == boundaries.end() ) {
145  optimalGamma = it->gamma + 1;
146  unbounded = true;
147  } else {
148  optimalGamma = 0.5 * (it->gamma + itNext->gamma);
149  unbounded = false;
150  }
151  }
152  ++itNext;
153  }
154  }
155 
156  void Reset() {
157  boundaries.clear();
158  optimalGamma = 0.0;
159  Error zero;
160  optimalError = zero;
161  copy (++ (prevBestIBs.begin() ), prevBestIBs.end(), ++ (initials.begin() ) );
162  }
163 
164  void PrintErrorSurface (ostream& os) const {
165  ErrorStats aggregate;
166  os << optimalGamma << " " << optimalError << endl;
167  for (typename std::vector<IntervalBoundary>::const_iterator it =
168  initials.begin(); it != initials.end(); ++it) {
169  aggregate = aggregate + it->errorStats;
170  os << *it << endl;
171  }
172  IntervalBoundary temp;
173  for (typename std::vector<IntervalBoundary>::const_iterator it =
174  boundaries.begin(); it != boundaries.end(); ++it) {
175  temp = *it;
176  aggregate = aggregate + it->errorStats;
177  temp.errorStats = aggregate;
178  os << temp << endl;
179  }
180  }
181 
182  void WriteErrorSurface (const std::string& filename) const {
183  static unsigned int lineOptCount = 0;
184  std::stringstream sstream;
185  sstream << filename << lineOptCount;
186  std::ofstream ofs (sstream.str().c_str() );
187  if (!ofs.good() ) {
188  cerr << "ERROR: can't write: " << sstream.str() << '\n';
189  exit (1);
190  }
191  tracer << "writing error surface to " << sstream.str() << '\n';
192  PrintErrorSurface (ofs);
193  ofs.close();
194  ++lineOptCount;
195  }
196 
197  void CreateInitial (Sid sid, const double gamma, const Sentence h,
198  const double modelScore, const double expScore) {
199  prev[sid] = refs->ComputeErrorStats (sid, h);
200  initials[sid] = IntervalBoundary (sid, gamma, prev[sid], modelScore,
201  expScore);
202  }
203 
204  void CreateInterval (Sid sid, const double gamma, const Sentence h,
205  const double modelScore, const double expScore) {
206  ErrorStats next = refs->ComputeErrorStats (sid, h);
207  IntervalBoundary b (sid, gamma, next - prev[sid], modelScore, expScore);
208  prev[sid] = next;
209  mutex.lock();
210  boundaries.push_back (b);
211  mutex.unlock();
212  }
213 
214  std::vector<IntervalBoundary>
215  boundaries; // Interval boundaries where top-most line segment changes
216 
217 };
218 
219 #endif /* ERRORSURFACE_H_ */
Error GetOptimalError()
Definition: ErrorSurface.h:110
ErrorStats errorStats
Definition: IntervalData.h:33
IntervalData< ErrorStats > IntervalBoundary
Definition: ErrorSurface.h:39
bool IntervalBoundarySortPredicate(const IntervalBoundary &b1, const IntervalBoundary &b2)
Definition: ErrorSurface.h:29
void CreateInterval(Sid sid, const double gamma, const Sentence h, const double modelScore, const double expScore)
Definition: ErrorSurface.h:204
void PrintErrorSurface(ostream &os) const
Definition: ErrorSurface.h:164
#define tracer
Definition: data.lmbr.hpp:18
bool GetUnbounded()
Definition: ErrorSurface.h:114
void CreateInitial(Sid sid, const double gamma, const Sentence h, const double modelScore, const double expScore)
Definition: ErrorSurface.h:197
unsigned int Sid
Definition: MertCommon.h:45
ErrorSurface(unsigned int noOfSentences, RefData *refs)
Definition: ErrorSurface.h:79
std::vector< Wid > Sentence
Definition: MertCommon.h:48
std::vector< IntervalBoundary > boundaries
Definition: ErrorSurface.h:215
RefData::ErrorStats ErrorStats
Definition: ErrorSurface.h:38
ErrorSurface & operator=(const ErrorSurface &rhs)
Definition: ErrorSurface.h:92
ErrorSurface(const ErrorSurface &other)
Definition: ErrorSurface.h:86
ErrorStats::Error Error
Definition: ErrorSurface.h:40
void WriteErrorSurface(const std::string &filename) const
Definition: ErrorSurface.h:182
void ComputeSurface()
Definition: ErrorSurface.h:118
double GetOptimalGamma()
Definition: ErrorSurface.h:106