Cambridge SMT System
samplehyps.main.cpp
Go to the documentation of this file.
1 #include <main.custom_assert.hpp>
2 #include <main.logger.hpp>
3 #include <main.samplehyps.hpp>
4 
5 using fst::Hyp;
6 
7 template<class Arc>
8 struct HypW: public Hyp<Arc> {
9 
10  HypW (std::basic_string<unsigned> const& h, typename Arc::Weight const& c)
11  : Hyp<Arc> (h, c) {
12  }
13  HypW (HypW<Arc> const& h)
14  : Hyp<Arc> (h) {
15  }
16 };
17 
18 template<class Arc>
19 std::ostream& operator<< (std::ostream& os, const Hyp<Arc>& obj) {
20  for (unsigned k = 0; k < obj.hyp.size(); ++k) {
21  if (obj.hyp[k] == OOV) continue;
22  if (obj.hyp[k] == DR) continue;
23  if (obj.hyp[k] == EPSILON) continue;
24  if (obj.hyp[k] == SEP) continue;
25  os << obj.hyp[k] << " ";
26  }
27  os << "\t";
28  os << obj.cost;
29  return os;
30 };
31 
32 
/**
 * \brief One sampled pair of hypotheses together with their score difference.
 *
 * As filled in by ProSBLEUSample, j1 and j2 are indices into the n-best
 * hypothesis vector and scoreDiff is the sentence-level BLEU of hypothesis j1
 * minus that of hypothesis j2.
 */
class Sample {
 public:
  /// Stores the two hypothesis indices and their score difference as given.
  Sample(const unsigned j1, const unsigned j2, const double scoreDiff) :
    j1(j1), j2(j2), scoreDiff(scoreDiff) {}

  unsigned int j1;   ///< Index of the first hypothesis of the pair.
  unsigned int j2;   ///< Index of the second hypothesis of the pair.
  double scoreDiff;  ///< Score of hypothesis j1 minus score of hypothesis j2.
};
42 
43 bool SampleSortPredicate(const Sample& s1, const Sample& s2) {
44  return s1.scoreDiff > s2.scoreDiff;
45 }
46 
47 template <class HypT>
48 ucam::fsttools::Bleu LBleuScorer(ucam::fsttools::BleuScorer& bleuScorer, unsigned const& sid, HypT const& hyp) {
50  unsigned offset = (hyp.size() < 2 ? 0 : 1);
51  for (unsigned z=offset; z< hyp.size()-offset; z++)
52  h.push_back( hyp[z] );
53  return bleuScorer.ComputeSBleu(bleuScorer.SentenceBleuStats(sid, h));
54 }
55 
/**
 * \brief A training example: a target value paired with a feature weight.
 *
 * \tparam Value  Type of the target value (instantiated with float in this file).
 * \tparam Weight Type of the feature, an arc weight in this file.
 */
template <class Value, class Weight>
struct LabeledFeature {
  Value value;  ///< Target value of the example.
  Weight fea;   ///< Feature associated with the example.
};
61 
62 template <class Weight, class HypT>
63 vector< LabeledFeature<float, Weight> >
65  std::vector<HypT> const& hyps, unsigned const& sid,
66  unsigned const& n, unsigned const &ns, double const& alpha, bool negatives=false, bool negate=true ) {
67 
68  std::set< std::pair<unsigned, unsigned> > indexpairs;
69  vector< Sample > samples;
70  for (unsigned s=0; s<n; s++) {
71  LINFO("s="<<s);
72  unsigned j1 = rand() % hyps.size();
73  unsigned j2 = rand() % hyps.size();
74  LINFO("1 [" << j1 <<"] " <<hyps[j1]);
75  LINFO("2: ["<<j2<<"] "<<hyps[j2]);
76  if (indexpairs.find( std::make_pair(j1, j2)) != indexpairs.end() ) {
77  LINFO("--skipping - already done");
78  continue;
79  }
80  indexpairs.insert( std::make_pair(j1, j2) );
81  ucam::fsttools::Bleu bs1 = LBleuScorer(bleuScorer, sid, hyps[j1].hyp);
82  ucam::fsttools::Bleu bs2 = LBleuScorer(bleuScorer, sid, hyps[j2].hyp);
83  LINFO("SBLUE1= "<<bs1.m_bleu<<" ; SBLEU2="<<bs2.m_bleu << " ; DIFF=" << fabs(bs1.m_bleu - bs2.m_bleu));
84  if ( bs1.m_bleu - bs2.m_bleu > alpha )
85  samples.push_back(Sample(j1, j2, bs1.m_bleu - bs2.m_bleu));
86  if ( bs2.m_bleu - bs1.m_bleu > alpha ) {
87  samples.push_back(Sample(j2, j1, bs2.m_bleu - bs1.m_bleu));
88  }
89  }
90  LINFO("Positive samples found: " << samples.size());
91  sort(samples.begin(), samples.end(), SampleSortPredicate);
92  std::vector< LabeledFeature< float, Weight> > ss;
93  int np=0;
94  for (unsigned s=0; s<ns && s < samples.size(); s++) {
95  unsigned j1 = samples[s].j1;
96  unsigned j2 = samples[s].j2;
97  if (negate) {
98  unsigned x = j1;
99  j1 = j2;
100  j2 = x;
101  }
103  lf.value = samples[s].scoreDiff;
104  lf.fea = Divide(hyps[j1].cost, hyps[j2].cost);
105  ss.push_back(lf);
106  LINFO("Sample " << s << " score diff " << samples[s].scoreDiff);
107  np++;
108  if (!negatives)
109  continue;
110  lf.value = -samples[s].scoreDiff;
111  lf.fea = Divide(hyps[j2].cost, hyps[j1].cost);
112  ss.push_back(lf);
113  }
114  LINFO("Positive samples found: " << np << " of " << n);
115  return ss;
116 };
117 
// Reads the tuning lattices, extracts an n-best list from each one and writes
// the examples produced by ProSBLEUSample to the output file(s).
//
// NOTE(review): this listing skips its lines 119-121, i.e. the function
// signature and whatever followed it. The reference text at the end of the
// listing gives the signature as
//   int SampleWFSAs(ucam::util::RegistryPO const &rg)
// The other two lines are unknown; PatternAddress, oszfstream, PROJECT_OUTPUT
// and PROJECT_INPUT are used unqualified below, so they may have been using
// declarations/directives - restore them from the real source file.
// NOTE(review): no return statement is visible in this body although the
// function is listed as returning int - confirm and add one.
118 template <class Arc, class HypT>
 // NOTE(review): `input` is not used anywhere in the visible body.
122  PatternAddress<unsigned> input(rg.get<std::string>(HifstConstants::kInput.c_str()));
123  PatternAddress<unsigned> output(rg.get<std::string>(HifstConstants::kOutput.c_str()));
 // n: size of the n-best list; it is also passed to BleuScorer and, as the
 // number of pairs to draw, to ProSBLEUSample. ns and alpha are forwarded to
 // ProSBLEUSample unchanged.
124  unsigned n = rg.get<unsigned>(HifstConstants::kNbest.c_str());
125  unsigned ns = rg.get<unsigned>(HifstConstants::kNSamples.c_str());
126  float alpha = rg.get<float>(HifstConstants::kAlpha.c_str());
 // These flags are driven by the mere presence of the option.
127  bool negatives = rg.exists(HifstConstants::kNegativeExamples.c_str());
128  bool binarytarget = rg.exists(HifstConstants::kBinaryTarget.c_str());
129  bool negate = !rg.exists(HifstConstants::kDontNegate.c_str());
130  std::string extTok = rg.getString(HifstConstants::kExternalTokenizer.c_str());
131  std::string wMap = rg.getString(HifstConstants::kWordMap.c_str());
132  // std::string wMap = "";
133  //
134  bool printOutputLabels = rg.exists(HifstConstants::kPrintOutputLabels.c_str());
135  std::string refFiles;
136  bool intRefs;
 // NOTE(review): the two conditions guarding the blocks below (listing lines
 // 137 and 141) are missing from this listing; presumably they test whether
 // kWordRefs / kIntRefs were given - confirm. If neither block runs, intRefs
 // is read uninitialised when the BleuScorer is constructed.
138  refFiles = rg.getString(HifstConstants::kWordRefs);
139  intRefs = false;
140  }
142  refFiles = rg.getString(HifstConstants::kIntRefs);
143  intRefs = true;
144  }
145  std::cerr << refFiles << "**" <<std::endl;
146  ucam::fsttools::BleuScorer bleuScorer(refFiles, extTok, n, intRefs, wMap);
147  ucam::fsttools::TuneSet< Arc > tuneSet(rg);
148  ucam::fsttools::Bleu ibs = tuneSet.ComputeBleu(bleuScorer);
149  FORCELINFO("Set level Bleu: " << ibs);
 // Seed rand() from the clock unless an explicit seed option is given.
150  unsigned seed = time(NULL);
151  if (rg.exists(HifstConstants::kRandomSeed.c_str()))
152  seed = rg.get<unsigned>(HifstConstants::kRandomSeed.c_str());
153  FORCELINFO("random seed: " << seed);
154  srand(seed);
155  boost::scoped_ptr<oszfstream> out;
156  std::string old;
157  for (unsigned i=0; i<tuneSet.cachedLats.size(); i++) {
158  fst::VectorFst<Arc> ifst(*tuneSet.cachedLats[i]);
159  fst::VectorFst<Arc> nfst;
 // Open a new output stream only when the expanded output name changes.
 // NOTE(review): `out` stays null if output(i) is ever the empty string.
160  if (old != output (i) ) {
161  out.reset(new oszfstream (output(i)));
162  old = output(i);
163  }
 // Lattices without states are reported and skipped.
164  if (!ifst.NumStates() ) {
165  FORCELINFO("EMPTY: " << i);
166  continue;
167  }
168  // Projecting allows unique to work for all cases.
169  fst::Project(&ifst, (printOutputLabels?PROJECT_OUTPUT:PROJECT_INPUT));
 // n shortest paths; the fourth argument requests unique paths.
170  ShortestPath (ifst, &nfst, n, true );
171  std::vector<HypT> hyps;
172  fst::printStrings<Arc> (nfst, &hyps);
 // The lattice index i is used as the sentence id for BLEU scoring.
173  std::vector< LabeledFeature< float, typename Arc::Weight> > fea =
174  ProSBLEUSample<typename Arc::Weight, HypT>(bleuScorer, hyps, i, n, ns, alpha, negatives, negate);
 // One example per line: target value (or 1/0 with binarytarget), a space,
 // then the feature.
175  for (unsigned s=0; s<fea.size(); s++) {
176  *out << (binarytarget ? (fea[s].value > 0.0 ? 1 : 0) : fea[s].value);
177  *out << " " << fea[s].fea << std::endl;
178  }
179  }
180  FORCELINFO("Done Sample WFSAs");
181 };
182 
183 
184 
185 int main ( int argc, const char* argv[] ) {
186  ucam::util::initLogger ( argc, argv );
187  FORCELINFO ( argv[0] << " starts!" );
188  ucam::util::RegistryPO rg ( argc, argv );
189  FORCELINFO ( rg.dump ( "CONFIG parameters:\n=====================",
190  "====================="));
191  std::string const& semiring = rg.get<std::string> (HifstConstants::kHifstSemiring);
192  if (semiring == HifstConstants::kHifstSemiringStdArc) {
193  SampleWFSAs<fst::StdArc, Hyp<fst::StdArc> > (rg);
194  FORCELINFO("semiring StdArc");
195  } else if (semiring == HifstConstants::kHifstSemiringLexStdArc) {
196  FORCELINFO("semiring LexStdArc");
197  SampleWFSAs<fst::LexStdArc, Hyp<fst::LexStdArc> > (rg);
198  } else if (semiring == HifstConstants::kHifstSemiringTupleArc) {
199  FORCELINFO("semiring TupleArc32");
200  const std::string& tuplearcWeights =
202  ? rg.get<std::string> (HifstConstants::kTupleArcWeights.c_str()) : "");
203  if (tuplearcWeights.empty() ) {
204  LERROR ("The tuplearc.weights option needs to be specified "
205  "for the tropical sparse tuple weight semiring "
206  "(--semiring=tuplearc)");
207  exit (EXIT_FAILURE);
208  }
209  TupleW32::Params() = ucam::util::ParseParamString<float> (tuplearcWeights);
210  SampleWFSAs<TupleArc32, Hyp<TupleArc32> > (rg);
211  } else {
212  LERROR ("Sorry, semiring option not correctly defined");
213  }
214  FORCELINFO ( argv[0] << " finished!" );
215 }
216 
Wrapper stream class that writes to pipes, text files or gzipped files.
Definition: szfstream.hpp:200
std::string const kHifstSemiring
int SampleWFSAs(ucam::util::RegistryPO const &rg)
bool SampleSortPredicate(const Sample &s1, const Sample &s2)
std::string const kBinaryTarget
class that expands a wildcard into its actual value. This is useful e.g. for filenames ranging severa...
std::string const kPrintOutputLabels
std::vector< Wid > SentenceIdx
Definition: bleu.hpp:22
std::string const kAlpha
#define LINFO(msg)
#define SEP
unsigned int j2
T get(const std::string &key) const
Returns parsed value associated to key.
Definition: registrypo.hpp:194
void initLogger(int argc, const char *argv[])
Inits logger, parses param options checking for --logger.verbose.
std::string const kInput
std::string const kExternalTokenizer
#define FORCELINFO(msg)
std::string const kIntRefs
std::string const kDontNegate
VectorFstPtrVector cachedLats
Definition: tuneset.hpp:19
double scoreDiff
Bleu ComputeBleu(BleuScorer &bs)
Definition: tuneset.hpp:47
std::string const kNSamples
#define DR
std::string const kNbest
std::string const kOutput
BleuStats SentenceBleuStats(const Sid sid, const SentenceIdx &hypIdx)
Definition: bleu.hpp:296
std::string const kWordRefs
Struct template that represents a hypothesis in a lattice.
Definition: fstio.hpp:142
std::string const kRandomSeed
Same as Hyp but the printing will convert integer ids to words.
Sample(const unsigned j1, const unsigned j2, const double scoreDiff)
TropicalSparseTupleWeight< T > Divide(const TropicalSparseTupleWeight< T > &w1, const TropicalSparseTupleWeight< T > &w2, DivideType type=DIVIDE_ANY)
HypW(std::basic_string< unsigned > const &h, typename Arc::Weight const &c)
std::string const kHifstSemiringLexStdArc
Bleu ComputeSBleu(const BleuStats &bs)
Definition: bleu.hpp:338
std::string const kHifstSemiringStdArc
Static variables for logger. Include only once from main file.
std::string getString(const std::string &key) const
Performs get<string> and checks whether the real value is to be loaded from file (--param=file://.....)
Definition: registrypo.hpp:205
std::string const kHifstSemiringTupleArc
bool exists(const std::string &key) const
Determines whether a program option (key) has been defined by the user.
Definition: registrypo.hpp:235
std::string const kWordMap
#define EPSILON
std::basic_string< unsigned > hyp
Definition: fstio.hpp:143
unsigned int j1
std::string dump(const std::string &decorator_start="", const std::string &decorator_end="")
Dumps all configuration parameters into a string with a reasonably pretty format. ...
Definition: registrypo.hpp:108
std::string const kNegativeExamples
std::string const kTupleArcWeights
#define OOV
Arc::Weight cost
Definition: fstio.hpp:144
#define LERROR(msg)
HypW(HypW< Arc > const &h)
ucam::fsttools::Bleu LBleuScorer(ucam::fsttools::BleuScorer &bleuScorer, unsigned const &sid, HypT const &hyp)
int main(int argc, const char *argv[])
vector< LabeledFeature< float, Weight > > ProSBLEUSample(ucam::fsttools::BleuScorer &bleuScorer, std::vector< HypT > const &hyps, unsigned const &sid, unsigned const &n, unsigned const &ns, double const &alpha, bool negatives=false, bool negate=true)
Static variable for custom_assert. Include only once from main file.