Cambridge SMT System
fstutils.extractngrams.hpp
Go to the documentation of this file.
1 // Licensed under the Apache License, Version 2.0 (the "License");
2 // you may not use these files except in compliance with the License.
3 // You may obtain a copy of the License at
4 //
5 // http://www.apache.org/licenses/LICENSE-2.0
6 //
7 // Unless required by applicable law or agreed to in writing, software
8 // distributed under the License is distributed on an "AS IS" BASIS,
9 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 // See the License for the specific language governing permissions and
11 // limitations under the License.
12 
13 // Copyright 2012 - Gonzalo Iglesias, AdriĆ  de Gispert, William Byrne
14 
15 #ifndef FSTUTILS_EXTRACTNGRAMS_HPP
16 #define FSTUTILS_EXTRACTNGRAMS_HPP
17 
18 namespace fst {
19 
20 typedef unsigned WordId;
21 typedef std::basic_string<WordId> NGram;
22 typedef std::unordered_map<NGram
23 , StdArc::Weight
26 typedef std::vector<NGram> NGramVector;
27 
28 template<class Arc>
29 inline void filterTransducerByLength (VectorFst<Arc>& myfst, unsigned min,
30  unsigned max) {
31  // Create size limiting fst
32  VectorFst<StdArc> sizefst;
33  sizefst.AddState();
34  sizefst.SetStart (0);
35  for (unsigned k = 1; k < min; ++k) {
36  sizefst.AddState();
37  sizefst.AddArc (k - 1, StdArc (RHO, RHO, 0, k) );
38  }
39  for (unsigned k = min; k <= max; ++k) {
40  sizefst.AddState();
41  sizefst.SetFinal (k, StdArc::Weight::One() );
42  sizefst.AddArc (k - 1, StdArc (RHO, RHO, 0, k) );
43  }
44  myfst = (RRhoCompose (myfst, sizefst) );
45 }
46 
52 template<class Arc>
53 class GetNGrams {
54  private:
55  NGram v;
56  public:
57 
58  inline void operator() (std::vector<NGram>& ngrams, const VectorFst<Arc>& count,
59  unsigned order, typename Arc::StateId s = 0) {
60  if (!order) return;
61  for (ArcIterator< VectorFst<Arc> > i (count, s); !i.Done(); i.Next() ) {
62  Arc a = i.Value();
63  v.push_back (a.ilabel);
64  if (count.Final (a.nextstate) != Arc::Weight::Zero() ) ngrams.push_back ( v );
65  (*this) (ngrams, count, order - 1, a.nextstate);
66  v.resize (v.size() - 1);
67  }
68  }
69 };
70 
71 template<class Arc>
72 inline void extractNGrams (VectorFst<Arc>& myfst, std::vector<NGram>& ngrams,
73  unsigned maxorder) {
74  if (!myfst.NumStates() ) return;
75  LINFO ("Building substring transducer");
76  buildSubstringTransducer (&myfst);
77  LDBG_EXECUTE (FstWrite (myfst, "fsts/extractngrams/ss.fst") );
78  LINFO ("Filtering transducer by maxorder");
79  filterTransducerByLength (myfst, 1, maxorder);
80  LDBG_EXECUTE (FstWrite (myfst, "fsts/extractngrams/ss+maxorder.fst") );
81  LINFO ("Determinizing...");
82  Determinize (myfst, &myfst);
83  LDBG_EXECUTE (FstWrite (myfst, "fsts/extractngrams/ss+maxorder+det.fst") );
84  LINFO ("Arcsorting...");
85  ArcSort (&myfst, StdILabelCompare() );
86  LDBG_EXECUTE (FstWrite (myfst, "fsts/extractngrams/ss+maxorder+det+as.fst") );
87  GetNGrams<Arc>() (ngrams, myfst, maxorder,
88  myfst.Start() ); //call recursive function to store ngrams
89 }
90 
91 } // end namespaces
92 
93 inline std::ostream& operator<< (std::ostream& o, const fst::NGram& n) {
94  for (int i = 0; i < n.size(); i++) {
95  if (i > 0) {
96  o << " ";
97  }
98  o << n[i];
99  }
100  return o;
101 }
102 
103 #endif //FSTUTILS_EXTRACTNGRAMS_HPP
#define LINFO(msg)
Definition: fstio.hpp:27
#define LDBG_EXECUTE(order)
std::vector< NGram > NGramVector
unsigned WordId
HashFVec< std::basic_string< unsigned > > hashfvecuint
void FstWrite(const Fst< Arc > &fst, const std::string &filename, const std::string &txtname="txt")
Templated method that writes an fst either in binary or text format.
Definition: fstio.hpp:111
void buildSubstringTransducer(fst::VectorFst< Arc > *myfst)
Builds substring version of an fst. This is a destructive implementation.
Definition: fstutils.hpp:118
Functor with recursive procedure that extracts into a vector all the possible ngrams of a lattice...
void extractNGrams(VectorFst< Arc > &myfst, std::vector< NGram > &ngrams, unsigned maxorder)
ComposeFst< Arc > RRhoCompose(const VectorFst< Arc > &fstlhs, const VectorFst< Arc > &fstrhs, const typename Arc::Label kSpecialLabel=RHO)
Performs composition with RHO, based on OpenFST matchers RHO transitions are expected on fstrhs...
std::basic_string< WordId > NGram
std::unordered_map< NGram, StdArc::Weight, ucam::util::hashfvecuint, ucam::util::hasheqvecuint > NGramList
void operator()(std::vector< NGram > &ngrams, const VectorFst< Arc > &count, unsigned order, typename Arc::StateId s=0)
#define RHO
std::ostream & operator<<(std::ostream &o, const fst::NGram &n)
void filterTransducerByLength(VectorFst< Arc > &myfst, unsigned min, unsigned max)