15 #ifndef FSTUTILS_EXTRACTNGRAMS_HPP 16 #define FSTUTILS_EXTRACTNGRAMS_HPP 21 typedef std::basic_string<WordId>
NGram;
22 typedef std::unordered_map<
NGram 32 VectorFst<StdArc> sizefst;
35 for (
unsigned k = 1; k < min; ++k) {
37 sizefst.AddArc (k - 1, StdArc (
RHO,
RHO, 0, k) );
39 for (
unsigned k = min; k <= max; ++k) {
41 sizefst.SetFinal (k, StdArc::Weight::One() );
42 sizefst.AddArc (k - 1, StdArc (
RHO,
RHO, 0, k) );
58 inline void operator() (std::vector<NGram>& ngrams,
const VectorFst<Arc>& count,
59 unsigned order,
typename Arc::StateId s = 0) {
61 for (ArcIterator< VectorFst<Arc> > i (count, s); !i.Done(); i.Next() ) {
63 v.push_back (a.ilabel);
64 if (count.Final (a.nextstate) != Arc::Weight::Zero() ) ngrams.push_back ( v );
65 (*this) (ngrams, count, order - 1, a.nextstate);
66 v.resize (v.size() - 1);
72 inline void extractNGrams (VectorFst<Arc>& myfst, std::vector<NGram>& ngrams,
74 if (!myfst.NumStates() )
return;
75 LINFO (
"Building substring transducer");
78 LINFO (
"Filtering transducer by maxorder");
81 LINFO (
"Determinizing...");
82 Determinize (myfst, &myfst);
84 LINFO (
"Arcsorting...");
85 ArcSort (&myfst, StdILabelCompare() );
94 for (
int i = 0; i < n.size(); i++) {
103 #endif //FSTUTILS_EXTRACTNGRAMS_HPP
#define LDBG_EXECUTE(order)
std::vector< NGram > NGramVector
HashFVec< std::basic_string< unsigned > > hashfvecuint
void FstWrite(const Fst< Arc > &fst, const std::string &filename, const std::string &txtname="txt")
Templated method that writes an fst either in binary or text format.
void buildSubstringTransducer(fst::VectorFst< Arc > *myfst)
Builds substring version of an fst. This is a destructive implementation.
Functor with a recursive procedure that extracts all the possible n-grams of a lattice into a vector...
void extractNGrams(VectorFst< Arc > &myfst, std::vector< NGram > &ngrams, unsigned maxorder)
ComposeFst< Arc > RRhoCompose(const VectorFst< Arc > &fstlhs, const VectorFst< Arc > &fstrhs, const typename Arc::Label kSpecialLabel=RHO)
Performs composition with RHO, based on OpenFST matchers. RHO transitions are expected on fstrhs...
std::basic_string< WordId > NGram
std::unordered_map< NGram, StdArc::Weight, ucam::util::hashfvecuint, ucam::util::hasheqvecuint > NGramList
void operator()(std::vector< NGram > &ngrams, const VectorFst< Arc > &count, unsigned order, typename Arc::StateId s=0)
void filterTransducerByLength(VectorFst< Arc > &myfst, unsigned min, unsigned max)