16 #ifndef TASK_RULES2FLOWERFST_HPP 17 #define TASK_RULES2FLOWERFST_HPP 69 const unsigned offset =
71 const std::string& alignmentlattices =
"",
77 alilats_ ( rg.
exists ( alignmentlattices ) ? rg.get<std::string>
78 ( alignmentlattices ) :
"" ),
79 grammar_ ( rg.get<std::string> ( grammarloadkey ) ),
80 fscales_ (
fst::TropicalSparseTupleWeight<float>::Params() ),
81 filterbyalilats_ ( rg.
exists ( alignmentlattices ) ),
82 grammarstorekey_ ( grammarstorekey ) {
86 bool run ( DataT& d ) {
87 load ( grammar_ ( d.sidx ) );
95 std::vector<std::string> vgn;
96 boost::algorithm::split ( vgn, filename, boost::algorithm::is_any_of (
"." ) );
97 if ( vgn[vgn.size() - 1] ==
"fst" ) directload =
true;
98 else if ( vgn.size() > 2 )
if ( vgn[vgn.size() - 1] ==
"gz" 99 && vgn[vgn.size() - 2] ==
"fst" ) directload =
true;
101 LINFO (
"Loading FST directly (assumes arcsorted flower) =" << filename );
102 fst::VectorFst<TupleArc32> *yupi = fst::VectorFstRead<TupleArc32> ( filename );
103 flowerlattice_ = *yupi;
112 if ( !
USER_CHECK ( filename !=
"" ,
"No grammar to load?" ) )
return false;
113 if ( filename == previousfile_ ) {
114 LINFO (
"Skipping grammar loading..." );
121 previousfile_ = filename;
125 ,
bool filterbyalilats
131 if ( filterbyalilats )
135 VectorFst<LexStdArc> *alilatsfst =
136 VectorFstRead<LexStdArc> (
alilats_ ( ir->get() ) );
137 extractSourceVocabulary<LexStdArc> ( *alilatsfst, &idxrules );
143 flowerlattice_.AddState();
144 flowerlattice_.SetStart ( 0 );
145 flowerlattice_.SetFinal ( 0, TupleArc32::Weight::One() );
148 virtual void fillStructure(
unsigned label, TupleArc32::Weight
const & vtcost) {
149 flowerlattice_.AddArc ( 0,
TupleArc32 ( label, label, vtcost, 0 ) );
153 fst::ArcSort<TupleArc32> ( &
flowerlattice_, fst::ILabelCompare<TupleArc32>() );
162 bool load (
const std::string& filename ) {
166 FORCELINFO (
"loading grammar from " << filename );
168 unordered_set<unsigned> idxrules;
172 unsigned lc = 0, llc = 0;
174 myrulefile.
open ( filename.c_str() );
175 LINFO (
"Opening rule file " << filename );
177 LERROR (
"Failed to open " << filename );
184 if ( line.size() > 0 ) {
185 while ( line.at ( line.length() - 1 ) ==
' ' )
186 line.erase ( line.length() - 1 );
187 while ( line.at ( 0 ) ==
' ' )
189 if ( line.at ( 0 ) ==
'#' || line.at ( 0 ) ==
'%' ) line =
"";
192 LDEBUG ( lc <<
"is an empty line/comment." );
195 if ( ! ( lc % 100000 ) )
LINFO ( lc <<
" rules parsed..." );
196 if ( !idxrules.empty() )
197 if ( idxrules.find ( lc ) == idxrules.end() )
continue;
200 std::vector<std::string> fields;
201 boost::algorithm::split ( fields, line, boost::algorithm::is_any_of (
" " ) );
202 for (
unsigned k = 3; k < fields.size(); ++k ) {
205 std::vector<std::string> splitfields;
206 boost::algorithm::split ( splitfields, fields[k],
207 boost::algorithm::is_any_of (
"@" ) );
209 if ( splitfields.size() == 2 ) {
210 if ( splitfields[0] ==
"" ) prob = 1.000f;
211 else prob = toNumber<float> ( splitfields[0] );
212 pos = offset_ + toNumber<unsigned> ( splitfields[1] );
214 prob = toNumber<float> ( fields[k] );
215 pos = offset_ + k - 2;
218 vtcost.Push ( pos, prob );
224 }
while ( myrulefile && !myrulefile.
eof() );
225 LINFO (
"File: " << filename <<
" has been succesfully loaded" );
227 LINFO (
"Number of rules actually loaded for this job: " << llc );
229 return built_ =
true;
237 template<
class DataT>
240 unordered_map<unsigned,TupleArc32::Weight> weights_;
244 const unsigned offset =1,
252 d.weights = &weights_;
257 virtual void fillStructure(
unsigned label, TupleArc32::Weight
const & vtcost) {
259 weights_[label + 2 ] = vtcost;
260 weights_[label + 2 ].SetDefaultValue(0);
261 LDEBUG(
"Loading rule id=" << label + 2 <<
" with weights=" << weights_[label + 2 ] );
267 ,
bool filterbyalilats
273 if ( ! filterbyalilats )
return;
281 #endif // TASK_RULES2FLOWERFST_HPP #define ZDISALLOW_COPY_AND_ASSIGN(TypeName)
bool directload(const std::string &filename)
If it is an fst, load directly.
class that expands a wildcard into its actual value. This is useful e.g. for filenames ranging severa...
virtual void initStructure()
const std::string kRuleflowerlatticeLoad
bool run(DataT &d)
Inherited method from ucam::util::TaskInterface. Loads the flower lattice into the data object...
bool built_
Is the flower lattice built.
void open(const std::stringstream &ss)
bool filterbyalilats_
If true, the grammar flower lattice will be loaded only with rules that have been used in the alignment lattices.
virtual void initStructure()
const ucam::util::RegistryPO & rg_
Registry object – contains program options.
boost::scoped_ptr< NumberRangeInterface< unsigned > > IntRangePtr
std::vector< float > & fscales_
sparse tuple-weight scales
virtual void closeStructure()
bool is_open()
Checks if the file/pipe is open.
Implements Tropical Sparse tuple weight semiring, extending from openfst SparsePowerWeight class...
bool run(DataT &d)
Inherited method from ucam::util::TaskInterface. Loads the flower lattice into the data object...
Templated (hybrid) Interface for Task classes.
virtual void fillStructure(unsigned label, TupleArc32::Weight const &vtcost)
iszfstream & getline(iszfstream &izs, std::string &line)
const std::string grammarstorekey_
Key to store in the data object.
const std::string grammarloadkey_
Key with access to registry object for the grammar file name.
virtual void closeStructure()
const std::string kRulesToWeightsLoadalilats
bool exists(const std::string &source, const std::string &needle)
Convenience function to find out whether a needle exists in a text.
const std::string kRuleflowerlatticeStore
ucam::util::IntegerPatternAddress alilats_
Alignment lattices file names.
const std::string kRulesToWeightsLoadGrammar
#define USER_CHECK(exp, comment)
Tests whether exp is true. If not, comment is printed and program ends.
virtual void fillStructure(unsigned label, TupleArc32::Weight const &vtcost)
virtual void gatherRuleIds(unordered_set< unsigned > &idxrules, bool filterbyalilats, ucam::util::IntegerPatternAddress &alilats, ucam::util::RegistryPO const &rg)
T toNumber(const std::string &x)
Converts a string to a number of arbitrary type. Quits execution if the conversion fails.
void updateFilename(std::string const &filename)
virtual void gatherRuleIds(unordered_set< unsigned > &idxrules, bool filterbyalilats, ucam::util::IntegerPatternAddress &alilats, ucam::util::RegistryPO const &rg)
fst::VectorFst< TupleArc32 > flowerlattice_
Fst with the flower lattice itself.
bool load(const std::string &filename)
Load flower lattice from file.
bool checkGrammarFile(std::string const &filename)
LoadSparseWeightFlowerLatticeTask(const ucam::util::RegistryPO &rg, const unsigned offset=1, const std::string &alignmentlattices="", const std::string &grammarloadkey=HifstConstants::kRuleflowerlatticeLoad, const std::string &grammarstorekey=HifstConstants::kRuleflowerlatticeStore)
Constructor with registry object, offset and keys.
std::string previousfile_
Previous grammar file name.
const unsigned offset_
Number of language models.
virtual int eof()
Checks for end-of-file.
Wrapper stream class that reads pipes, text files or gzipped files.
ucam::util::IntegerPatternAddress grammar_
Grammar file name.
Implements a class that loads the grammar sparse-weight flower lattice and stores a pointer to it in the data object.
LoadSparseWeightsTask(const ucam::util::RegistryPO &rg, const unsigned offset=1, const std::string &alignmentlattices=HifstConstants::kRulesToWeightsLoadalilats, const std::string &grammarloadkey=HifstConstants::kRulesToWeightsLoadGrammar)