15 #ifndef PATTERNTOINSTANCESTASK_HPP 16 #define PATTERNTOINSTANCESTASK_HPP 54 maxspan_ ( rg.get<unsigned> (
"patternstoinstances.maxspan" ) ),
55 gapmaxspan_ ( rg.get<unsigned> (
"patternstoinstances.gapmaxspan" ) ),
56 instancefile_ ( rg.get<std::string> (
"patternstoinstances.store" ) ) {
/**
 * \brief Instantiates the grammar patterns over the current sentence and
 * writes the resulting instances to file.
 *
 * Reads d.grammar->patterns and d.sentence, fills d.hpinstances through
 * instantiatePatternsHash(), and then dumps that hash with writeHashToFile().
 *
 * \param d Data object holding the sentence, the grammar patterns, the stats
 *          object and the instance hash that this task fills in.
 * \return NOTE(review): the return statement is not visible in this excerpt;
 *         confirm the value against the full source.
 */
65 bool run ( Data& d ) {
66 LINFO (
"instancing " << d.grammar->patterns.size() <<
67 " patterns over this sentence:" << d.sentence );
// Only the instantiation step is bracketed by the "instantiate-patterns"
// timer; the file dump below happens after the timer has been stopped.
68 d.stats->setTimeStart (
"instantiate-patterns");
69 instantiatePatternsHash ( d );
70 d.stats->setTimeEnd (
"instantiate-patterns");
71 writeHashToFile ( d );
72 LINFO (
"Finished!" );
/**
 * \brief Builds d.hpinstances: for every grammar pattern and every start
 * position in the sentence, stores each instance of the pattern found there.
 *
 * The sentence is split on single spaces and each pattern on "_". Instances
 * are produced by buildNextElementFromPattern(); each instance is joined back
 * with "_" and used as the hash key, and the pair
 * (start position j, spattern.size() - 1) is appended to that key's vector.
 *
 * \param d Data object; d.sentence and d.grammar->patterns are read,
 *          d.hpinstances is cleared and refilled.
 */
88 void instantiatePatternsHash ( Data& d ) {
// Start from an empty hash so nothing already stored in it survives this call.
89 d.hpinstances.clear();
90 LINFO (
"maxspan_=" << maxspan_ <<
",gapmaxspan=" << gapmaxspan_ );
// Tokenize the sentence on " ".
// NOTE(review): no token-compress flag is passed, so consecutive spaces
// presumably yield empty tokens -- confirm the input is single-spaced.
91 std::vector<std::string> ss;
92 boost::algorithm::split ( ss, d.sentence, boost::algorithm::is_any_of (
" " ) );
93 const unordered_set<std::string>& patterns = d.grammar->patterns;
94 for ( unordered_set<std::string>::const_iterator itx = patterns.begin();
95 itx != patterns.end(); ++itx ) {
96 LDEBUG (
"pattern:" << *itx );
// Pattern elements are separated by "_".
97 std::vector<std::string> spattern;
98 boost::algorithm::split ( spattern, *itx, boost::algorithm::is_any_of (
"_" ) );
// Try the pattern at every start position j of the sentence.
99 for (
unsigned j = 0; j < ss.size(); ++j ) {
// Instances found for this (pattern, start position); rebuilt each iteration.
100 std::vector< std::vector<std::string> > pinstances;
101 LDEBUG (
"starting word:" << ss[j] );
// Only instantiate when the pattern has at most maxspan_ elements and, with
// no gap widening, still fits between position j and the end of the sentence.
104 if ( spattern.size() <= maxspan_ && j + spattern.size() - 1 < ss.size() ) {
// Seed with one empty instance; the recursive builder appends to the last
// entry of pinstances.
105 std::vector<std::string> empty;
106 pinstances.push_back ( empty );
108 buildNextElementFromPattern ( spattern, ss, pinstances, j, 0 );
// Record every instance that was built. The stored pair is
// (j, spattern.size() - 1): the second value comes from the pattern length
// and does not include any widening of the gaps.
110 for (
unsigned k = 0; k < pinstances.size(); ++k ) {
111 LDEBUG (
"pattern:" << *itx <<
":" <<
"Inserting in " <<
112 boost::algorithm::join ( pinstances[k],
113 "_" ) <<
"values=(" << j <<
"," << spattern.size() - 1 );
114 d.hpinstances[boost::algorithm::join ( pinstances[k],
115 "_" )].push_back ( pair<unsigned, unsigned> ( j, spattern.size() - 1 ) );
/**
 * \brief Recursively expands one pattern element at a time into pinstances.
 *
 * The element spattern[pp] is expected to be either "w" or "X"; anything else
 * reaches the USER_CHECK with "Incorrect pattern!". The sentence word
 * associated with the current element is ss[ps + pp + gaphistory].
 *
 * \param spattern   Pattern split into its elements.
 * \param ss         Sentence split into words.
 * \param pinstances Instances built so far; the last entry is the one being
 *                   extended.
 * \param gaphistory Offset added on top of ps + pp when indexing the
 *                   sentence; each "X" step recurses with gaphistory + k - 1.
 *                   Defaults to 0.
 *
 * NOTE(review): the declarations of ps and pp are not visible in this
 * excerpt. From the call sites they are the start position in the sentence
 * and the index of the current pattern element -- confirm their types.
 */
131 void buildNextElementFromPattern ( std::vector<std::string>& spattern,
132 std::vector<std::string>& ss,
133 std::vector< std::vector<std::string> >& pinstances,
136 unsigned gaphistory = 0 ) {
137 LDEBUG (
"startingword:" << ss[ps] <<
",thisword:" << ss[ps + pp +
138 gaphistory ] <<
",thiselement:" << spattern[pp] <<
",ps=" << ps <<
",pp=" << pp
139 <<
",spatternsize=" << spattern.size() <<
",gaphistory=" << gaphistory );
// "w" element: uses the sentence word at ps + pp + gaphistory.
// NOTE(review): the start of that statement is not visible here; presumably
// the word is appended to the last instance -- confirm.
// Recursion continues only while pattern elements remain and the rest of the
// pattern, shifted by gaphistory, still fits inside the sentence.
140 if ( spattern[pp] ==
"w" ) {
142 ss[ps + pp + gaphistory] ) );
143 if ( ( pp + 1 < spattern.size() )
144 && ( ps + spattern.size() + gaphistory <= ss.size() ) )
145 buildNextElementFromPattern ( spattern, ss, pinstances, ps, pp + 1,
147 }
else if ( spattern[pp] ==
"X" ) {
// "X" element: append a literal "X" to the current instance, then keep a
// copy of it taken before any further elements are appended.
148 LDEBUG (
"X,with gapmaxspan=" << gapmaxspan_ );
149 pinstances[pinstances.size() - 1].push_back (
"X" );
150 std::vector<std::string> replicate = pinstances[pinstances.size() - 1];
// Iterate over k starting at 1; the loop runs only while more pattern
// elements follow, the widened pattern stays inside the sentence, and its
// total length stays within maxspan_.
// NOTE(review): the first loop condition and the increment are not visible;
// presumably k is bounded by gapmaxspan_ -- confirm.
151 for (
unsigned k = 1;
153 && ( pp + 1 < spattern.size() )
154 && ( ps + spattern.size() - 1 + gaphistory + k - 1 < ss.size() )
155 && ( spattern.size() + gaphistory + k - 1 <= maxspan_ );
157 LDEBUG (
"GAPSPAN=" << k );
// For k > 1 a new entry is pushed onto pinstances before recursing
// (NOTE(review): the pushed value is not visible; presumably `replicate`).
// The recursion shifts subsequent words by k - 1 extra positions.
158 if ( k > 1 ) pinstances.push_back (
160 buildNextElementFromPattern ( spattern, ss, pinstances, ps, pp + 1,
161 gaphistory + k - 1 );
// Any element other than "X" or "w" is rejected here.
165 USER_CHECK ( spattern[pp] ==
"X" || spattern[pp] ==
"w",
"Incorrect pattern!" );
/**
 * \brief Writes the contents of d.hpinstances to the output stream `o`.
 *
 * For every entry the key is written followed by ":", then every stored pair
 * as "first,second;".
 *
 * \param d Data object; d.sidx is passed to instancefile_ to obtain the file
 *          name, and d.hpinstances is the hash being written.
 *
 * NOTE(review): the construction of the stream `o` and whatever terminates
 * each record are not visible in this excerpt -- confirm against the full
 * source.
 */
174 void writeHashToFile ( Data& d ) {
// instancefile_ is invoked with d.sidx and its result is used as the file name.
175 std::string file = instancefile_ ( d.sidx );
177 LINFO (
"file to output:" << file );
179 for ( unordered_map<std::string, std::vector <pair <unsigned, unsigned> > >::iterator
180 itx = d.hpinstances.begin(); itx != d.hpinstances.end(); ++itx ) {
181 o << itx->first <<
":" ;
// One "first,second;" item per pair stored under this key.
182 for (
unsigned k = 0; k < itx->second.size(); ++k )
183 o << itx->second[k].first <<
"," << itx->second[k].second <<
";";
192 ,
unsigned gapmaxspan_
193 ,
const std::string& instancefile_ ) :
194 maxspan_ ( maxspan_ ),
195 gapmaxspan_ ( gapmaxspan_ ),
196 instancefile_ ( instancefile_ ) {
Wrapper stream class that writes to pipes, text files or gzipped files.
Class that expands a wildcard into its actual value. This is useful, e.g., for filenames ranging severa...
std::string toString(const T &x, uint pr=2)
Converts an arbitrary type to a string. Converts integers, floats and doubles to strings. Quits execution if ...
Converts patterns to instanced patterns.
PatternsToInstancesTask(const ucam::util::RegistryPO &rg)
Constructor.
Templated (hybrid) Interface for Task classes.
#define USER_CHECK(exp, comment)
Tests whether exp is true. If not, comment is printed and program ends.
bool run(Data &d)
Runs this task and modifies Data object inserting the instanced sentence-specific source patterns we ...
~PatternsToInstancesTask()
Destructor.
void close()
Closes the file.