15 #ifndef SENTENCESPECIFICGRAMMARTASK_HPP 16 #define SENTENCESPECIFICGRAMMARTASK_HPP 49 std::size_t oovindex_;
52 unsigned rule_id_offset_;
60 rule_id_offset_ ( 0 ),
64 deleteoovs_ ( rg.getBool (
66 LDEBUG (
"Constructor done!" );
73 bool run ( Data& d ) {
74 LDEBUG (
"HP size=" << d.hpinstances.size() );
75 USER_CHECK ( d.grammar,
"Big grammar data not available!" );
78 LDEBUG (
"HP size=" << d.hpinstances.size() );
79 d.stats->setTimeStart (
"ssgrammar-extract");
81 d.stats->setTimeEnd (
"ssgrammar-extract");
82 LDEBUG (
"HP size=" << d.hpinstances.size() );
85 if ( ssgrammarfile_ ( d.sidx ) !=
"" )
86 writessgrammar ( ssgrammarfile_ ( d.sidx ) );
87 LDEBUG (
"Finished run method" );
94 std::size_t createOOVRule (
const std::string& oov ) {
96 unsigned n = ucam::util::toNumber<unsigned> ( oov );
97 if ( n >=
OOVID && !deleteoovs_ )
98 ssgd_.
extrarules[ newindex ] =
"X " + oov +
" " + oov +
" 0";
100 ssgd_.
extrarules[ newindex ] =
"X " + oov +
" <oov> 0";
101 LINFO (
"New oov rule id=" << newindex <<
",rule=" <<
108 void writessgrammar (
const std::string& filename ) {
109 FORCELINFO (
"Saving ssgrammar to " << filename );
110 unordered_set<unsigned> seenrules;
115 for ( ssgrammar_firstelementmap_t::iterator itx2 = itx->second.begin();
116 itx2 != itx->second.end();
118 for (
unsigned k = 0; k < itx2->second.size(); ++k ) {
119 if ( seenrules.find ( itx2->second[k] ) != seenrules.end() )
continue;
120 seenrules.insert ( itx2->second[k] );
125 for ( ssgrammar_rulesmap_t::iterator itx =
129 for ( ssgrammar_firstelementmap_t::iterator itx2 = itx->second.begin();
130 itx2 != itx->second.end();
132 for (
unsigned k = 0; k < itx2->second.size(); ++k ) {
133 if ( seenrules.find ( itx2->second[k] ) != seenrules.end() )
continue;
134 seenrules.insert ( itx2->second[k] );
151 void get ( Data& d ) {
155 for ( ssgrammar_instancemap_t::iterator itx = hpinstances.begin();
156 itx != hpinstances.end(); ++itx ) {
157 LDEBUG (
"Search for [" << itx->first <<
"]" );
158 std::string needle = itx->first +
" ";
159 int pos = exists ( needle );
163 std::size_t ruleid = createOOVRule ( itx->first );
164 for (
unsigned k = 0; k < itx->second.size(); ++k ) {
165 unsigned& x = itx->second[k].first;
167 LDEBUG (
"***Adding (OOV) rule index " << ruleid <<
":" << ssgd_.
getRule (
171 LDEBUG (
"Pattern not found!" );
174 USER_CHECK ( pos >= 0,
"positive value required!" );
175 LDEBUG (
"Extracting indices for =>" << itx->first <<
",size of pattern=" <<
177 ", number of instances at which this was found: (x,span): " <<
178 itx->second.size() );
180 unordered_set<unsigned> seenx;
181 if (
getSize ( itx->first ) == 1 ) {
182 for (
unsigned k = 0; k < itx->second.size(); ++k ) {
183 unsigned& x = itx->second[k].first;
184 if ( seenx.find ( x ) != seenx.end() ) {
185 LDEBUG (
"Repeated:" << itx->first <<
" at x=" << x );
189 LDEBUG (
"*calling addRuleIndicesRHS (1) at x=" << x );
194 for (
unsigned k = 0; k < itx->second.size(); ++k ) {
195 unsigned& x = itx->second[k].first;
196 if ( seenx.find ( x ) != seenx.end() ) {
197 LDEBUG (
"Repeated:" << itx->first <<
" at x=" << x );
201 LDEBUG (
"*calling addRuleIndicesRHS (2) at x=" << x );
206 LDEBUG (
"Finished extracting indices for " << itx->first );
208 LDEBUG (
"Finished get method" );
220 void addRuleIndicesRHS (
const std::string& needle
222 ,
const unordered_set<std::string>& vcb ) {
223 USER_CHECK ( pos >= 0,
"pos needs to be positive" );
224 LDEBUG (
"**Adding indices for rules" );
226 for (
unsigned j = pos; j < g.
sizeofvpos ; ++j ) {
228 needle.size() ) )
break;
230 LDEBUG (
"skipping rule (rejected by vcb):" << g.
getRule ( j ) );
235 LDEBUG (
"***Adding rule #" << j <<
":" << g.
getRule ( j ) );
236 rules[firstelement].push_back ( j );
238 if ( pos == 0 )
return;
239 for (
int j = pos - 1; j >= 0 ; --j ) {
241 needle.size() ) )
break;
245 LDEBUG (
"***Adding rules # " << j <<
":" << g.
getRule ( j ) );
246 rules[firstelement].push_back ( j );
255 inline int exists (
const std::string& needle ) {
256 USER_CHECK ( needle.at ( needle.size() - 1 ) ==
' ',
257 "This method requires a space appended to que queried string" );
259 int oldmid = -1, mid = 0;
263 while ( first <= last ) {
264 mid = ( first + last ) / 2;
265 if ( mid == oldmid )
break;
268 if ( res < 0 ) first = mid + 1;
269 else if ( res > 0 ) last = mid - 1;
270 else first = last + 1;
277 LDEBUG (
"Could not find: [" << needle <<
"]" );
Wrapper stream class that writes to pipes, text files or gzipped files.
Class that expands a wildcard into its actual value. This is useful e.g. for filenames ranging severa...
ssgrammar_rulesmap_t rulesWithRhsSpan1
bool run(Data &d)
Run method: given a grammar and instantiated patterns, creates and returns the hashes ...
const std::string getRule(std::size_t idx)
Returns rule corresponding to index idx.
CompareTool * ct
Pointer to a Comparison object, assumed no ownership.
Struct containing grammar rules.
posindex * vpos
Sorted Indices.
Templated (hybrid) Interface for Task classes.
const std::string getRHSSource(std::size_t idx) const
Gets right-hand-side source for a rule using rule index idx.
SentenceSpecificGrammarTask(const ucam::util::RegistryPO &rg)
Constructor.
const std::string kSsgrammarAddoovsSourcedeletions
const std::string kSsgrammarAddoovsEnable
const bool isAcceptedByVocabulary(const std::size_t idx, const unordered_set< std::string > &vcb) const
Determines whether a particular rule is allowed within a vocabulary, i.e. all target words of the rul...
unordered_map< std::string, std::vector< pair< uint, uint > > > ssgrammar_instancemap_t
std::string filecontents
The whole grammar.
const std::string getRule(std::size_t idx) const
Gets a rule indexed by idx. Rule format: LHS RHSSource RHSTarget weight.
bool phraseIsTerminalWord(const std::string &phrase)
This class uses instantiated patterns to analyze the grammar and deliver two hashes providing candida...
ssgrammar_rulesmap_t rulesWithRhsSpan2OrMore
cells containing potentially applicable rules with two or more elements
unordered_map< std::string, ssgrammar_listofrules_t > ssgrammar_firstelementmap_t
#define USER_CHECK(exp, comment)
Tests whether exp is true. If not, comment is printed and program ends.
unordered_map< std::size_t, std::string > extrarules
const GrammarData * grammar
Pointer to the original grammar data (no ownership)
const std::string kSsgrammarStore
const uint getSize(const std::string &rhs)
A generic element counter that can be applied to any string. It is intended for use with either source or...
Structure for sentence-specific grammar. Rules will be queried by CYK per position and number of eleme...
std::size_t sizeofvpos
Number of rules.
void getFilteredNonTerminal(std::string &word)
Return the filtered non-terminal name. For example, for the rule Z 3_XT2_5 XT2, getFilteredNonTermina...
void close()
Closes the file.