Cambridge SMT System
task.grammar.nonterminalhierarchy.hpp
Go to the documentation of this file.
1 // Licensed under the Apache License, Version 2.0 (the "License");
2 // you may not use these files except in compliance with the License.
3 // You may obtain a copy of the License at
4 //
5 // http://www.apache.org/licenses/LICENSE-2.0
6 //
7 // Unless required by applicable law or agreed to in writing, software
8 // distributed under the License is distributed on an "AS IS" BASIS,
9 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 // See the License for the specific language governing permissions and
11 // limitations under the License.
12 
13 // Copyright 2012 - Gonzalo Iglesias, AdriĆ  de Gispert, William Byrne
14 
15 #ifndef TASK_GRAMMAR_NONTERMINALHIERARCHY
16 #define TASK_GRAMMAR_NONTERMINALHIERARCHY
17 
25 namespace ucam {
26 namespace hifst {
27 
34  private:
36  unordered_set<std::string> h_;
38  unordered_set<std::string> nt_;
39 
41  bool s_;
42 
43  public:
47  inline void insertIdentityRule ( const std::string& identityrule ) {
48  h_.insert ( identityrule );
49  };
50 
53  inline void insertLHS ( const std::string& nt ) {
54  nt_.insert ( nt );
55  };
56 
64  bool operator() ( std::string& ntlist ) {
65  s_ = false;
66  ntlist = "";
67  unordered_set<std::string> haux = h_;
68  unordered_set<std::string> nt = nt_;
69  populate_nt ( nt );
70  if ( !nt.size() ) return true;
71  uint count = 0;
72  uint previoussize;
73  do {
74  previoussize = haux.size();
75  LINFO ( "round " << ++count << ": haux.size=" << previoussize );
76  ntlist += identify_highest_nt ( haux, nt );
77  if ( !s_ ) {
78  LERROR ( "Incorrect grammar? S is expected to have the highest hierarchy!" );
79  boost::algorithm::trim ( ntlist );
80  ucam::util::find_and_replace ( ntlist, " ", "," );
81  return false;
82  }
83  } while ( haux.size() < previoussize );
84  boost::algorithm::trim ( ntlist );
85  if ( haux.size() ) {
86  LINFO ( "Conflict!" );
87  LINFO ( "Partial list of ordered non-terminals:" << ntlist );
88  LINFO ( "Rules containing a cycle:" );
89  for ( unordered_set<std::string>::iterator itx = haux.begin();
90  itx != haux.end(); ++itx )
91  LINFO ( "===" << *itx );
92  return false;
93  }
94  ntlist = "S " + ntlist;
95  ucam::util::find_and_replace ( ntlist, " ", "," );
96  LINFO ( "DONE!" );
97  return true;
98  }
99 
100  private:
101 
103  void populate_nt ( unordered_set<std::string>& nt ) {
104  for ( unordered_set<std::string>::iterator itx = h_.begin(); itx != h_.end();
105  ++itx ) {
106  std::vector<std::string> aux;
107  boost::algorithm::split ( aux, *itx, boost::algorithm::is_any_of ( " " ) );
108  getFilteredNonTerminal ( aux[1] );
109  nt.insert ( aux[0] );
110  nt.insert ( aux[1] );
111  }
112  LINFO ( "Number of non-terminals=" << nt.size() );
113  }
114 
119  std::string identify_highest_nt ( unordered_set<std::string>& h,
120  unordered_set<std::string>& nt ) {
121  std::map<std::string, uint> ntorder;
122  int order = 0;
123  for ( std::unordered_set<std::string>::iterator itx = nt.begin();
124  itx != nt.end(); ++itx ) {
125  ntorder[*itx] = 0;
126  LINFO ( "nt:" << *itx );
127  }
128  for ( std::unordered_set<std::string>::iterator itx = h.begin();
129  itx != h.end(); ++itx ) {
130  std::vector<std::string> aux;
131  boost::algorithm::split ( aux, *itx, boost::algorithm::is_any_of ( " " ) );
132  getFilteredNonTerminal ( aux[1] );
133  ntorder[aux[1]]++;
134  }
135  LINFO ( "Extract highest nt" );
136  std::string ntstring;
137  std::unordered_set<std::string> nttodelete;
138  for ( std::map<std::string, uint>::iterator itx = ntorder.begin();
139  itx != ntorder.end(); ++itx ) {
140  if ( !itx->second ) {
141  if ( itx->first != "S" ) {
142  LINFO ( "Found " << itx->first );
143  ntstring += " " + itx->first;
144  } else {
145  LINFO ( "Found category S" );
146  s_ = true; //This one should be encountered first call and must hold highest hierarchy. Will be prepended at the end
147  }
148  nt.erase ( itx->first );
149  nttodelete.insert ( itx->first );
150  }
151  }
152  unordered_set<std::string> nttodelete2;
153  for ( unordered_set<std::string>::iterator itx = h.begin(); itx != h.end();
154  ++itx ) {
155  std::vector<std::string> aux;
156  boost::algorithm::split ( aux, *itx, boost::algorithm::is_any_of ( " " ) );
157  if ( nttodelete.find ( aux[0] ) != nttodelete.end() ) {
158  nttodelete2.insert ( *itx );
159  }
160  }
161  for ( unordered_set<std::string>::iterator itx = nttodelete2.begin();
162  itx != nttodelete2.end(); ++itx ) {
163  h.erase ( *itx );
164  }
165  LINFO ( "ntstring=" << ntstring << ", HSIZE=" << h.size() << "NTSIZE=" <<
166  nt.size() );
167  return ntstring;
168  }
169 
170  ZDISALLOW_COPY_AND_ASSIGN ( NonTerminalHierarchy );
171 
172 };
173 
174 }
175 } // end namespaces
176 
177 #endif
#define LINFO(msg)
bool operator()(std::string &ntlist)
Determines hierarchical list of non-terminals and flags whether there has been a cycle or not Example...
#define LERROR(msg)
void insertIdentityRule(const std::string &identityrule)
Method to store identity rules, i.e. S -> X X , etc.
This is a functor with additional methods to include relevant rules (i.e. identify SCFG rules...
void find_and_replace(std::string &haystack, const std::string &needle, const std::string &replace)
Definition: bleu.hpp:14
void getFilteredNonTerminal(std::string &word)
Return the filtered non-terminal name. For example, for the rule Z 3_XT2_5 XT2, getFilteredNonTermina...