Cambridge SMT System
data-main.hifst.hpp
Go to the documentation of this file.
1 // Licensed under the Apache License, Version 2.0 (the "License");
2 // you may not use these files except in compliance with the License.
3 // You may obtain a copy of the License at
4 //
5 // http://www.apache.org/licenses/LICENSE-2.0
6 //
7 // Unless required by applicable law or agreed to in writing, software
8 // distributed under the License is distributed on an "AS IS" BASIS,
9 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 // See the License for the specific language governing permissions and
11 // limitations under the License.
12 
13 // Copyright 2012 - Gonzalo Iglesias, Adrià de Gispert, William Byrne
14 
15 #ifndef TASKDATA_HPP
16 #define TASKDATA_HPP
17 
24 namespace ucam {
25 namespace hifst {
26 
31 template <class ArcT = fst::LexicographicArc< fst::StdArc::Weight, fst::StdArc::Weight> >
32 class HifstTaskData {
33  typedef ucam::util::WordMapper WordMapper;
34  typedef ucam::fsttools::StatsData StatsData;
35  typedef typename ucam::fsttools::KenLMData KenLMData;
36 
37  public:
39  sidx ( 0 ),
40  grammar ( NULL ),
41  ssgd ( NULL ),
42  cykdata ( NULL ),
43  numlocallm ( 0 ),
44  stats ( new StatsData ),
45  translation ( NULL ) {
46  };
47 
49  unsigned sidx;
51  const GrammarData *grammar;
52 
54  unordered_map<std::size_t, std::string> oovwmap;
55 
57  std::string originalsentence;
58  std::string tokenizedsentence;
59  std::string sentence;
60 
62  std::vector<std::string> pinstances;
63 
67  unordered_map<std::string, std::vector< pair <unsigned, unsigned> > > hpinstances;
68 
71 
73  unordered_set<std::string> tvcb;
74 
77 
78  //Filters, e.g. translation lattice substring for alignment or others
79  //\todo delete and add in fsts ?
80  std::vector< fst::VectorFst<ArcT> *> filters;
81 
83  // unordered_map<string, fst::VectorFst<ArcT> * > fsts;
84  unordered_map<std::string, void * > fsts;
85 
86  inline fst::VectorFst<ArcT> *getFst(std::string const &key) {
87  unordered_map<std::string, void * >::iterator itx
88  = fsts.find ( key);
89  if ( itx == fsts.end() ) {
90  LWARN("Empty lattice key=" << key);
91  return NULL;
92  }
93  return reinterpret_cast<fst::VectorFst<ArcT> *>(itx->second);
94  }
95 
96 
97 
99  unordered_map<std::string, std::vector <const KenLMData *> > klm;
101  unsigned numlocallm;
102 
104  boost::shared_ptr<StatsData> stats;
105 
107  std::string *translation;
108 
110  unordered_set<std::string> *recasingvcblm;
111 
114 
116  unordered_map<std::string, WordMapper *> wm;
117 
118 };
119 
120 
121 
122 }} // end namespaces
123 
124 #endif
125 
Contains data for statistics, i.e. allows timing actions and methods called during execution...
Definition: data.stats.hpp:88
unordered_map< std::size_t, std::string > oovwmap
Contains oovs.
Data structure containing all cyk-related information.
unordered_map< uint, std::string > grammar_inversecategories_t
std::vector< fst::VectorFst< ArcT > * > filters
unordered_map< std::string, std::vector< const KenLMData * > > klm
Collections of language models accessed by keys (e.g. in translation we need a bunch for hifst and on...
boost::shared_ptr< ucam::fsttools::StatsData > stats
To collect statistics across the whole pipeline.
Struct containing grammar rules.
unordered_set< std::string > * recasingvcblm
mixed-case vocabulary of the recasing unigram language model
unordered_map< std::string, std::vector< pair< unsigned, unsigned > > > hpinstances
std::string * translation
Translated sentence will be stored here.
std::string originalsentence
source sentence
#define LWARN(msg)
Language Model data structure.
Definition: data.lm.hpp:35
unordered_map< std::string, WordMapper * > wm
Wordmap/Integer map objects.
unsigned sidx
Sentence index.
boost::shared_ptr< StatsData > stats
To collect statistics across the whole pipeline.
const GrammarData * grammar
Contains translation grammar.
grammar_inversecategories_t vcat
This information used for stats.
Loads efficiently a wordmap file and provides methods to map word-to-integer or integer-to-word. To avoid memory footprint issues, hashing the wordmap entries is avoided.
Definition: wordmapper.hpp:63
unordered_map< std::string, void * > fsts
Pointers to lattices (e.g. translation lattice, lmbr, etc) , and related, accessed by unique keys...
std::vector< std::string > pinstances
Pattern instances.
unsigned numlocallm
Number of local language models used in hifst.
Structure for sentence-specific grammar Rules will be queried by cyk per position and number of eleme...
unordered_set< std::string > tvcb
Target vocabulary.
SentenceSpecificGrammarData * ssgd
Sentence-specific grammar information – hashes to rule indices.
fst::VectorFst< ArcT > * getFst(std::string const &key)
Definition: bleu.hpp:14
CYKdata * cykdata
cyk data structures