Cambridge SMT System
FeatureRegistry.java
Go to the documentation of this file.
1 /*******************************************************************************
2  * Licensed under the Apache License, Version 2.0 (the "License");
3  * you may not use these files except in compliance with the License.
4  * You may obtain a copy of the License at
5  *
6  * http://www.apache.org/licenses/LICENSE-2.0
7  *
8  * Unless required by applicable law or agreed to in writing, software
9  * distributed under the License is distributed on an "AS IS" BASIS,
10  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11  * See the License for the specific language governing permissions and
12  * limitations under the License.
13  *
14  *******************************************************************************/
15 
16 package uk.ac.cam.eng.rule.features;
17 
18 import java.util.ArrayList;
19 import java.util.Arrays;
20 import java.util.Collections;
21 import java.util.HashMap;
22 import java.util.List;
23 import java.util.Map;
24 
25 import org.apache.hadoop.io.IntWritable;
26 
27 import uk.ac.cam.eng.extraction.Rule;
32 
33 public class FeatureRegistry {
34 
35  // Used for provenance features which don't have a probability
36  private final static double DEFAULT_S2T_PHRASE_LOG_PROB = -4.7;
37 
38  private final static double DEFAULT_T2S_PHRASE_LOG_PROB = -7;
39 
40  private final static double DEFAULT_LEX_VALUE = -40;
41 
42  private final List<Feature> allFeatures;
43 
44  private final Map<Feature, int[]> indexMappings = new HashMap<>();
45 
46  private final int noOfProvs;
47 
48  private final double[] zeroNonProv = new double[] { 0 };
49 
50  private final double[] zeroProv;
51 
52  private final Map<Integer, Double> defaultFeatures;
53 
54  private final Map<Integer, Double> defaultOOVFeatures;
55 
56  private final Map<Integer, Double> defaultPassThroughFeatures;
57 
58  private final Map<Integer, Double> defaultDeletionFeatures;
59 
60  private final Map<Integer, Double> defaultGlueFeatures;
61 
62  private final Map<Integer, Double> defaultDeleteGlueFeatures;
63 
64  private final Map<Integer, Double> defaultGlueStartOrEndFeatures;
65 
66  private final boolean hasLexicalFeatures;
67 
68  public FeatureRegistry(String featureString, String provenanceString) {
69  String[] featureSplit = featureString.split(",");
70  noOfProvs = provenanceString.split(",").length;
71  List<Feature> features = new ArrayList<>();
72  int indexCounter = 1; // 1-based
73  boolean lexFeatures = false;
74  for (String fString : featureSplit) {
75  int[] mappings;
76  Feature f = Feature.findFromConf(fString);
77  lexFeatures |= Feature.ComputeLocation.LEXICAL_SERVER == f.computed;
78  features.add(f);
79  if (Feature.Scope.PROVENANCE == f.scope) {
80  mappings = new int[noOfProvs];
81  for (int i = 0; i < noOfProvs; ++i) {
82  mappings[i] = indexCounter++;
83  }
84  } else {
85  mappings = new int[] { indexCounter++ };
86  }
87  indexMappings.put(f, mappings);
88  }
89  allFeatures = Collections.unmodifiableList(features);
90  zeroProv = new double[noOfProvs];
91  hasLexicalFeatures = lexFeatures;
92  Arrays.fill(zeroProv, 0.0);
93  defaultFeatures = createDefaultData();
94  defaultOOVFeatures = createOOVDefaultData();
95  defaultPassThroughFeatures = createPassThroughDefaultData();
96  defaultDeletionFeatures = createDeletionDefaultData();
97  defaultGlueFeatures = createGlueDefaultData();
98  defaultDeleteGlueFeatures = createDeleteGlueDefaultData();
99  defaultGlueStartOrEndFeatures = createGlueStartOrEndDefaultData();
100  }
101 
102  public int[] getFeatureIndices(Feature... features) {
103  List<int[]> mappings = new ArrayList<int[]>(features.length);
104  int totalSize = 0;
105  for (Feature feature : features) {
106  if (!indexMappings.containsKey(feature)) {
107  throw new IllegalArgumentException("Feature "
108  + feature.getConfName() + " is not in the registry");
109  }
110  int[] mapping = indexMappings.get(feature);
111  mappings.add(mapping);
112  totalSize += mapping.length;
113  }
114  int[] result = new int[totalSize];
115  int counter = 0;
116  for (int[] mapping : mappings) {
117  for (int index : mapping) {
118  result[counter++] = index;
119  }
120  }
121  return result;
122  }
123 
124  public boolean containsFeature(Feature f) {
125  return allFeatures.contains(f);
126  }
127 
128  public List<Feature> getFeatures() {
129  return allFeatures;
130  }
131 
137  public int getNoOfProvs() {
138  return noOfProvs;
139  }
140 
150  public double[] getZeros(Feature f) {
151  if (Feature.Scope.PROVENANCE == f.scope) {
152  return zeroProv;
153  } else {
154  return zeroNonProv;
155  }
156  }
157 
158  private void addDefault(Feature f, Map<Integer, Double> vals, double val) {
159  if (allFeatures.contains(f)) {
160  int[] mappings = getFeatureIndices(f);
161  for (int mapping : mappings) {
162  vals.put(mapping, val);
163  }
164  }
165  }
166 
172  private Map<Integer, Double> createDefaultData() {
173  // Provenance phrase probabilities need default values
174  Map<Integer, Double> defaultFeatures = new HashMap<Integer, Double>();
176  defaultFeatures, DEFAULT_S2T_PHRASE_LOG_PROB);
178  defaultFeatures, DEFAULT_T2S_PHRASE_LOG_PROB);
179  addDefault(Feature.RULE_INSERTION_PENALTY, defaultFeatures, 1d);
180  return defaultFeatures;
181  }
182 
188  private Map<Integer, Double> createPassThroughDefaultData() {
189  // We need to add default values for lexical probs
190  Map<Integer, Double> defaultFeatures = new HashMap<Integer, Double>();
191  addDefault(Feature.SOURCE2TARGET_LEXICAL_PROBABILITY, defaultFeatures,
192  DEFAULT_LEX_VALUE);
193  addDefault(Feature.TARGET2SOURCE_LEXICAL_PROBABILITY, defaultFeatures,
194  DEFAULT_LEX_VALUE);
196  defaultFeatures, DEFAULT_LEX_VALUE);
198  defaultFeatures, DEFAULT_LEX_VALUE);
199  return defaultFeatures;
200  }
201 
202  private Map<Integer, Double> createOOVDefaultData() {
203  Map<Integer, Double> defaultFeatures = new HashMap<Integer, Double>();
204  addDefault(Feature.INSERT_SCALE, defaultFeatures, -1d);
205  return defaultFeatures;
206  }
207 
208  private Map<Integer, Double> createDeletionDefaultData() {
209  Map<Integer, Double> defaultFeatures = new HashMap<Integer, Double>();
210  addDefault(Feature.INSERT_SCALE, defaultFeatures, -1d);
211  return defaultFeatures;
212  }
213 
214  private Map<Integer, Double> createGlueDefaultData() {
215  Map<Integer, Double> defaultFeatures = new HashMap<Integer, Double>();
216  addDefault(Feature.GLUE_RULE, defaultFeatures, 1d);
217  return defaultFeatures;
218  }
219 
220  private Map<Integer, Double> createDeleteGlueDefaultData() {
221  Map<Integer, Double> defaultFeatures = new HashMap<Integer, Double>();
222  addDefault(Feature.GLUE_RULE, defaultFeatures, 2d);
223  return defaultFeatures;
224  }
225 
226  private Map<Integer, Double> createGlueStartOrEndDefaultData() {
227  Map<Integer, Double> defaultFeatures = new HashMap<Integer, Double>();
228  addDefault(Feature.RULE_COUNT_GREATER_THAN_2, defaultFeatures, 1d);
229  addDefault(Feature.RULE_INSERTION_PENALTY, defaultFeatures, 1d);
230  addDefault(Feature.WORD_INSERTION_PENALTY, defaultFeatures, 1d);
231  return defaultFeatures;
232  }
233 
234  public Map<Integer, Double> getDefaultFeatures() {
235  return new HashMap<Integer, Double>(defaultFeatures);
236  }
237 
238  public Map<Integer, Double> getDefaultOOVFeatures() {
239  return new HashMap<Integer, Double>(defaultOOVFeatures);
240  }
241 
242  public Map<Integer, Double> getDefaultDeletionFeatures() {
243  return new HashMap<Integer, Double>(defaultDeletionFeatures);
244  }
245 
246  public Map<Integer, Double> getDefaultGlueFeatures() {
247  return new HashMap<Integer, Double>(defaultGlueFeatures);
248  }
249 
250  public Map<Integer, Double> getDefaultDeleteGlueFeatures() {
251  return new HashMap<Integer, Double>(defaultDeleteGlueFeatures);
252  }
253 
254  public Map<Integer, Double> getDefaultGlueStartOrEndFeatures() {
255  return new HashMap<Integer, Double>(defaultGlueStartOrEndFeatures);
256  }
257 
258  public Map<Integer, Double> getDefaultPassThroughRuleFeatures() {
259  return new HashMap<Integer, Double>(defaultPassThroughFeatures);
260  }
261 
262  private static final ProvenanceProbMap checkedGetProbs(Feature f,
263  FeatureMap features) {
264  ProvenanceProbMap probs = features.get(f);
265  if (probs == null) {
266  throw new RuntimeException("No data for feature " + f.getConfName());
267  }
268  return probs;
269  }
270 
279  public Map<Integer, Double> createFoundPassThroughRuleFeatures(
280  FeatureMap features) {
281  Map<Integer, Double> defaults = getDefaultPassThroughRuleFeatures();
282  allFeatures
283  .stream()
284  .filter((f) -> Feature.ComputeLocation.LEXICAL_SERVER == f.computed)
285  .forEach(
286  (f) -> {
287  int[] mappings = indexMappings.get(f);
288  ProvenanceProbMap probs = checkedGetProbs(f,
289  features);
290  for (int index : mappings) {
291  IntWritable indexIntW = IntWritableCache
292  .createIntWritable(index);
293  double ffVal = 0.0;
294  if (probs.containsKey(indexIntW)) {
295  ffVal = probs.get(indexIntW).get();
296  }
297  if (ffVal != 0.0) {
298  defaults.put(index, ffVal);
299  }
300  }
301  });
302  return defaults;
303  }
304 
305  private static void setVal(int mapping, double val,
306  Map<Integer, Double> features) {
307  // Default val in sparse tuple arc is 0. Delete default val.
308  if (val == 0.0) {
309  features.remove(mapping);
310  } else {
311  features.put(mapping, val);
312  }
313  }
314 
315  public Map<Integer, Double> processFeatures(Rule rule, RuleData data) {
316  Map<Integer, Double> processedFeatures = getDefaultFeatures();
317  for (Feature f : allFeatures) {
318  int[] mappings = indexMappings.get(f);
319  if (Feature.ComputeLocation.RETRIEVAL == f.computed) {
320  double[] results = FeatureFunctionRegistry.computeFeature(f,
321  rule, data, this);
322  if (results == null) {
323  continue;
324  }
325  for (int i = 0; i < results.length; ++i) {
326  setVal(mappings[i], results[i], processedFeatures);
327  }
328  } else {
329  ProvenanceProbMap probs = checkedGetProbs(f, data.getFeatures());
330  for (int i = 0; i < mappings.length; ++i) {
331  // Provenances are 1-indexed with the 0th element reserved
332  // for the global
333  // scope.
334  int index = Feature.Scope.PROVENANCE == f.scope ? i + 1 : i;
335  if (probs.containsKey(IntWritableCache
336  .createIntWritable(index))) {
337  double ffVal = probs.get(
339  .get();
340  setVal(mappings[i], ffVal, processedFeatures);
341  }
342  }
343  }
344  }
345  return processedFeatures;
346  }
347 
348  public boolean hasLexicalFeatures() {
349  return hasLexicalFeatures;
350  }
351 
352 }
Map< Integer, Double > createFoundPassThroughRuleFeatures(FeatureMap features)
FeatureRegistry(String featureString, String provenanceString)
Map< Integer, Double > getDefaultPassThroughRuleFeatures()
Map< Integer, Double > getDefaultGlueStartOrEndFeatures()
static Feature findFromConf(String name)
Definition: Feature.java:74
Map< Integer, Double > processFeatures(Rule rule, RuleData data)
Map< Integer, Double > getDefaultDeleteGlueFeatures()
Map< Integer, Double > getDefaultDeletionFeatures()