Cambridge SMT System
HFileRuleQuery.java
/*******************************************************************************
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use these files except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * Copyright 2014 - Juan Pino, Aurelien Waite, William Byrne
 *******************************************************************************/
package uk.ac.cam.eng.rule.retrieval;

import java.io.BufferedWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import org.apache.commons.lang.time.StopWatch;
import org.apache.hadoop.hbase.util.BloomFilter;
import org.apache.hadoop.io.DataOutputBuffer;

import uk.ac.cam.eng.extraction.Rule;
import uk.ac.cam.eng.extraction.RuleString;
import uk.ac.cam.eng.extraction.Symbol;
// A few project imports are elided in the original listing; RuleData is
// assumed here to come from the Hadoop extraction datatypes package.
import uk.ac.cam.eng.extraction.hadoop.datatypes.RuleData;
import uk.ac.cam.eng.util.CLI;
import uk.ac.cam.eng.util.Pair;

/**
 * Runnable query task: looks up a sorted batch of source sides in a rule
 * HFile, filters the retrieved rules, and writes them out with their features.
 */
class HFileRuleQuery implements Runnable {

    private final HFileRuleReader reader;

    private final BloomFilter bf;

    private final BufferedWriter out;

    private final Collection<RuleString> query;

    private final RuleRetriever retriever;

    private final TTableClient s2tClient;

    private final TTableClient t2sClient;

    private final DataOutputBuffer tempOut = new DataOutputBuffer();

    private final Map<Rule, Pair<EnumRuleType, RuleData>> queue = new HashMap<>();

    private static final int BATCH_SIZE = 1000;

    public HFileRuleQuery(HFileRuleReader reader, BloomFilter bf,
            BufferedWriter out, Collection<RuleString> query,
            RuleRetriever retriever, CLI.ServerParams params) {
        this.reader = reader;
        this.bf = bf;
        this.out = out;
        this.query = query;
        this.retriever = retriever;
        this.s2tClient = new TTableClient();
        this.t2sClient = new TTableClient();
        // Translation table clients are only needed when the feature
        // registry includes lexical features.
        if (retriever.fReg.hasLexicalFeatures()) {
            s2tClient.setup(params, retriever.fReg.getNoOfProvs(), true);
            t2sClient.setup(params, retriever.fReg.getNoOfProvs(), false);
        }
    }

    private void drainQueue() throws IOException {
        // Look up lexical (source-to-target and target-to-source) scores
        // for the whole batch before anything is written out.
        if (retriever.fReg.hasLexicalFeatures()) {
            s2tClient.queryRules(queue);
            t2sClient.queryRules(queue);
        }
        for (Entry<Rule, Pair<EnumRuleType, RuleData>> e : queue.entrySet()) {
            Rule rule = e.getKey();
            EnumRuleType type = e.getValue().getFirst();
            RuleData rawFeatures = e.getValue().getSecond();
            if (retriever.passThroughRules.contains(rule)) {
                Rule asciiRule = new Rule(rule);
                synchronized (retriever.foundPassThroughRules) {
                    retriever.foundPassThroughRules.add(asciiRule);
                }
                retriever.writeRule(type, rule,
                        retriever.fReg.createFoundPassThroughRuleFeatures(
                                rawFeatures.getFeatures()), out);
            } else {
                Map<Integer, Double> processed = retriever.fReg
                        .processFeatures(rule, rawFeatures);
                retriever.writeRule(type, rule, processed, out);
            }
        }
        queue.clear();
    }

    @SuppressWarnings("unchecked")
    @Override
    public void run() {
        List<RuleString> sortedQuery = new ArrayList<>(query);
        query.clear();
        StopWatch stopWatch = new StopWatch();
        System.out.println("Sorting query");
        stopWatch.start();
        // Sort the sources so the HFile is read in key order.
        Collections.sort(sortedQuery, new MergeComparator());
        System.out.printf("Query sort took %d seconds\n",
                stopWatch.getTime() / 1000);
        stopWatch.reset();
        stopWatch.start();
        try {
            for (RuleString source : sortedQuery) {
                // Cheap Bloom filter check before seeking in the HFile.
                tempOut.reset();
                source.write(tempOut);
                if (!bf.contains(tempOut.getData(), 0, tempOut.getLength(),
                        null)) {
                    continue;
                }
                if (reader.seek(source)) {
                    if (retriever.testVocab.contains(source)) {
                        synchronized (retriever.foundTestVocab) {
                            retriever.foundTestVocab.add(source);
                        }
                    }
                    List<Pair<Rule, RuleData>> rules = new ArrayList<>();
                    for (Pair<Rule, RuleData> entry : reader
                            .getRulesForSource()) {
                        rules.add(Pair.createPair(new Rule(entry.getFirst()),
                                new RuleData(entry.getSecond())));
                    }
                    SidePattern pattern = source.toPattern();
                    Map<Rule, RuleData> filtered = retriever.filter.filter(
                            pattern, rules);
                    EnumRuleType type = pattern.isPhrase() ? EnumRuleType.V
                            : EnumRuleType.X;
                    Set<Integer> sentenceIds = retriever.sourceToSentenceId
                            .get(source);
                    for (Entry<Rule, RuleData> e : filtered.entrySet()) {
                        queue.put(e.getKey(),
                                Pair.createPair(type, e.getValue()));
                        // Record the target-side terminals for every sentence
                        // in which this source side occurs.
                        List<Symbol> words = e.getKey().target().getTerminals();
                        for (int id : sentenceIds) {
                            synchronized (retriever.targetSideVocab) {
                                retriever.targetSideVocab.get(id).addAll(words);
                            }
                        }
                    }
                    if (queue.size() > BATCH_SIZE) {
                        drainQueue();
                    }
                }
            }
            // Flush whatever is left in the final, partial batch.
            drainQueue();
        } catch (IOException e) {
            e.printStackTrace();
            System.exit(1);
        }

        System.out.printf("Query took %d seconds\n", stopWatch.getTime() / 1000);
    }
}
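The core of run() is a batch-and-drain loop: retrieved rules accumulate in the queue map and are flushed through drainQueue() whenever the batch grows past BATCH_SIZE, with one final drain after the loop so the last partial batch is not lost. The following stand-alone sketch (hypothetical class and method names, not part of the Cambridge SMT code base) shows that pattern in isolation:

import java.util.ArrayList;
import java.util.List;

public class BatchAndDrainSketch {

    private static final int BATCH_SIZE = 3;

    private final List<String> queue = new ArrayList<>();

    // Write out everything accumulated so far, then reset the batch.
    private void drainQueue() {
        for (String item : queue) {
            System.out.println("writing " + item);
        }
        queue.clear();
    }

    public void process(List<String> inputs) {
        for (String input : inputs) {
            queue.add(input);
            // Flush once the batch grows beyond the threshold ...
            if (queue.size() > BATCH_SIZE) {
                drainQueue();
            }
        }
        // ... and always flush the remaining partial batch at the end.
        drainQueue();
    }

    public static void main(String[] args) {
        new BatchAndDrainSketch().process(List.of("a", "b", "c", "d", "e"));
    }
}

In HFileRuleQuery the same idea bounds how many rules any one query thread holds in memory and lets the lexical-probability lookups in drainQueue() be issued for a whole batch at a time rather than rule by rule.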