Cambridge SMT System
HFileRuleReader.java
Go to the documentation of this file.
1 /*******************************************************************************
2  * Licensed under the Apache License, Version 2.0 (the "License");
3  * you may not use these files except in compliance with the License.
4  * You may obtain a copy of the License at
5  *
6  * http://www.apache.org/licenses/LICENSE-2.0
7  *
8  * Unless required by applicable law or agreed to in writing, software
9  * distributed under the License is distributed on an "AS IS" BASIS,
10  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11  * See the License for the specific language governing permissions and
12  * limitations under the License.
13  *
14  * Copyright 2014 - Juan Pino, Aurelien Waite, William Byrne
15  *******************************************************************************/
16 package uk.ac.cam.eng.rule.retrieval;
17 
18 import java.io.IOException;
19 import java.util.Collections;
20 import java.util.Iterator;
21 
22 import org.apache.hadoop.conf.Configuration;
23 import org.apache.hadoop.fs.FileSystem;
24 import org.apache.hadoop.fs.Path;
25 import org.apache.hadoop.hbase.KeyValue;
26 import org.apache.hadoop.hbase.io.hfile.CacheConfig;
27 import org.apache.hadoop.hbase.io.hfile.HFile;
28 import org.apache.hadoop.hbase.io.hfile.HFileScanner;
29 import org.apache.hadoop.io.DataInputBuffer;
30 import org.apache.hadoop.io.DataOutputBuffer;
31 
32 import scala.Array;
33 import uk.ac.cam.eng.extraction.Rule;
34 import uk.ac.cam.eng.extraction.RuleString;
37 import uk.ac.cam.eng.util.Pair;
38 
44 public class HFileRuleReader implements Iterable<Pair<Rule, RuleData>> {
45 
46  private HFileScanner scanner;
47 
48  private final DataInputBuffer in = new DataInputBuffer();
49  private final DataOutputBuffer out = new DataOutputBuffer();
50  private final Rule rule = new Rule();
51  private final TargetFeatureList value = new TargetFeatureList();
52  private RuleString key = new RuleString();
53 
54  public HFileRuleReader(HFile.Reader hfReader) {
55  scanner = hfReader.getScanner(false, false);
56  rule.setSource(key);
57  }
58 
59  private void readValue() {
60  in.reset(scanner.getValue().array(), scanner.getValue().arrayOffset(),
61  scanner.getValue().limit());
62  try {
63  value.readFields(in);
64  } catch (IOException e) {
65  // Should not happen! Only reading buffered bytes
66  throw new RuntimeException(e);
67  }
68  }
69 
70  public boolean seek(RuleString source) throws IOException {
71  out.reset();
72  source.write(out);
73  byte[] empty = Array.emptyByteArray();
74  KeyValue kv = new KeyValue(out.getData(), 0, out.getLength(), empty, 0,
75  0, empty, 0, 0, 0l, KeyValue.Type.Put, empty, 0, 0);
76  int pos = scanner.seekTo(kv.getBuffer(), kv.getKeyOffset(),
77  kv.getKeyLength());
78  if (pos == 0) {
79  key.set(source);
80  rule.setSource(key);
81  return true;
82  } else {
83  return false;
84  }
85  }
86 
87  public Iterable<Pair<Rule, RuleData>> getRulesForSource() {
88  readValue();
89  final Iterator<Pair<RuleString, RuleData>> instance = value.iterator();
90 
91  return new Iterable<Pair<Rule, RuleData>>() {
92 
93  @Override
94  public Iterator<Pair<Rule, RuleData>> iterator() {
95  return new Iterator<Pair<Rule, RuleData>>() {
96 
97  @Override
98  public boolean hasNext() {
99  return instance.hasNext();
100  }
101 
102  @Override
103  public Pair<Rule, RuleData> next() {
104  Pair<RuleString, RuleData> next = instance.next();
105  rule.setTarget(next.getFirst());
106  return Pair.createPair(rule, next.getSecond());
107  }
108 
109  @Override
110  public void remove() {
111  throw new UnsupportedOperationException();
112 
113  }
114  };
115  }
116  };
117  }
118 
119  private RuleString readSource() {
120  // Have to put the ROW_LENGTH_SIZE due to KeyValue structure
121  in.reset(scanner.getKey().array(), scanner.getKey().arrayOffset()
122  + KeyValue.ROW_LENGTH_SIZE, scanner.getKey().limit());
123  key.readFields(in);
124  rule.setSource(key);
125  return key;
126  }
127 
128  @Override
129  public Iterator<Pair<Rule, RuleData>> iterator() {
130  boolean temp = false;
131  try {
132  temp = scanner.seekTo();
133  } catch (IOException e) {
134  throw new RuntimeException(e);
135  }
136  final boolean isNotEmpty = temp;
137  if (!isNotEmpty) {
138  return Collections.<Pair<Rule, RuleData>> emptyList().iterator();
139  }
140  readSource();
141  return new Iterator<Pair<Rule, RuleData>>() {
142 
143  Iterator<Pair<Rule, RuleData>> targetIter;
144 
145  boolean hasNext = isNotEmpty;
146 
147  @Override
148  public boolean hasNext() {
149  return hasNext || targetIter.hasNext();
150  }
151 
152  @Override
153  public Pair<Rule, RuleData> next() {
154  if (targetIter == null) {
155  targetIter = getRulesForSource().iterator();
156  try {
157  hasNext = scanner.next();
158  } catch (IOException e) {
159  e.printStackTrace();
160  hasNext = false;
161  }
162  }
163  if (targetIter.hasNext()) {
164  return targetIter.next();
165  } else if (hasNext) {
166  readSource();
167  targetIter = getRulesForSource().iterator();
168  try {
169  hasNext = scanner.next();
170  } catch (IOException e) {
171  e.printStackTrace();
172  hasNext = false;
173  }
174  return targetIter.next();
175  }
176  return null;
177  }
178 
179  @Override
180  public void remove() {
181  throw new UnsupportedOperationException();
182  }
183 
184  };
185  }
186 
187  public static void main(String[] args) throws IOException {
188  Configuration conf = new Configuration();
189  CacheConfig cacheConf = new CacheConfig(conf);
190  int count = 0;
191  for (String fileName : args) {
192  int fileCount = 0;
193  System.out.println("Reading file " + fileName);
194  HFile.Reader hfReader = HFile.createReader(FileSystem.get(conf),
195  new Path(fileName), cacheConf);
196  HFileRuleReader ruleReader = new HFileRuleReader(hfReader);
197  for (@SuppressWarnings("unused")
198  Pair<Rule, RuleData> entry : ruleReader) {
199  ++count;
200  ++fileCount;
201  }
202  System.out.println(fileCount + "\t" + fileName);
203  }
204  System.out.println(count + "\ttotal");
205  }
206 }
Iterator< Pair< Rule, RuleData > > iterator()
static< F, S > Pair< F, S > createPair(F first, S second)
Definition: Pair.java:46
Iterable< Pair< Rule, RuleData > > getRulesForSource()