16 package uk.ac.cam.eng.extraction.hadoop.util;
18 import java.io.IOException;
20 import org.apache.hadoop.conf.Configuration;
21 import org.apache.hadoop.fs.FileSystem;
22 import org.apache.hadoop.fs.Path;
23 import org.apache.hadoop.io.SequenceFile;
24 import org.apache.hadoop.io.SequenceFile.CompressionType;
25 import org.apache.hadoop.io.Writable;
26 import org.apache.hadoop.util.ReflectionUtils;
37 public static void main(String[] args)
throws IOException {
38 if (args.length != 3) {
40 .println(
"Args: <sequence file in> <sequence file out> <modulo #>");
43 int modulo = Integer.parseInt(args[2]);
44 Configuration conf =
new Configuration();
45 FileSystem fs = FileSystem.get(conf);
46 Path pathIn =
new Path(args[0]);
47 SequenceFile.Reader reader =
new SequenceFile.Reader(fs, pathIn, conf);
48 Path pathOut =
new Path(args[1]);
49 SequenceFile.Writer writer = SequenceFile.createWriter(fs, conf,
50 pathOut, reader.getKeyClass(), reader.getValueClass(),
51 CompressionType.BLOCK);
52 Writable key = (Writable) ReflectionUtils.newInstance(
53 reader.getKeyClass(), conf);
54 Writable value = (Writable) ReflectionUtils.newInstance(
55 reader.getValueClass(), conf);
57 while (reader.next(key, value)) {
58 if(count % modulo ==0){
59 writer.append(key, value);