16 package uk.ac.cam.eng.extraction.hadoop.util;
18 import java.io.BufferedReader;
19 import java.io.FileInputStream;
20 import java.io.FileNotFoundException;
21 import java.io.IOException;
22 import java.io.InputStreamReader;
23 import java.util.zip.GZIPInputStream;
25 import org.apache.hadoop.conf.Configuration;
26 import org.apache.hadoop.fs.FileSystem;
27 import org.apache.hadoop.fs.Path;
28 import org.apache.hadoop.io.MapWritable;
29 import org.apache.hadoop.io.NullWritable;
30 import org.apache.hadoop.io.SequenceFile;
31 import org.apache.hadoop.io.Text;
67 String targetTextFile, String wordAlignmentFile,
68 String provenanceFile, String hdfsName)
69 throws FileNotFoundException, IOException {
71 try (BufferedReader src =
new BufferedReader(
new InputStreamReader(
72 new GZIPInputStream(
new FileInputStream(sourceTextFile))));
73 BufferedReader trg =
new BufferedReader(
74 new InputStreamReader(
new GZIPInputStream(
75 new FileInputStream(targetTextFile))));
76 BufferedReader align =
new BufferedReader(
77 new InputStreamReader(
new GZIPInputStream(
78 new FileInputStream(wordAlignmentFile))));
79 BufferedReader prov =
new BufferedReader(
80 new InputStreamReader(
new GZIPInputStream(
81 new FileInputStream(provenanceFile))))) {
83 String srcLine = null, trgLine = null, alignLine = null, provLine = null;
84 Configuration conf =
new Configuration();
85 Path path =
new Path(hdfsName);
86 FileSystem fs = path.getFileSystem(conf);
87 try (SequenceFile.Writer writer =
new SequenceFile.Writer(fs, conf,
89 Text sourceSentenceText =
new Text();
90 Text targetSentenceText =
new Text();
91 Text alignmentText =
new Text();
92 Text[] array =
new Text[3];
93 array[0] = sourceSentenceText;
94 array[1] = targetSentenceText;
95 array[2] = alignmentText;
100 MapWritable metadata =
new MapWritable();
102 while ((srcLine = src.readLine()) != null
103 && (trgLine = trg.readLine()) != null
104 && (alignLine = align.readLine()) != null
105 && (provLine = prov.readLine()) != null) {
107 String[] provenances = provLine.split(
"\\s+");
108 for (String provenance : provenances) {
109 metadata.put(
new Text(provenance), NullWritable.get());
111 sourceSentenceText.set(srcLine);
112 targetSentenceText.set(trgLine);
114 alignmentText.set(alignLine);
115 arrayWritable.set(array);
116 writer.append(metadata, arrayWritable);
122 public static void main(String[] args)
throws FileNotFoundException,
128 params.targetTextFile, params.alignmentFile,
129 params.provenanceFile, params.hdfsName);