Cambridge SMT System
ExtractorDataLoader.java
Go to the documentation of this file.
1 /*******************************************************************************
2  * Licensed under the Apache License, Version 2.0 (the "License");
3  * you may not use these files except in compliance with the License.
4  * You may obtain a copy of the License at
5  *
6  * http://www.apache.org/licenses/LICENSE-2.0
7  *
8  * Unless required by applicable law or agreed to in writing, software
9  * distributed under the License is distributed on an "AS IS" BASIS,
10  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11  * See the License for the specific language governing permissions and
12  * limitations under the License.
13  *
14  * Copyright 2014 - Juan Pino, Aurelien Waite, William Byrne
15  *******************************************************************************/
16 package uk.ac.cam.eng.extraction.hadoop.util;
17 
18 import java.io.BufferedReader;
19 import java.io.FileInputStream;
20 import java.io.FileNotFoundException;
21 import java.io.IOException;
22 import java.io.InputStreamReader;
23 import java.util.zip.GZIPInputStream;
24 
25 import org.apache.hadoop.conf.Configuration;
26 import org.apache.hadoop.fs.FileSystem;
27 import org.apache.hadoop.fs.Path;
28 import org.apache.hadoop.io.MapWritable;
29 import org.apache.hadoop.io.NullWritable;
30 import org.apache.hadoop.io.SequenceFile;
31 import org.apache.hadoop.io.Text;
32 
34 import uk.ac.cam.eng.util.CLI;
35 
44 public class ExtractorDataLoader {
45 
66  public void loadTrainingData2Hdfs(String sourceTextFile,
67  String targetTextFile, String wordAlignmentFile,
68  String provenanceFile, String hdfsName)
69  throws FileNotFoundException, IOException {
70 
71  try (BufferedReader src = new BufferedReader(new InputStreamReader(
72  new GZIPInputStream(new FileInputStream(sourceTextFile))));
73  BufferedReader trg = new BufferedReader(
74  new InputStreamReader(new GZIPInputStream(
75  new FileInputStream(targetTextFile))));
76  BufferedReader align = new BufferedReader(
77  new InputStreamReader(new GZIPInputStream(
78  new FileInputStream(wordAlignmentFile))));
79  BufferedReader prov = new BufferedReader(
80  new InputStreamReader(new GZIPInputStream(
81  new FileInputStream(provenanceFile))))) {
82 
83  String srcLine = null, trgLine = null, alignLine = null, provLine = null;
84  Configuration conf = new Configuration();
85  Path path = new Path(hdfsName);
86  FileSystem fs = path.getFileSystem(conf);
87  try (SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf,
88  path, MapWritable.class, TextArrayWritable.class)) {
89  Text sourceSentenceText = new Text();
90  Text targetSentenceText = new Text();
91  Text alignmentText = new Text();
92  Text[] array = new Text[3];
93  array[0] = sourceSentenceText;
94  array[1] = targetSentenceText;
95  array[2] = alignmentText;
96  TextArrayWritable arrayWritable = new TextArrayWritable();
97  // metadata: provenance, e.g. genre, collection, training
98  // instance
99  // id, doc id, etc.
100  MapWritable metadata = new MapWritable();
101 
102  while ((srcLine = src.readLine()) != null
103  && (trgLine = trg.readLine()) != null
104  && (alignLine = align.readLine()) != null
105  && (provLine = prov.readLine()) != null) {
106  metadata.clear();
107  String[] provenances = provLine.split("\\s+");
108  for (String provenance : provenances) {
109  metadata.put(new Text(provenance), NullWritable.get());
110  }
111  sourceSentenceText.set(srcLine);
112  targetSentenceText.set(trgLine);
113  // note, alignLine can be the empty string
114  alignmentText.set(alignLine);
115  arrayWritable.set(array);
116  writer.append(metadata, arrayWritable);
117  }
118  }
119  }
120  }
121 
/**
 * Command-line entry point: passes {@code args} and {@code params} to
 * {@code Util.parseCommandLine}, then calls
 * {@code loader.loadTrainingData2Hdfs} with the source text, target text,
 * alignment, provenance and HDFS name fields of {@code params}.
 *
 * NOTE(review): the declarations of {@code params} and {@code loader} are
 * not visible in this listing (its embedded numbering skips 124 and 126);
 * presumably they are created on those lines -- confirm against the
 * original file.
 *
 * @param args command-line arguments, forwarded to Util.parseCommandLine
 * @throws FileNotFoundException declared by loadTrainingData2Hdfs
 * @throws IOException declared by loadTrainingData2Hdfs
 */
122  public static void main(String[] args) throws FileNotFoundException,
123  IOException {
125  Util.parseCommandLine(args, params);
127  loader.loadTrainingData2Hdfs(params.sourceTextFile,
128  params.targetTextFile, params.alignmentFile,
129  params.provenanceFile, params.hdfsName);
130 
131  }
132 }
static JCommander parseCommandLine(String[] args, Object params)
Definition: Util.java:85
void loadTrainingData2Hdfs(String sourceTextFile, String targetTextFile, String wordAlignmentFile, String provenanceFile, String hdfsName)