Cambridge SMT System
ExtractorJob.java
Go to the documentation of this file.
1 /*******************************************************************************
2  * Licensed under the Apache License, Version 2.0 (the "License");
3  * you may not use these files except in compliance with the License.
4  * You may obtain a copy of the License at
5  *
6  * http://www.apache.org/licenses/LICENSE-2.0
7  *
8  * Unless required by applicable law or agreed to in writing, software
9  * distributed under the License is distributed on an "AS IS" BASIS,
10  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11  * See the License for the specific language governing permissions and
12  * limitations under the License.
13  *
14  * Copyright 2014 - Juan Pino, Aurelien Waite, William Byrne
15  *******************************************************************************/
16 package uk.ac.cam.eng.extraction.hadoop.extraction;
17 
18 import java.io.FileNotFoundException;
19 import java.io.IOException;
20 import java.util.HashMap;
21 import java.util.Map;
22 
23 import org.apache.hadoop.conf.Configuration;
24 import org.apache.hadoop.conf.Configured;
25 import org.apache.hadoop.fs.Path;
26 import org.apache.hadoop.io.ByteWritable;
27 import org.apache.hadoop.io.IntWritable;
28 import org.apache.hadoop.io.MapWritable;
29 import org.apache.hadoop.io.Text;
30 import org.apache.hadoop.io.Writable;
31 import org.apache.hadoop.mapreduce.Job;
32 import org.apache.hadoop.mapreduce.Mapper;
33 import org.apache.hadoop.mapreduce.Reducer;
34 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
35 import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
36 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
37 import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
38 import org.apache.hadoop.util.Tool;
39 import org.apache.hadoop.util.ToolRunner;
40 
41 import uk.ac.cam.eng.extraction.Alignment;
42 import uk.ac.cam.eng.extraction.Extract;
43 import uk.ac.cam.eng.extraction.ExtractOptions;
44 import uk.ac.cam.eng.extraction.Rule;
49 import uk.ac.cam.eng.util.CLI;
50 import uk.ac.cam.eng.util.CLI.Provenance;
51 import uk.ac.cam.eng.util.Pair;
52 
53 import com.beust.jcommander.ParameterException;
54 
61 public class ExtractorJob extends Configured implements Tool {
62 
69  public static Job getJob(Configuration conf) throws IOException {
70  conf.setIfUnset("mapreduce.map.java.opts", "-Xmx800m");
71  conf.setIfUnset("mapreduce.reduce.java.opts", "-Xmx4096m");
72  conf.setIfUnset("mapreduce.map.memory.mb", "1000");
73  conf.setIfUnset("mapreduce.reduce.memory.mb", "6000");
74  conf.setIfUnset("mapreduce.input.fileinputformat.split.maxsize", "4194304");
75  Job job = new Job(conf, "Rule extraction");
76  job.setJarByClass(ExtractorJob.class);
77  job.setMapOutputKeyClass(Rule.class);
78  job.setMapOutputValueClass(ExtractedData.class);
79  job.setOutputKeyClass(Rule.class);
80  job.setOutputValueClass(ExtractedData.class);
81  job.setMapperClass(ExtractorMapper.class);
82  job.setReducerClass(ExtractorReducer.class);
83  job.setSortComparatorClass(Source2TargetComparator.class);
84  job.setCombinerClass(ExtractorReducer.class);
85  job.setInputFormatClass(SequenceFileInputFormat.class);
86  job.setOutputFormatClass(SequenceFileOutputFormat.class);
87  FileOutputFormat.setCompressOutput(job, true);
88  return job;
89  }
90 
98  private static class ExtractorMapper extends
99  Mapper<MapWritable, TextArrayWritable, Rule, ExtractedData> {
100 
101  private static final IntWritable ONE = new IntWritable(1);
102 
103  private ExtractedData ruleInfo = new ExtractedData();
104 
105  private Map<Text, ByteWritable> prov2Id = new HashMap<>();
106 
107  private static final ByteWritable ALL = new ByteWritable((byte) 0);
108 
109  @Override
110  protected void setup(Context context) throws IOException,
111  InterruptedException {
112  super.setup(context);
113  String provString = context.getConfiguration().get(Provenance.PROV);
114  String[] provs = provString.split(",");
115  if (provs.length + 1 >= Byte.MAX_VALUE) {
116  throw new RuntimeException(
117  String.format(
118  "Number of provenances is %d which is greater than 128",
119  provs.length));
120  }
121  for (int i = 0; i < provs.length; ++i) {
122  prov2Id.put(new Text(provs[i]),
123  new ByteWritable((byte) (i + 1)));
124  }
125  }
126 
127  /*
128  * (non-Javadoc)
129  *
130  * @see org.apache.hadoop.mapreduce.Mapper#map(java.lang.Object,
131  * java.lang.Object, org.apache.hadoop.mapreduce.Mapper.Context)
132  */
133  @Override
134  protected void map(MapWritable key, TextArrayWritable value,
135  Context context) throws IOException, InterruptedException {
136  Configuration conf = context.getConfiguration();
137  String sourceSentence = ((Text) value.get()[0]).toString();
138  String targetSentence = ((Text) value.get()[1]).toString();
139  String wordAlign = ((Text) value.get()[2]).toString();
140 
141  int maxSourcePhrase = conf.getInt(
142  CLI.RuleParameters.MAX_SOURCE_PHRASE, -1);
143  int maxSourceElements = conf.getInt(
144  CLI.RuleParameters.MAX_SOURCE_ELEMENTS, -1);
145  int maxTerminalLength = conf.getInt(
146  CLI.RuleParameters.MAX_TERMINAL_LENGTH, -1);
147  int maxNonTerminalSpan = conf.getInt(
148  CLI.RuleParameters.MAX_NONTERMINAL_SPAN, -1);
149  boolean removeMonotonicRepeats = conf.getBoolean(
150  CLI.ExtractorJobParameters.REMOVE_MONOTONIC_REPEATS, false);
151  boolean compatabilityMode = conf.getBoolean(
152  CLI.ExtractorJobParameters.COMPATIBILITY_MODE, false);
153  ExtractOptions opts = new ExtractOptions(maxSourcePhrase,
154  maxSourceElements, maxTerminalLength, maxNonTerminalSpan,
155  removeMonotonicRepeats, compatabilityMode);
156 
157  for (Pair<Rule, Alignment> ra : Extract.extractJava(opts,
158  sourceSentence, targetSentence, wordAlign)) {
159  ruleInfo.clear();
160  ruleInfo.putProvenanceCount(ALL, ONE);
161  for (Writable prov : key.keySet()) {
162  if (prov2Id.keySet().contains(prov)) {
163  ruleInfo.putProvenanceCount(prov2Id.get(prov), ONE);
164  }
165  }
166  ruleInfo.putAlignmentCount(ra.getSecond(), 1);
167  context.write(ra.getFirst(), ruleInfo);
168  }
169  }
170  }
171 
172  private static class ExtractorReducer extends
173  Reducer<Rule, ExtractedData, Rule, ExtractedData> {
174 
175  private ExtractedData compressed = new ExtractedData();
176 
177  @Override
178  protected void reduce(Rule key, Iterable<ExtractedData> values,
179  Context context) throws IOException, InterruptedException {
180  compressed.clear();
181  for (ExtractedData value : values) {
182  compressed.increment(value);
183  }
184  context.write(key, compressed);
185  }
186  }
187 
188  public int run(String[] args) throws FileNotFoundException, IOException,
189  ClassNotFoundException, InterruptedException,
190  IllegalArgumentException, IllegalAccessException {
191 
193  try {
194  Util.parseCommandLine(args, params);
195  } catch (ParameterException e) {
196  return 1;
197  }
198  Configuration conf = getConf();
199  Util.ApplyConf(params, conf);
200  Job job = getJob(conf);
201  FileInputFormat.setInputPaths(job, params.input);
202  FileOutputFormat.setOutputPath(job, new Path(params.output));
203  return job.waitForCompletion(true) ? 0 : 1;
204  }
205 
206  public static void main(String[] args) throws Exception {
207  int res = ToolRunner.run(new ExtractorJob(), args);
208  System.exit(res);
209  }
210 }
MertOpt opts
Definition: MertCommon.cpp:14
std::string toString(const T &x, uint pr=2)
Converts an arbitrary type to a string. Converts integers, floats, and doubles to strings. Quits execution if ...
static void ApplyConf(Object params, Configuration conf)
Definition: Util.java:80
static JCommander parseCommandLine(String[] args, Object params)
Definition: Util.java:85
void putProvenanceCount(ByteWritable provenanceName, IntWritable count)