Command-line reference

SGNMT provides decode.py and batch_decode.py for decoding and train.py for NMT training. The neural word alignment script align.py is only available for the Blocks implementation. All scripts can be configured via the command line or a configuration file. For a quick overview of the available parameters, use --help:

python decode.py --help
python batch_decode.py --help
python train.py --help
python align.py --help
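
For example, a simple beam decoding run can be configured entirely on the command line using the options documented below (model and data paths here are placeholders):

python decode.py --decoder beam --beam 12 --predictors nmt --nmt_path ./models/nmt1 --src_test test.ids.en --outputs text,nbest --output_path sgnmt-out.%s

The same settings can be collected in a configuration file and passed via --config_file. The following is only a minimal sketch which assumes that the keys mirror the long option names without the leading dashes; the file name config.ini is arbitrary:

decoder: beam
beam: 12
predictors: nmt
nmt_path: ./models/nmt1
src_test: test.ids.en

python decode.py --config_file config.ini --outputs text,nbest --output_path sgnmt-out.%s

Note that, as stated below, settings in the configuration file override conflicting command line arguments.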

The complete and detailed list of parameters is provided below.

Decoding

usage: decode.py [-h] [--config_file CONFIG_FILE]
                 [--verbosity {debug,info,warn,error}] [--min_score MIN_SCORE]
                 [--range RANGE] [--src_test SRC_TEST] [--en_test EN_TEST]
                 [--indexing_scheme {blocks,tf,t2t}]
                 [--legacy_indexing LEGACY_INDEXING]
                 [--input_method {dummy,file,shell,stdin}]
                 [--log_sum {tropical,log}]
                 [--single_cpu_thread SINGLE_CPU_THREAD] [--beam BEAM]
                 [--decoder {greedy,beam,multisegbeam,syncbeam,sepbeam,syntaxbeam,dfs,restarting,bow,flip,bucket,bigramgreedy,astar,vanilla}]
                 [--hypo_recombination HYPO_RECOMBINATION]
                 [--allow_unk_in_output ALLOW_UNK_IN_OUTPUT]
                 [--max_node_expansions MAX_NODE_EXPANSIONS]
                 [--max_len_factor MAX_LEN_FACTOR]
                 [--early_stopping EARLY_STOPPING] [--heuristics HEURISTICS]
                 [--heuristic_predictors HEURISTIC_PREDICTORS]
                 [--multiseg_tokenizations MULTISEG_TOKENIZATIONS]
                 [--cache_heuristic_estimates CACHE_HEURISTIC_ESTIMATES]
                 [--pure_heuristic_scores PURE_HEURISTIC_SCORES]
                 [--restarting_node_score {difference,absolute,constant,expansions}]
                 [--low_decoder_memory LOW_DECODER_MEMORY]
                 [--stochastic_decoder STOCHASTIC_DECODER]
                 [--decode_always_single_step DECODE_ALWAYS_SINGLE_STEP]
                 [--flip_strategy {move,flip}]
                 [--bucket_selector BUCKET_SELECTOR]
                 [--bucket_score_strategy {difference,heap,absolute,constant}]
                 [--collect_statistics {best,full,all}]
                 [--heuristic_scores_file HEURISTIC_SCORES_FILE]
                 [--score_lower_bounds_file SCORE_LOWER_BOUNDS_FILE]
                 [--decoder_diversity_factor DECODER_DIVERSITY_FACTOR]
                 [--sync_symbol SYNC_SYMBOL] [--max_word_len MAX_WORD_LEN]
                 [--nbest NBEST] [--output_fst_unk_id OUTPUT_FST_UNK_ID]
                 [--fst_unk_id FST_UNK_ID] [--output_path OUTPUT_PATH]
                 [--outputs OUTPUTS] [--remove_eos REMOVE_EOS]
                 [--src_wmap SRC_WMAP] [--trg_wmap TRG_WMAP]
                 [--trg_cmap TRG_CMAP] [--predictors PREDICTORS]
                 [--predictor_weights PREDICTOR_WEIGHTS]
                 [--closed_vocabulary_normalization {none,exact,reduced,rescale_unk}]
                 [--combination_scheme {sum,length_norm,bayesian}]
                 [--apply_combination_scheme_to_partial_hypos APPLY_COMBINATION_SCHEME_TO_PARTIAL_HYPOS]
                 [--pred_src_vocab_size PRED_SRC_VOCAB_SIZE]
                 [--pred_trg_vocab_size PRED_TRG_VOCAB_SIZE]
                 [--length_normalization LENGTH_NORMALIZATION]
                 [--nmt_config NMT_CONFIG] [--nmt_path NMT_PATH]
                 [--nmt_engine {none,blocks,tensorflow}]
                 [--nmt_model_selector {params,bleu,time}]
                 [--cache_nmt_posteriors CACHE_NMT_POSTERIORS]
                 [--gnmt_beta GNMT_BETA]
                 [--layerbylayer_terminal_strategy {none,force,skip}]
                 [--syntax_max_depth SYNTAX_MAX_DEPTH]
                 [--syntax_root_id SYNTAX_ROOT_ID]
                 [--syntax_pop_id SYNTAX_POP_ID]
                 [--syntax_max_terminal_id SYNTAX_MAX_TERMINAL_ID]
                 [--syntax_terminal_list SYNTAX_TERMINAL_LIST]
                 [--t2t_usr_dir T2T_USR_DIR] [--t2t_model T2T_MODEL]
                 [--t2t_problem T2T_PROBLEM]
                 [--t2t_hparams_set T2T_HPARAMS_SET]
                 [--t2t_checkpoint_dir T2T_CHECKPOINT_DIR]
                 [--t2t_src_vocab_size T2T_SRC_VOCAB_SIZE]
                 [--t2t_trg_vocab_size T2T_TRG_VOCAB_SIZE]
                 [--nizza_model NIZZA_MODEL]
                 [--nizza_hparams_set NIZZA_HPARAMS_SET]
                 [--nizza_checkpoint_dir NIZZA_CHECKPOINT_DIR]
                 [--src_test_raw SRC_TEST_RAW]
                 [--length_model_weights LENGTH_MODEL_WEIGHTS]
                 [--use_length_point_probs USE_LENGTH_POINT_PROBS]
                 [--length_model_offset LENGTH_MODEL_OFFSET]
                 [--extlength_path EXTLENGTH_PATH]
                 [--unk_count_lambdas UNK_COUNT_LAMBDAS] [--wc_word WC_WORD]
                 [--ngramc_path NGRAMC_PATH] [--ngramc_order NGRAMC_ORDER]
                 [--ngramize_min_order NGRAMIZE_MIN_ORDER]
                 [--ngramize_max_order NGRAMIZE_MAX_ORDER]
                 [--ngramc_discount_factor NGRAMC_DISCOUNT_FACTOR]
                 [--skipvocab_max_id SKIPVOCAB_MAX_ID]
                 [--skipvocab_stop_size SKIPVOCAB_STOP_SIZE]
                 [--trg_test TRG_TEST] [--fr_test FR_TEST]
                 [--forcedlst_sparse_feat FORCEDLST_SPARSE_FEAT]
                 [--use_nbest_weights USE_NBEST_WEIGHTS]
                 [--bow_heuristic_strategies BOW_HEURISTIC_STRATEGIES]
                 [--bow_accept_subsets BOW_ACCEPT_SUBSETS]
                 [--bow_accept_duplicates BOW_ACCEPT_DUPLICATES]
                 [--bow_diversity_heuristic_factor BOW_DIVERSITY_HEURISTIC_FACTOR]
                 [--src_idxmap SRC_IDXMAP] [--en_idxmap EN_IDXMAP]
                 [--trg_idxmap TRG_IDXMAP] [--fr_idxmap FR_IDXMAP]
                 [--altsrc_test ALTSRC_TEST] [--word2char_map WORD2CHAR_MAP]
                 [--fsttok_path FSTTOK_PATH]
                 [--fsttok_max_pending_score FSTTOK_MAX_PENDING_SCORE]
                 [--rules_path RULES_PATH]
                 [--use_grammar_weights USE_GRAMMAR_WEIGHTS]
                 [--grammar_feature_weights GRAMMAR_FEATURE_WEIGHTS]
                 [--srilm_path SRILM_PATH]
                 [--srilm_convert_to_ln SRILM_CONVERT_TO_LN]
                 [--nplm_path NPLM_PATH] [--rnnlm_path RNNLM_PATH]
                 [--rnnlm_config RNNLM_CONFIG] [--srilm_order SRILM_ORDER]
                 [--normalize_nplm_probs NORMALIZE_NPLM_PROBS]
                 [--fst_path FST_PATH] [--rtn_path RTN_PATH]
                 [--fst_skip_bos_weight FST_SKIP_BOS_WEIGHT]
                 [--fst_to_log FST_TO_LOG] [--use_fst_weights USE_FST_WEIGHTS]
                 [--use_rtn_weights USE_RTN_WEIGHTS]
                 [--minimize_rtns MINIMIZE_RTNS]
                 [--remove_epsilon_in_rtns REMOVE_EPSILON_IN_RTNS]
                 [--normalize_fst_weights NORMALIZE_FST_WEIGHTS]
                 [--normalize_rtn_weights NORMALIZE_RTN_WEIGHTS]
                 [--nmt_config2 NMT_CONFIG2] [--nmt_path2 NMT_PATH2]
                 [--nmt_engine2 NMT_ENGINE2] [--t2t_model2 T2T_MODEL2]
                 [--t2t_problem2 T2T_PROBLEM2]
                 [--t2t_hparams_set2 T2T_HPARAMS_SET2]
                 [--t2t_checkpoint_dir2 T2T_CHECKPOINT_DIR2]
                 [--pred_src_vocab_size2 PRED_SRC_VOCAB_SIZE2]
                 [--pred_trg_vocab_size2 PRED_TRG_VOCAB_SIZE2]
                 [--rnnlm_config2 RNNLM_CONFIG2] [--rnnlm_path2 RNNLM_PATH2]
                 [--src_test2 SRC_TEST2] [--altsrc_test2 ALTSRC_TEST2]
                 [--word2char_map2 WORD2CHAR_MAP2]
                 [--fsttok_path2 FSTTOK_PATH2] [--src_idxmap2 SRC_IDXMAP2]
                 [--trg_idxmap2 TRG_IDXMAP2] [--fst_path2 FST_PATH2]
                 [--forcedlst_sparse_feat2 FORCEDLST_SPARSE_FEAT2]
                 [--ngramc_path2 NGRAMC_PATH2] [--ngramc_order2 NGRAMC_ORDER2]
                 [--nmt_config3 NMT_CONFIG3] [--nmt_path3 NMT_PATH3]
                 [--nmt_engine3 NMT_ENGINE3] [--t2t_model3 T2T_MODEL3]
                 [--t2t_problem3 T2T_PROBLEM3]
                 [--t2t_hparams_set3 T2T_HPARAMS_SET3]
                 [--t2t_checkpoint_dir3 T2T_CHECKPOINT_DIR3]
                 [--pred_src_vocab_size3 PRED_SRC_VOCAB_SIZE3]
                 [--pred_trg_vocab_size3 PRED_TRG_VOCAB_SIZE3]
                 [--rnnlm_config3 RNNLM_CONFIG3] [--rnnlm_path3 RNNLM_PATH3]
                 [--src_test3 SRC_TEST3] [--altsrc_test3 ALTSRC_TEST3]
                 [--word2char_map3 WORD2CHAR_MAP3]
                 [--fsttok_path3 FSTTOK_PATH3] [--src_idxmap3 SRC_IDXMAP3]
                 [--trg_idxmap3 TRG_IDXMAP3] [--fst_path3 FST_PATH3]
                 [--forcedlst_sparse_feat3 FORCEDLST_SPARSE_FEAT3]
                 [--ngramc_path3 NGRAMC_PATH3] [--ngramc_order3 NGRAMC_ORDER3]
                 [--nmt_config4 NMT_CONFIG4] [--nmt_path4 NMT_PATH4]
                 [--nmt_engine4 NMT_ENGINE4] [--t2t_model4 T2T_MODEL4]
                 [--t2t_problem4 T2T_PROBLEM4]
                 [--t2t_hparams_set4 T2T_HPARAMS_SET4]
                 [--t2t_checkpoint_dir4 T2T_CHECKPOINT_DIR4]
                 [--pred_src_vocab_size4 PRED_SRC_VOCAB_SIZE4]
                 [--pred_trg_vocab_size4 PRED_TRG_VOCAB_SIZE4]
                 [--rnnlm_config4 RNNLM_CONFIG4] [--rnnlm_path4 RNNLM_PATH4]
                 [--src_test4 SRC_TEST4] [--altsrc_test4 ALTSRC_TEST4]
                 [--word2char_map4 WORD2CHAR_MAP4]
                 [--fsttok_path4 FSTTOK_PATH4] [--src_idxmap4 SRC_IDXMAP4]
                 [--trg_idxmap4 TRG_IDXMAP4] [--fst_path4 FST_PATH4]
                 [--forcedlst_sparse_feat4 FORCEDLST_SPARSE_FEAT4]
                 [--ngramc_path4 NGRAMC_PATH4] [--ngramc_order4 NGRAMC_ORDER4]
                 [--nmt_config5 NMT_CONFIG5] [--nmt_path5 NMT_PATH5]
                 [--nmt_engine5 NMT_ENGINE5] [--t2t_model5 T2T_MODEL5]
                 [--t2t_problem5 T2T_PROBLEM5]
                 [--t2t_hparams_set5 T2T_HPARAMS_SET5]
                 [--t2t_checkpoint_dir5 T2T_CHECKPOINT_DIR5]
                 [--pred_src_vocab_size5 PRED_SRC_VOCAB_SIZE5]
                 [--pred_trg_vocab_size5 PRED_TRG_VOCAB_SIZE5]
                 [--rnnlm_config5 RNNLM_CONFIG5] [--rnnlm_path5 RNNLM_PATH5]
                 [--src_test5 SRC_TEST5] [--altsrc_test5 ALTSRC_TEST5]
                 [--word2char_map5 WORD2CHAR_MAP5]
                 [--fsttok_path5 FSTTOK_PATH5] [--src_idxmap5 SRC_IDXMAP5]
                 [--trg_idxmap5 TRG_IDXMAP5] [--fst_path5 FST_PATH5]
                 [--forcedlst_sparse_feat5 FORCEDLST_SPARSE_FEAT5]
                 [--ngramc_path5 NGRAMC_PATH5] [--ngramc_order5 NGRAMC_ORDER5]
                 [--nmt_config6 NMT_CONFIG6] [--nmt_path6 NMT_PATH6]
                 [--nmt_engine6 NMT_ENGINE6] [--t2t_model6 T2T_MODEL6]
                 [--t2t_problem6 T2T_PROBLEM6]
                 [--t2t_hparams_set6 T2T_HPARAMS_SET6]
                 [--t2t_checkpoint_dir6 T2T_CHECKPOINT_DIR6]
                 [--pred_src_vocab_size6 PRED_SRC_VOCAB_SIZE6]
                 [--pred_trg_vocab_size6 PRED_TRG_VOCAB_SIZE6]
                 [--rnnlm_config6 RNNLM_CONFIG6] [--rnnlm_path6 RNNLM_PATH6]
                 [--src_test6 SRC_TEST6] [--altsrc_test6 ALTSRC_TEST6]
                 [--word2char_map6 WORD2CHAR_MAP6]
                 [--fsttok_path6 FSTTOK_PATH6] [--src_idxmap6 SRC_IDXMAP6]
                 [--trg_idxmap6 TRG_IDXMAP6] [--fst_path6 FST_PATH6]
                 [--forcedlst_sparse_feat6 FORCEDLST_SPARSE_FEAT6]
                 [--ngramc_path6 NGRAMC_PATH6] [--ngramc_order6 NGRAMC_ORDER6]
                 [--nmt_config7 NMT_CONFIG7] [--nmt_path7 NMT_PATH7]
                 [--nmt_engine7 NMT_ENGINE7] [--t2t_model7 T2T_MODEL7]
                 [--t2t_problem7 T2T_PROBLEM7]
                 [--t2t_hparams_set7 T2T_HPARAMS_SET7]
                 [--t2t_checkpoint_dir7 T2T_CHECKPOINT_DIR7]
                 [--pred_src_vocab_size7 PRED_SRC_VOCAB_SIZE7]
                 [--pred_trg_vocab_size7 PRED_TRG_VOCAB_SIZE7]
                 [--rnnlm_config7 RNNLM_CONFIG7] [--rnnlm_path7 RNNLM_PATH7]
                 [--src_test7 SRC_TEST7] [--altsrc_test7 ALTSRC_TEST7]
                 [--word2char_map7 WORD2CHAR_MAP7]
                 [--fsttok_path7 FSTTOK_PATH7] [--src_idxmap7 SRC_IDXMAP7]
                 [--trg_idxmap7 TRG_IDXMAP7] [--fst_path7 FST_PATH7]
                 [--forcedlst_sparse_feat7 FORCEDLST_SPARSE_FEAT7]
                 [--ngramc_path7 NGRAMC_PATH7] [--ngramc_order7 NGRAMC_ORDER7]
                 [--nmt_config8 NMT_CONFIG8] [--nmt_path8 NMT_PATH8]
                 [--nmt_engine8 NMT_ENGINE8] [--t2t_model8 T2T_MODEL8]
                 [--t2t_problem8 T2T_PROBLEM8]
                 [--t2t_hparams_set8 T2T_HPARAMS_SET8]
                 [--t2t_checkpoint_dir8 T2T_CHECKPOINT_DIR8]
                 [--pred_src_vocab_size8 PRED_SRC_VOCAB_SIZE8]
                 [--pred_trg_vocab_size8 PRED_TRG_VOCAB_SIZE8]
                 [--rnnlm_config8 RNNLM_CONFIG8] [--rnnlm_path8 RNNLM_PATH8]
                 [--src_test8 SRC_TEST8] [--altsrc_test8 ALTSRC_TEST8]
                 [--word2char_map8 WORD2CHAR_MAP8]
                 [--fsttok_path8 FSTTOK_PATH8] [--src_idxmap8 SRC_IDXMAP8]
                 [--trg_idxmap8 TRG_IDXMAP8] [--fst_path8 FST_PATH8]
                 [--forcedlst_sparse_feat8 FORCEDLST_SPARSE_FEAT8]
                 [--ngramc_path8 NGRAMC_PATH8] [--ngramc_order8 NGRAMC_ORDER8]
                 [--nmt_config9 NMT_CONFIG9] [--nmt_path9 NMT_PATH9]
                 [--nmt_engine9 NMT_ENGINE9] [--t2t_model9 T2T_MODEL9]
                 [--t2t_problem9 T2T_PROBLEM9]
                 [--t2t_hparams_set9 T2T_HPARAMS_SET9]
                 [--t2t_checkpoint_dir9 T2T_CHECKPOINT_DIR9]
                 [--pred_src_vocab_size9 PRED_SRC_VOCAB_SIZE9]
                 [--pred_trg_vocab_size9 PRED_TRG_VOCAB_SIZE9]
                 [--rnnlm_config9 RNNLM_CONFIG9] [--rnnlm_path9 RNNLM_PATH9]
                 [--src_test9 SRC_TEST9] [--altsrc_test9 ALTSRC_TEST9]
                 [--word2char_map9 WORD2CHAR_MAP9]
                 [--fsttok_path9 FSTTOK_PATH9] [--src_idxmap9 SRC_IDXMAP9]
                 [--trg_idxmap9 TRG_IDXMAP9] [--fst_path9 FST_PATH9]
                 [--forcedlst_sparse_feat9 FORCEDLST_SPARSE_FEAT9]
                 [--ngramc_path9 NGRAMC_PATH9] [--ngramc_order9 NGRAMC_ORDER9]
                 [--nmt_config10 NMT_CONFIG10] [--nmt_path10 NMT_PATH10]
                 [--nmt_engine10 NMT_ENGINE10] [--t2t_model10 T2T_MODEL10]
                 [--t2t_problem10 T2T_PROBLEM10]
                 [--t2t_hparams_set10 T2T_HPARAMS_SET10]
                 [--t2t_checkpoint_dir10 T2T_CHECKPOINT_DIR10]
                 [--pred_src_vocab_size10 PRED_SRC_VOCAB_SIZE10]
                 [--pred_trg_vocab_size10 PRED_TRG_VOCAB_SIZE10]
                 [--rnnlm_config10 RNNLM_CONFIG10]
                 [--rnnlm_path10 RNNLM_PATH10] [--src_test10 SRC_TEST10]
                 [--altsrc_test10 ALTSRC_TEST10]
                 [--word2char_map10 WORD2CHAR_MAP10]
                 [--fsttok_path10 FSTTOK_PATH10] [--src_idxmap10 SRC_IDXMAP10]
                 [--trg_idxmap10 TRG_IDXMAP10] [--fst_path10 FST_PATH10]
                 [--forcedlst_sparse_feat10 FORCEDLST_SPARSE_FEAT10]
                 [--ngramc_path10 NGRAMC_PATH10]
                 [--ngramc_order10 NGRAMC_ORDER10]
                 [--nmt_config11 NMT_CONFIG11] [--nmt_path11 NMT_PATH11]
                 [--nmt_engine11 NMT_ENGINE11] [--t2t_model11 T2T_MODEL11]
                 [--t2t_problem11 T2T_PROBLEM11]
                 [--t2t_hparams_set11 T2T_HPARAMS_SET11]
                 [--t2t_checkpoint_dir11 T2T_CHECKPOINT_DIR11]
                 [--pred_src_vocab_size11 PRED_SRC_VOCAB_SIZE11]
                 [--pred_trg_vocab_size11 PRED_TRG_VOCAB_SIZE11]
                 [--rnnlm_config11 RNNLM_CONFIG11]
                 [--rnnlm_path11 RNNLM_PATH11] [--src_test11 SRC_TEST11]
                 [--altsrc_test11 ALTSRC_TEST11]
                 [--word2char_map11 WORD2CHAR_MAP11]
                 [--fsttok_path11 FSTTOK_PATH11] [--src_idxmap11 SRC_IDXMAP11]
                 [--trg_idxmap11 TRG_IDXMAP11] [--fst_path11 FST_PATH11]
                 [--forcedlst_sparse_feat11 FORCEDLST_SPARSE_FEAT11]
                 [--ngramc_path11 NGRAMC_PATH11]
                 [--ngramc_order11 NGRAMC_ORDER11]
                 [--nmt_config12 NMT_CONFIG12] [--nmt_path12 NMT_PATH12]
                 [--nmt_engine12 NMT_ENGINE12] [--t2t_model12 T2T_MODEL12]
                 [--t2t_problem12 T2T_PROBLEM12]
                 [--t2t_hparams_set12 T2T_HPARAMS_SET12]
                 [--t2t_checkpoint_dir12 T2T_CHECKPOINT_DIR12]
                 [--pred_src_vocab_size12 PRED_SRC_VOCAB_SIZE12]
                 [--pred_trg_vocab_size12 PRED_TRG_VOCAB_SIZE12]
                 [--rnnlm_config12 RNNLM_CONFIG12]
                 [--rnnlm_path12 RNNLM_PATH12] [--src_test12 SRC_TEST12]
                 [--altsrc_test12 ALTSRC_TEST12]
                 [--word2char_map12 WORD2CHAR_MAP12]
                 [--fsttok_path12 FSTTOK_PATH12] [--src_idxmap12 SRC_IDXMAP12]
                 [--trg_idxmap12 TRG_IDXMAP12] [--fst_path12 FST_PATH12]
                 [--forcedlst_sparse_feat12 FORCEDLST_SPARSE_FEAT12]
                 [--ngramc_path12 NGRAMC_PATH12]
                 [--ngramc_order12 NGRAMC_ORDER12]
General options
--config_file Configuration file in standard .ini format. NOTE: Configuration file overrides command line arguments
--verbosity=info
 

Log level: debug,info,warn,error

Possible choices: debug, info, warn, error

--min_score=-1000000.0
 Delete all complete hypotheses with total scores smaller than this value
--range= Defines the range of sentences to be processed. The syntax is the same as HiFST's printstrings and lmert's idxrange parameter: <start-idx>:<end-idx> (both inclusive, starting with 1). E.g. 2:5 means: skip the first sentence, process the next 4 sentences
--src_test=test_en
 Path to source test set. This is expected to be a plain text file with one source sentence in each line. Words need to be indexed, i.e. use word IDs instead of their string representations.
--en_test= DEPRECATED: Old name for –src_test
--indexing_scheme=blocks
 

This parameter defines the reserved IDs.
* 'blocks': eps,unk: 0, <s>: 1, </s>: 2.
* 'tf': unk: 3, <s>: 1, </s>: 2.
* 't2t': unk: 3, <s>: 2, </s>: 1.

Possible choices: blocks, tf, t2t

--legacy_indexing=False
 DEPRECATED: Use –indexing_scheme=tf instead
--input_method=file
 

This parameter controls how the input to SGNMT is provided. SGNMT supports four modes:
* 'dummy': Use dummy source sentences.
* 'file': Read test sentences from a plain text file specified by --src_test.
* 'shell': Start SGNMT in an interactive shell.
* 'stdin': Test sentences are read from stdin.
In shell and stdin mode you can change SGNMT options on the fly: beginning a line with the string '!sgnmt ' signals SGNMT directives instead of sentences to translate. E.g. '!sgnmt config predictor_weights 0.2,0.8' changes the current predictor weights, and '!sgnmt help' lists all available directives. Using SGNMT directives is particularly useful in combination with MERT to avoid start-up times between evaluations. Note that input sentences still have to be written using word IDs in all cases (see the sketch below).

Possible choices: dummy, file, shell, stdin
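
As an illustration of the stdin and shell modes, the following sketch (placeholder model paths, arbitrary word IDs) pipes a single word-ID sentence into the decoder:

echo "416 23 5169" | python decode.py --input_method stdin --predictors nmt --nmt_path ./models/nmt1

In shell mode, a session might mix directives and sentences; lines starting with '!sgnmt ' are interpreted as directives, everything else as a source sentence in word-ID form:

python decode.py --input_method shell --predictors nmt,fst --nmt_path ./models/nmt1 --fst_path fst/%d.fst
!sgnmt config predictor_weights 0.2,0.8
416 23 5169
!sgnmt help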

--log_sum=log

Controls how to compute the sum in the log space, i.e. how to compute log(exp(l1)+exp(l2)) for log values l1,l2.
* 'tropical': Approximate with max(l1,l2)
* 'log': Use logsumexp in scipy

Possible choices: tropical, log

--single_cpu_thread=False
 If true, try to prevent libraries like Theano or TensorFlow from doing internal multithreading. Also, see the OMP_NUM_THREADS environment variable.
Decoding options
--beam=12 Size of beam. Only used if –decoder is set to ‘beam’ or ‘astar’. For ‘astar’ it limits the capacity of the queue. Use –beam 0 for unlimited capacity.
--decoder=beam

Strategy for traversing the search space which is spanned by the predictors.
* 'greedy': Greedy decoding (similar to beam=1)
* 'beam': Beam search like in Bahdanau et al., 2015
* 'dfs': Depth-first search. This should be used for exact decoding or the complete enumeration of the search space, but it cannot be used if the search space is too large (like for unrestricted NMT) as it performs exhaustive search. If some of your predictor scores can be positive, set --early_stopping to false (see the sketch below).
* 'restarting': Like DFS but with better admissible pruning behavior.
* 'multisegbeam': Beam search for predictors with multiple tokenizations ([sub]word/char-levels).
* 'syncbeam': Beam search which compares after consuming a special synchronization symbol instead of after each iteration.
* 'syntaxbeam': Beam search which ensures terminal symbol diversity.
* 'sepbeam': Associates predictors with hypos in beam search and applies only one predictor instead of all for hypo expansion.
* 'bow': Restarting decoder optimized for bag-of-words problems.
* 'flip': This decoder works only for bag problems. It traverses the search space by switching two words in the hypothesis. Do not use the bow predictor.
* 'bucket': Works best for bag problems. Maintains buckets for each hypo length and extends a hypo in a bucket by one before selecting the next bucket.
* 'bigramgreedy': Works best for bag problems. Collects bigram statistics and constructs hypos to score by greedily selecting high scoring bigrams. Do not use the bow predictor with this search strategy.
* 'astar': A* search. The heuristic function is configured using the --heuristics options.
* 'vanilla': Original Blocks beam decoder. This bypasses the predictor framework and directly performs pure NMT beam decoding on the GPU. Use this when you do pure NMT decoding, as it is usually faster than using a single nmt predictor because the search can be parallelized on the GPU.

Possible choices: greedy, beam, multisegbeam, syncbeam, sepbeam, syntaxbeam, dfs, restarting, bow, flip, bucket, bigramgreedy, astar, vanilla
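
For example, exact decoding with the dfs decoder over a search space constrained by translation lattices could be sketched as follows (placeholder paths; the lower bounds file, e.g. produced by a previous beam search run, tightens the admissible pruning as described under --score_lower_bounds_file):

python decode.py --decoder dfs --early_stopping true --predictors fst,nmt --fst_path fst/%d.fst --nmt_path ./models/nmt1 --score_lower_bounds_file lower_bounds.txt

Set --early_stopping false instead if any predictor can produce positive scores.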

--hypo_recombination=False
 Activates hypothesis recombination. Has to be supported by the decoder. Applicable to beam, restarting, bow, bucket
--allow_unk_in_output=True
 If false, remove all UNKs in the final posteriors. Predictor distributions can still produce UNKs, but they have to be replaced by other words by other predictors
--max_node_expansions=0
 This parameter allows you to limit the total number of search space expansions for a single sentence. If this is 0, we allow an unlimited number of expansions. If it is negative, the maximum number of expansions is its absolute value times the length of the source sentence. Supporting decoders: bigramgreedy, bow, bucket, dfs, flip, restarting
--max_len_factor=2
 Limits the length of hypotheses to avoid infinite loops in search strategies for unbounded search spaces. The length of any translation is limited to max_len_factor times the length of the source sentence.
--early_stopping=True
 Use this parameter if you are only interested in the first best decoding result. This option has a different effect depending on the used –decoder. For the beam decoder, it means stopping decoding when the best active hypothesis ends with </s>. If false, do not stop until all hypotheses end with EOS. For the dfs and restarting decoders, early stopping enables admissible pruning of branches when the accumulated score already exceeded the currently best score. DO NOT USE early stopping in combination with the dfs or restarting decoder when your predictors can produce positive scores!
--heuristics= Comma-separated list of heuristics to use in heuristic based search like A*.
* 'predictor': Predictor specific heuristics. Some predictors come with their own heuristics - e.g. the fst predictor uses the shortest path to the final state. Using 'predictor' combines the specific heuristics of all selected predictors.
* 'greedy': Do greedy decoding to get the heuristic costs. This is expensive but accurate.
* 'lasttoken': Use the single score of the last token.
* 'stats': Collect unigram statistics during decoding and compare actual hypothesis scores with the product of unigram scores of the used words.
* 'scoreperword': Using this heuristic normalizes the previously accumulated costs by its length. It can be used for beam search with normalized scores, using a capacity (--beam), no other heuristic, and setting --decoder to astar.
Note that all heuristics are inadmissible, i.e. A* is not guaranteed to find the globally best path.
--heuristic_predictors=all
 Comma separated list of indices of predictors considered by the heuristic. For example, if –predictors is set to nmt,length,fst then setting –heuristic_predictors to 0,2 results in using nmt and fst in the heuristics. Use ‘all’ to use all predictors in the heuristics
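
A heuristic search setup following the two options above might look like this (a sketch with placeholder paths): A* search with unlimited queue capacity, where only the second predictor (fst, index 1) contributes its predictor-specific heuristic:

python decode.py --decoder astar --beam 0 --predictors nmt,fst --nmt_path ./models/nmt1 --fst_path fst/%d.fst --heuristics predictor --heuristic_predictors 1 --cache_heuristic_estimates true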
--multiseg_tokenizations=
 This argument must be used when the multisegbeam decoder is activated. For each predictor, it defines the tokenizations used for it (comma separated). If a path to a word map file is provided, the corresponding predictor is operating on the pure word level. The 'mixed:' prefix activates mixed word/character models according to Wu et al. (2016). The 'eow:' prefix assumes explicit </w> specifiers in the word maps which mark the end of words. This is suitable for subword units, e.g. BPE.
--cache_heuristic_estimates=True
 Whether to cache heuristic future cost estimates. This is especially useful with the greedy heuristic.
--pure_heuristic_scores=False
 If this is set to false, heuristic decoders such as A* score hypotheses with the sum of the partial hypo score plus the heuristic estimates (like in standard A*). Set to true to use the heuristic estimates only
--restarting_node_score=difference
 

This parameter defines how the restarting decoder decides from which node to restart.
* 'difference': Restart where the difference between 1-best and 2-best is smallest
* 'absolute': Restart from the unexplored node with the best absolute score globally.
* 'constant': Constant node score. Simulates FILO or uniform distribution with restarting_stochastic.
* 'expansions': Inverse of the number of expansions on the node. Discourages expanding arcs on the same node repeatedly.

Possible choices: difference, absolute, constant, expansions

--low_decoder_memory=True
 Some decoding strategies support modes which do not change the decoding logic, but make use of the inadmissible pruning parameters like max_expansions to reduce memory consumption. This usually requires some computational overhead for cleaning up data structures. Applicable to restarting and bucket decoders.
--stochastic_decoder=False
 Activates stochastic decoders. Applicable to the decoders restarting, bow, bucket
--decode_always_single_step=False
 If this is set to true, heuristic depth first search decoders like restarting or bow always perform a single decoding step instead of greedy decoding. Handle with care...
--flip_strategy=move
 

Defines the hypothesis transition in the flip decoder. ‘flip’ flips two words, ‘move’ moves a word to a different position

Possible choices: move, flip

--bucket_selector=maxscore
 Defines the bucket selection strategy for the bucket decoder.
* 'iter': Rotate through all lengths
* 'iter-n': Rotate through all lengths n times
* 'maxscore': Like iter, but filters buckets with hypos worse than a threshold. The threshold is increased if no bucket is found
* 'score': Select the bucket with the highest bucket score. The bucket score is determined by the bucket_score_strategy
* 'score-end': Start with the bucket with the highest bucket score, and iterate through all subsequent buckets.
--bucket_score_strategy=difference
 

Defines how buckets are scored for the bucket decoder. Usually, the best hypo in the bucket is compared to the global best score of that length according to --collect_statistics.
* 'difference': Difference between both hypos
* 'heap': Use best score on bucket heap directly
* 'absolute': Use best hypo score in bucket directly
* 'constant': Uniform bucket scores.

Possible choices: difference, heap, absolute, constant

--collect_statistics=best
 

Determines over which hypotheses statistics are collected.
* 'best': Collect statistics from the current best full hypothesis
* 'full': Collect statistics from all full hypos
* 'all': Collect statistics also from partial hypos
Applicable to the bucket decoder, the heuristic of the bow predictor, and the heuristic 'stats'.

Possible choices: best, full, all

--heuristic_scores_file=
 The bow predictor heuristic and the stats heuristic sum up the unigram scores of words as heuristic estimate. This option should point to a mapping file from word ID to (unigram) score. If this is empty, the unigram scores are collected during decoding for each sentence separately according to --collect_statistics.
--score_lower_bounds_file=
 Admissible pruning in some decoding strategies can be improved by providing lower bounds on complete hypothesis scores. This is useful to improve the efficiency of exhaustive search, with lower bounds found by e.g. beam search. The expected file format is just a text file with line separated scores for each sentence. Supported by the following decoders: astar, bigramgreedy, bow, bucket, dfs, flip, restarting
--decoder_diversity_factor=-1.0
 If this is greater than zero, promote diversity between active hypotheses during decoding. The exact way of doing this depends on –decoder: * The ‘beam’ decoder roughly follows the approach in Li and Jurafsky, 2016 * The ‘bucket’ decoder reorders the hypotheses in a bucket by penalizing hypotheses with the number of expanded hypotheses from the same parent.
--sync_symbol=-1
 Used for the syncbeam decoder. Synchronization symbol for hypothesis comparison. If negative, use the </w> entry in --trg_cmap.
--max_word_len=25
 Maximum length of a single word. Only applicable to the decoders multisegbeam and syncbeam.
Output options
--nbest=0 Maximum number of hypotheses in the output files. Set to 0 to output all hypotheses found by the decoder. If you use the beam or astar decoder, this option is limited by the beam size.
--output_fst_unk_id=0
 DEPRECATED: Old name for –fst_unk_id
--fst_unk_id=999999998
 SGNMT uses the ID 0 for UNK. However, this clashes with OpenFST when writing FSTs as OpenFST reserves 0 for epsilon arcs. Therefore, we use this ID for UNK instead. Note that this only applies to output FSTs created by the fst or sfst output handler, or FSTs used by the fsttok wrapper. Apart from that, UNK is still represented by the ID 0.
--output_path=sgnmt-out.%s
 Path to the output files generated by SGNMT. You can use the placeholder %s for the format specifier
--outputs= Comma separated list of output formats:
* 'text': First best translations in plain text format
* 'nbest': Moses' n-best format with separate scores for each predictor.
* 'fst': Translation lattices in OpenFST format with sparse tuple arcs.
* 'sfst': Translation lattices in OpenFST format with standard arcs (i.e. combined scores).
The path to the output files can be specified with --output_path
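
For example, the following sketch (placeholder paths) writes the single best translations, a 10-best list, and translation lattices to sgnmt-out.text, sgnmt-out.nbest, and sgnmt-out.fst, respectively:

python decode.py --predictors nmt --nmt_path ./models/nmt1 --src_test test.ids.en --nbest 10 --outputs text,nbest,fst --output_path sgnmt-out.%s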
--remove_eos=True
 Whether to remove </S> symbol on output.
--src_wmap= Path to the source side word map (Format: <word> <id>). This is used to map the words in –src_test to their word IDs. If empty, SGNMT expects the input words to be in integer representation.
--trg_wmap= Path to the target side word map (Format: <word> <id>). This is used to generate log output and the output formats text and nbest. If empty, we directly write word IDs.
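
The word map files are plain text with one '<word> <id>' entry per line. A purely hypothetical fragment (IDs chosen arbitrarily; the reserved IDs depend on --indexing_scheme):

the 4
house 5
is 6

If --src_wmap and --trg_wmap are set, --src_test may contain plain words and the text/nbest outputs contain words rather than IDs; otherwise SGNMT reads and writes integer IDs directly.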
--trg_cmap= Path to the target side char map (Format: <char> <id>). If this is not empty, all output files are converted to character-level. The mapping from word to character sequence is read from –trg_wmap. The char map must contain an entry for </w> which points to the word boundary ID.
General predictor options
--predictors=nmt
 Comma separated list of predictors. Predictors are scoring modules which define a distribution over target words given the history and some side information like the source sentence. If vocabulary sizes differ among predictors, we fill in gaps with predictor UNK scores.
* 'nmt': Neural machine translation predictor. Options: nmt_config, nmt_path, gnmt_beta, nmt_model_selector, cache_nmt_posteriors
* 't2t': Tensor2Tensor predictor. Options: t2t_usr_dir, t2t_model, t2t_problem, t2t_hparams_set, t2t_checkpoint_dir, pred_src_vocab_size, pred_trg_vocab_size
* 'nizza': Nizza alignment models. Options: nizza_model, nizza_hparams_set, nizza_checkpoint_dir, pred_src_vocab_size, pred_trg_vocab_size
* 'bfslayerbylayer': Layerbylayer models (BFS). Options: t2t_usr_dir, t2t_model, t2t_problem, t2t_hparams_set, t2t_checkpoint_dir, syntax_root_id, syntax_max_terminal_id, syntax_terminal_list, syntax_pop_id, layerbylayer_terminal_strategy, syntax_max_depth, pred_src_vocab_size, pred_trg_vocab_size
* 'dfslayerbylayer': Layerbylayer models (DFS). Options: t2t_usr_dir, t2t_model, t2t_problem, t2t_hparams_set, t2t_checkpoint_dir, syntax_root_id, syntax_max_terminal_id, syntax_terminal_list, syntax_pop_id, layerbylayer_terminal_strategy, syntax_max_depth, pred_src_vocab_size, pred_trg_vocab_size
* 'bracket': Well-formed bracketing. Options: syntax_max_terminal_id, syntax_pop_id, syntax_max_depth, extlength_path
* 'srilm': n-gram language model. Options: srilm_path, srilm_order
* 'nplm': Neural n-gram language model (NPLM). Options: nplm_path, normalize_nplm_probs
* 'rnnlm': RNN language model based on TensorFlow. Options: rnnlm_config, rnnlm_path
* 'forced': Forced decoding with one reference. Options: trg_test
* 'forcedlst': Forced decoding with a Moses n-best list (n-best list rescoring). Options: trg_test, forcedlst_sparse_feat, use_nbest_weights
* 'bow': Forced decoding with one bag-of-words reference. Options: trg_test, heuristic_scores_file, bow_heuristic_strategies, bow_accept_subsets, bow_accept_duplicates, pred_trg_vocab_size
* 'bowsearch': Forced decoding with one bag-of-words reference. Options: hypo_recombination, trg_test, heuristic_scores_file, bow_heuristic_strategies, bow_accept_subsets, bow_accept_duplicates, pred_trg_vocab_size
* 'fst': Deterministic translation lattices. Options: fst_path, use_fst_weights, normalize_fst_weights, fst_to_log, fst_skip_bos_weight
* 'nfst': Non-deterministic translation lattices. Options: fst_path, use_fst_weights, normalize_fst_weights, fst_to_log, fst_skip_bos_weight
* 'rtn': Recurrent transition networks as created by HiFST with late expansion. Options: rtn_path, use_rtn_weights, minimize_rtns, remove_epsilon_in_rtns, normalize_rtn_weights
* 'lrhiero': Direct Hiero (left-to-right Hiero). This is an EXPERIMENTAL implementation of LRHiero. Options: rules_path, grammar_feature_weights, use_grammar_weights
* 'wc': Number of words feature. Options: wc_word
* 'unkc': Poisson model for number of UNKs. Options: unk_count_lambdas, pred_src_vocab_size
* 'ngramc': Number of ngrams feature. Options: ngramc_path, ngramc_order
* 'length': Target sentence length model. Options: src_test_raw, length_model_weights, use_length_point_probs
* 'extlength': External target sentence lengths. Options: extlength_path
All predictors can be combined with one or more wrapper predictors by adding the wrapper name separated by a _ symbol. The following wrappers are available:
* 'idxmap': Add this wrapper to predictors which use an alternative word map. Options: src_idxmap, trg_idxmap
* 'altsrc': This wrapper loads source sentences from an alternative source. Options: altsrc_test
* 'unkvocab': This wrapper explicitly excludes matching word indices higher than pred_trg_vocab_size with UNK scores. Options: pred_trg_vocab_size
* 'fsttok': Uses an FST to transduce SGNMT tokens to predictor tokens. Options: fsttok_path, fsttok_max_pending_score, fst_unk_id
* 'word2char': Wraps word-level predictors when SGNMT is running on character level. Options: word2char_map
* 'skipvocab': Skip a subset of the predictor vocabulary. Options: skipvocab_max_id, skipvocab_stop_size
* 'ngramize': Extracts n-gram posteriors from predictors without token-level history. Options: ngramize_min_order, ngramize_max_order, max_len_factor
Note that you can use multiple instances of the same predictor. For example, 'nmt,nmt,nmt' can be used for ensembling three NMT systems. You can often override parts of the predictor configurations for subsequent predictors by adding the predictor number (e.g. see --nmt_config2 or --fst_path2)
--predictor_weights=
 Predictor weights. Have to be specified consistently with --predictors, e.g. if --predictors is 'bla_fst,nmt' then set their weights with --predictor_weights bla-weight_fst-weight,nmt-weight, e.g. '--predictor_weights 0.1_0.3,0.6'. Default (empty string) means that each predictor gets assigned the weight 1.
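
For instance, a weighted combination of an NMT system, a translation lattice, and a word count feature could be set up as follows (a sketch with placeholder paths; for wrapped predictors, the weights of wrapper and wrapped predictor are joined with '_' as in the example above):

python decode.py --predictors nmt,fst,wc --nmt_path ./models/nmt1 --fst_path fst/%d.fst --use_fst_weights true --predictor_weights 0.6,0.3,0.1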
--closed_vocabulary_normalization=none
 

This parameter specifies the way closed vocabulary predictors (e.g. NMT) are normalized. Closed vocabulary means that they have a predefined vocabulary. Open vocabulary predictors (e.g. fst) can potentially produce any word, or have a very large vocabulary.
* 'none': Use unmodified scores for closed vocabulary predictors
* 'exact': Renormalize scores depending on the probability mass which they distribute to words outside the vocabulary via the UNK probability.
* 'rescale_unk': Rescale UNK probabilities and leave all other scores unmodified. Results in a distribution if predictor scores are stochastic.
* 'reduced': Normalize to vocabulary defined by the open vocabulary predictors at each time step.

Possible choices: none, exact, reduced, rescale_unk

--combination_scheme=sum
 

This parameter controls how the combined hypothesis score is calculated from the predictor scores and weights.
* 'sum': The combined score is the weighted sum of all predictor scores
* 'length_norm': Renormalize scores by the length of hypotheses.
* 'bayesian': Apply the Bayesian LM interpolation scheme from Allauzen and Riley to interpolate the predictor scores

Possible choices: sum, length_norm, bayesian

--apply_combination_scheme_to_partial_hypos=False
 If true, apply the combination scheme specified with –combination_scheme after each node expansion. If false, apply it only to complete hypotheses at the end of decoding
--pred_src_vocab_size=30000
 Predictor source vocabulary size. Used by the bow, bowsearch, t2t, nizza, unkc predictors.
--pred_trg_vocab_size=30000
 Predictor target vocabulary size. Used by the bow, bowsearch, t2t, nizza, unkc predictors.
Neural predictor options
--length_normalization=False
 DEPRECATED. Synonym for –combination_scheme length_norm. Normalize n-best hypotheses by sentence length. Normally improves pure NMT decoding, but degrades performance when combined with predictors like fst or multiple NMT systems.
--nmt_config= Defines the configuration of the NMT model. This can either point to a configuration file, or it can directly contain the parameters (e.g. ‘src_vocab_size=1234,trg_vocab_size=2345’). Use ‘config_file=’ in the parameter string to use configuration files with the second method.
--nmt_path= Defines the path to the NMT model. If empty, the model is loaded from the default location which depends on the NMT engine
--nmt_engine=blocks
 

NMT implementation which should be used. Use ‘none’ to disable NMT support.

Possible choices: none, blocks, tensorflow

--nmt_model_selector=bleu
 

NMT training normally creates several files in the ./train/ directory from which we can load the NMT model. Possible options:
* 'params': Load parameters from params.npz. This is usually the most recent model.
* 'bleu': Load from the best_bleu_params_* file with the best BLEU score.
* 'time': Load from the most recent best_bleu_params_* file.

Possible choices: params, bleu, time

--cache_nmt_posteriors=False
 This enables the cache in the [F]NMT predictor. Normally, the search procedure is responsible for avoiding the application of predictors to the same history twice. However, due to the limited NMT vocabulary, two different histories might be the same from the NMT perspective, e.g. if they are the same up to words which are outside the NMT vocabulary. If this parameter is set to true, we cache posteriors with histories containing UNK and reload them when needed
--gnmt_beta=0.0
 If this is greater than zero, add a coverage penalization term following Google’s NMT (Wu et al., 2016) to the NMT score.
--layerbylayer_terminal_strategy=force
 

Strategy for dealing with terminals as parents in layerbylayer predictors with POP attention.
* 'none': Treat terminal parents like any other token.
* 'force': Force the output to the terminal parent label.
* 'skip': Like 'force', but with log(1)=0 scores. This is usually faster, and must be used if the model is trained with use_loss_mask.

Possible choices: none, force, skip

--syntax_max_depth=30
 Maximum depth of generated trees. After this depth is reached, only terminals and POP are allowed on the next layer.
--syntax_root_id=-1
 Must be set for the layerbylayer predictor. ID of the initial target root label.
--syntax_pop_id=-1
 ID of the closing bracket in output syntax trees. layerbylayer and t2t predictors support single integer values. The bracket predictor can take a comma-separated list of integers.
--syntax_max_terminal_id=30003
 All token IDs larger than this are considered to be non-terminal symbols except the ones specified by –syntax_terminal_list
--syntax_terminal_list=
 List of IDs which are explicitly treated as terminals, in addition to all IDs lower than or equal to --syntax_max_terminal_id. This can be used to exclude the POP symbol from the list of non-terminals even though it has an ID higher than max_terminal_id.
--t2t_usr_dir= Available for the t2t predictor. See the –t2t_usr_dir argument in tensor2tensor.
--t2t_model=transformer
 Available for the t2t predictor. Name of the tensor2tensor model.
--t2t_problem=translate_ende_wmt32k
 Available for the t2t predictor. Name of the tensor2tensor problem.
--t2t_hparams_set=transformer_base_single_gpu
 Available for the t2t predictor. Name of the tensor2tensor hparams set.
--t2t_checkpoint_dir=
 Available for the t2t predictor. Path to the tensor2tensor checkpoint directory. Same as –output_dir in t2t_trainer.
--t2t_src_vocab_size=0
 DEPRECATED! Use –pred_src_vocab_size
--t2t_trg_vocab_size=0
 DEPRECATED! Use –pred_trg_vocab_size
--nizza_model=model1
 Available for the nizza predictor. Name of the nizza model.
--nizza_hparams_set=model1_default
 Available for the nizza predictor. Name of the nizza hparams set.
--nizza_checkpoint_dir=
 Available for the nizza predictor. Path to the nizza checkpoint directory. Same as –model_dir in nizza_trainer.
Length predictor options
--src_test_raw=
 Only required for the ‘length’ predictor. Path to original source test set WITHOUT word indices. This is used to extract features for target sentence length predictions
--length_model_weights=
 Only required for length predictor. String of length model parameters.
--use_length_point_probs=False
 If this is true, the length predictor outputs probability 1 for all tokens except </S>. For </S> it uses the point probability given by the length model. If this is set to false, we normalize the predictive score by comparing P(l=x) and P(l<x)
--length_model_offset=0
 The target sentence length model is applied to the hypothesis length minus length_model_offset
--extlength_path=
 Only required for the ‘extlength’ predictor. This is the path to the file which specifies the length distributions for each sentence. Each line consists of blank separated ‘<length>:<logprob>’ pairs.
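
A hypothetical line of such a file, assigning log-probabilities to the lengths 8, 9 and 10 for one sentence, could look like this (values chosen arbitrarily):

8:-1.2 9:-0.7 10:-2.3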
Count predictor options
--unk_count_lambdas=1.0
 Model parameters for the UNK count model: comma-separated list of lambdas for Poisson distributions. The first float specifies the Poisson distribution over the number of UNKs in the hypotheses given that the number of UNKs on the source side is 0. The last lambda specifies the distribution given >=n-1 UNKs in the source sentence.
--wc_word=-1 If negative, the wc predictor counts all words. Otherwise, count only the specific word
--ngramc_path=ngramc/%d.txt
 Only required for the ngramc predictor. The ngramc predictor counts the number of ngrams and multiplies them with the factors defined in the files. The format is one ngram per line '<ngram> : <score>'. You can use the placeholder %d for the sentence index.
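
A hypothetical ngramc file for one sentence (word IDs and scores chosen arbitrarily) might look like this, with one weighted n-gram per line:

12 6 : 0.5
6 431 77 : -0.2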
--ngramc_order=0
 If positive, count only ngrams of the specified order. Otherwise, count all ngrams
--ngramize_min_order=1
 Minimum ngram order for ngramize wrapper
--ngramize_max_order=4
 Maximum ngram order for ngramize wrapper
--ngramc_discount_factor=-1.0
 If this is non-negative, discount ngram counts by this factor each time the ngram is consumed
--skipvocab_max_id=30003
 All tokens above this threshold are skipped by the skipvocab predictor wrapper.
--skipvocab_stop_size=1
 The internal beam search of the skipvocab predictor wrapper stops if the best stop_size scores are for in-vocabulary words (i.e. with index lower than or equal to skipvocab_max_id).
Forced decoding predictor options
--trg_test=test_fr
 Path to target test set (with integer tokens). This is only required for the predictors ‘forced’ and ‘forcedlst’. For ‘forcedlst’ this needs to point to an n-best list in Moses format.
--fr_test= DEPRECATED. Old name for –trg_test
--forcedlst_sparse_feat=
 By default, the forcedlst predictor uses the combined score in the Moses nbest list. Alternatively, for nbest lists in sparse feature format, you can specify the name of the features which should be used instead.
--use_nbest_weights=False
 Only required for forcedlst predictor. Whether to use the scores in n-best lists.
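
Combining these options, n-best list rescoring with an NMT model could be sketched as follows (placeholder paths): the forcedlst predictor restricts decoding to the hypotheses in the Moses-format n-best list and contributes their original scores, while the nmt predictor rescores them:

python decode.py --predictors nmt,forcedlst --nmt_path ./models/nmt1 --trg_test nbest.moses --use_nbest_weights true --outputs nbest --output_path rescored.%s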
--bow_heuristic_strategies=remaining
 Defines the form of heuristic estimates of the bow predictor. Comma-separated list of the following values:
* remaining: Sum up unigram estimates for all words in the bag which haven't been consumed
* consumed: Use the difference between the actual hypothesis score and the sum of unigram estimates of consumed words as score
--bow_accept_subsets=False
 If this is set to false, the bow predictor enforces exact correspondence between bag and words in complete hypotheses. If true, it ensures that hypotheses are consistent with the bag (i.e. do not contain words outside the bag) but do not necessarily have all words in the bag
--bow_accept_duplicates=False
 If this is set to true, the bow predictor allows a word in the bag to appear multiple times, i.e. the exact count of the word is not enforced. Can only be used in conjunction with bow_accept_subsets
--bow_diversity_heuristic_factor=-1.0
 If this is greater than zero, promote diversity between bags via the bow predictor heuristic. Bags which correspond to partial bags of full hypotheses are penalized by this factor.
Wrapper predictor options
--src_idxmap=idxmap.en
 Only required for idxmap wrapper predictor. Path to the source side mapping file. The format is ‘<index> <alternative_index>’. The mapping must be complete and should be a bijection.
--en_idxmap= DEPRECATED. Old name for –src_idxmap
--trg_idxmap=idxmap.fr
 Only required for idxmap wrapper predictor. Path to the target side mapping file. The format is ‘<index> <alternative_index>’. The mapping must be complete and should be a bijection.
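
The idxmap mapping files contain one '<index> <alternative_index>' pair per line and must define a complete bijection over the vocabulary. A purely hypothetical fragment (indices chosen arbitrarily):

0 0
1 1
2 2
3 487
4 3

Here the wrapped predictor would see the index 487 wherever SGNMT uses 3, and so on.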
--fr_idxmap= DEPRECATED. Old name for –trg_idxmap
--altsrc_test=test_en.alt
 Only required for altsrc wrapper predictor. Path to the alternative source sentences.
--word2char_map=word2char.map
 Only required for word2char wrapper predictor. Path to a mapping file from word ID to sequence of character IDs (format: <word-id> <char-id1> <char-id2>...). All character IDs which do not occur in this mapping are treated as word boundary symbols.
--fsttok_path=tok.fst
 For the fsttok wrapper. Defines the path to the FST which transduces sequences of SGNMT tokens (e.g. characters) to predictor tokens (e.g. BPE units). The FST may be non-deterministic and contain epsilons.
--fsttok_max_pending_score=5.0
 Applicable if an FST used by the fsttok wrapper is non-deterministic. In this case, one predictor state may correspond to multiple nodes in the FST. We prune nodes which are this much worse than the best scoring node with the same history.
Hiero predictor options
--rules_path=rules/rules
 Only required for predictor lrhiero. Path to the ruleXtract rules file.
--use_grammar_weights=False
 Whether to use weights in the synchronous grammar for the lrhiero predictor. If set to false, use uniform grammar scores.
--grammar_feature_weights=
 If rules_path points to a factorized rules file (i.e. containing rules associated with a number of features, not only one score) SGNMT uses a weighted sum for them. You can specify the weights for this summation here (comma-separated) or leave it blank to sum them up equally weighted.
(Neural) LM predictor options
--srilm_path=lm/ngram.lm.gz
 Path to the ngram LM file in SRILM format
--srilm_convert_to_ln=False
 Whether to convert SRILM scores from log10 to natural log.
--nplm_path=nplm/nplm.gz
 Path to the NPLM language model
--rnnlm_path=rnnlm/rnn.ckpt
 Path to the RNNLM language model
--rnnlm_config=rnnlm.ini
 Defines the configuration of the RNNLM model. This can either point to a configuration file, or it can directly contain the parameters (e.g. ‘src_vocab_size=1234,trg_vocab_size=2345’). Use ‘config_file=’ in the parameter string to use configuration files with the second method. Use ‘model_name=X’ in the parameter string to use one of the predefined models.
--srilm_order=5
 Order of ngram for srilm predictor
--normalize_nplm_probs=False
 Whether to normalize nplm probabilities over the current unbounded predictor vocabulary.
FST and RTN predictor options
--fst_path=fst/%d.fst
 Only required for the fst and nfst predictors. Sets the path to the OpenFST translation lattices. You can use the placeholder %d for the sentence index.
--rtn_path=rtn/
 Only required for rtn predictor. Sets the path to the RTN directory as created by HiFST
--fst_skip_bos_weight=True
 This option applies to the fst and nfst predictors. Lattices produced by HiFST contain the <S> symbol and often have scores on the corresponding arc. However, SGNMT skips <S>, so this score would otherwise be ignored. Set this option to true to add the <S> scores. This ensures that the complete path scores for the [n]fst and rtn predictors match the corresponding path weights in the original FST as obtained with fstshortestpath.
--fst_to_log=True
 Multiply weights in the FST by -1 to transform them from tropical semiring into logprobs.
--use_fst_weights=False
 Whether to use weights in FSTs for the nfst and fst predictors.
--use_rtn_weights=False
 Whether to use weights in RTNs.
--minimize_rtns=True
 Whether to do determinization, epsilon removal, and minimization after each RTN expansion.
--remove_epsilon_in_rtns=True
 Whether to remove epsilons after RTN expansion.
--normalize_fst_weights=False
 Whether to normalize weights in FSTs. This forces the weights on outgoing edges to sum up to 1. Applicable to fst and nfst predictor.
--normalize_rtn_weights=False
 Whether to normalize weights in RTNs. This forces the weights on outgoing edges to sum up to 1. Applicable to rtn predictor.
Override options
--nmt_config2= If the –predictors string contains more than one nmt predictor, you can specify the configuration for the second one with this parameter. The second nmt predictor inherits all previous settings except for the ones in this parameter.
--nmt_path2= Overrides –nmt_path for the second nmt
--nmt_engine2= Overrides –nmt_engine for the second nmt
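
As an example of these override options, two different NMT systems can be ensembled by listing the nmt predictor twice and pointing the second instance to its own model (a sketch with placeholder paths):

python decode.py --predictors nmt,nmt --nmt_path ./models/ende_big --nmt_path2 ./models/ende_small --predictor_weights 0.5,0.5

The second instance inherits all other nmt settings (e.g. --nmt_config, --nmt_engine) unless they are overridden with the corresponding *2 options.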
--t2t_model2= Overrides –t2t_model for the second t2t predictor
--t2t_problem2=
 Overrides –t2t_problem for the second t2t predictor
--t2t_hparams_set2=
 Overrides –t2t_hparams_set for the second t2t predictor
--t2t_checkpoint_dir2=
 Overrides –t2t_checkpoint_dir for the second t2t predictor
--pred_src_vocab_size2=0
 Overrides –pred_src_vocab_size for the second t2t predictor
--pred_trg_vocab_size2=0
 Overrides –pred_trg_vocab_size for the second t2t predictor
--rnnlm_config2=
 If the –predictors string contains more than one rnnlm predictor, you can specify the configuration for the second one with this parameter. The second rnnlm predictor inherits all previous settings except for the ones in this parameter.
--rnnlm_path2= Overrides --rnnlm_path for the second rnnlm
--src_test2= Overrides –src_test for the second src
--altsrc_test2=
 Overrides –altsrc_test for the second altsrc
--word2char_map2=
 Overrides –word2char_map for the second word2char
--fsttok_path2=
 Overrides –fsttok_path for the second fsttok
--src_idxmap2= Overrides –src_idxmap for the second indexmap
--trg_idxmap2= Overrides –trg_idxmap for the second indexmap
--fst_path2= Overrides –fst_path for the second fst predictor
--forcedlst_sparse_feat2=
 Overrides –forcedlst_sparse_feat for the second forcedlst predictor
--ngramc_path2=
 Overrides –ngramc_path for the second ngramc
--ngramc_order2=0
 Overrides –ngramc_order for the second ngramc
--nmt_config3= If the –predictors string contains more than one nmt predictor, you can specify the configuration for the third one with this parameter. The third nmt predictor inherits all previous settings except for the ones in this parameter.
--nmt_path3= Overrides –nmt_path for the third nmt
--nmt_engine3= Overrides –nmt_engine for the third nmt
--t2t_model3= Overrides –t2t_model for the third t2t predictor
--t2t_problem3=
 Overrides –t2t_problem for the third t2t predictor
--t2t_hparams_set3=
 Overrides –t2t_hparams_set for the third t2t predictor
--t2t_checkpoint_dir3=
 Overrides –t2t_checkpoint_dir for the third t2t predictor
--pred_src_vocab_size3=0
 Overrides –pred_src_vocab_size for the third t2t predictor
--pred_trg_vocab_size3=0
 Overrides –pred_trg_vocab_size for the third t2t predictor
--rnnlm_config3=
 If the –predictors string contains more than one rnnlm predictor, you can specify the configuration for the third one with this parameter. The third rnnlm predictor inherits all previous settings except for the ones in this parameter.
--rnnlm_path3= Overrides --rnnlm_path for the third rnnlm
--src_test3= Overrides –src_test for the third src
--altsrc_test3=
 Overrides –altsrc_test for the third altsrc
--word2char_map3=
 Overrides –word2char_map for the third word2char
--fsttok_path3=
 Overrides –fsttok_path for the third fsttok
--src_idxmap3= Overrides –src_idxmap for the third indexmap
--trg_idxmap3= Overrides –trg_idxmap for the third indexmap
--fst_path3= Overrides –fst_path for the third fst predictor
--forcedlst_sparse_feat3=
 Overrides –forcedlst_sparse_feat for the third forcedlst predictor
--ngramc_path3=
 Overrides –ngramc_path for the third ngramc
--ngramc_order3=0
 Overrides –ngramc_order for the third ngramc
--nmt_config4= If the –predictors string contains more than one nmt predictor, you can specify the configuration for the 4-th one with this parameter. The 4-th nmt predictor inherits all previous settings except for the ones in this parameter.
--nmt_path4= Overrides –nmt_path for the 4-th nmt
--nmt_engine4= Overrides –nmt_engine for the 4-th nmt
--t2t_model4= Overrides –t2t_model for the 4-th t2t predictor
--t2t_problem4=
 Overrides –t2t_problem for the 4-th t2t predictor
--t2t_hparams_set4=
 Overrides –t2t_hparams_set for the 4-th t2t predictor
--t2t_checkpoint_dir4=
 Overrides –t2t_checkpoint_dir for the 4-th t2t predictor
--pred_src_vocab_size4=0
 Overrides –pred_src_vocab_size for the 4-th t2t predictor
--pred_trg_vocab_size4=0
 Overrides –pred_trg_vocab_size for the 4-th t2t predictor
--rnnlm_config4=
 If the –predictors string contains more than one rnnlm predictor, you can specify the configuration for the 4-th one with this parameter. The 4-th rnnlm predictor inherits all previous settings except for the ones in this parameter.
--rnnlm_path4= Overrides --rnnlm_path for the 4-th rnnlm
--src_test4= Overrides –src_test for the 4-th src
--altsrc_test4=
 Overrides –altsrc_test for the 4-th altsrc
--word2char_map4=
 Overrides –word2char_map for the 4-th word2char
--fsttok_path4=
 Overrides –fsttok_path for the 4-th fsttok
--src_idxmap4= Overrides –src_idxmap for the 4-th indexmap
--trg_idxmap4= Overrides –trg_idxmap for the 4-th indexmap
--fst_path4= Overrides –fst_path for the 4-th fst predictor
--forcedlst_sparse_feat4=
 Overrides –forcedlst_sparse_feat for the 4-th forcedlst predictor
--ngramc_path4=
 Overrides –ngramc_path for the 4-th ngramc
--ngramc_order4=0
 Overrides –ngramc_order for the 4-th ngramc
--nmt_config5= If the --predictors string contains more than four nmt predictors, you can specify the configuration for the fifth one with this parameter. The fifth nmt predictor inherits all previous settings except for the ones in this parameter.
--nmt_path5= Overrides --nmt_path for the fifth nmt
--nmt_engine5= Overrides --nmt_engine for the fifth nmt
--t2t_model5= Overrides --t2t_model for the fifth t2t predictor
--t2t_problem5=
 Overrides --t2t_problem for the fifth t2t predictor
--t2t_hparams_set5=
 Overrides --t2t_hparams_set for the fifth t2t predictor
--t2t_checkpoint_dir5=
 Overrides --t2t_checkpoint_dir for the fifth t2t predictor
--pred_src_vocab_size5=0
 Overrides --pred_src_vocab_size for the fifth t2t predictor
--pred_trg_vocab_size5=0
 Overrides --pred_trg_vocab_size for the fifth t2t predictor
--rnnlm_config5=
 If the --predictors string contains more than four rnnlm predictors, you can specify the configuration for the fifth one with this parameter. The fifth rnnlm predictor inherits all previous settings except for the ones in this parameter.
--rnnlm_path5= Overrides --rnnlm_path for the fifth rnnlm
--src_test5= Overrides --src_test for the fifth src
--altsrc_test5=
 Overrides --altsrc_test for the fifth altsrc
--word2char_map5=
 Overrides --word2char_map for the fifth word2char
--fsttok_path5=
 Overrides --fsttok_path for the fifth fsttok
--src_idxmap5= Overrides --src_idxmap for the fifth indexmap
--trg_idxmap5= Overrides --trg_idxmap for the fifth indexmap
--fst_path5= Overrides --fst_path for the fifth fst predictor
--forcedlst_sparse_feat5=
 Overrides --forcedlst_sparse_feat for the fifth forcedlst predictor
--ngramc_path5=
 Overrides --ngramc_path for the fifth ngramc
--ngramc_order5=0
 Overrides --ngramc_order for the fifth ngramc
--nmt_config6= If the --predictors string contains more than five nmt predictors, you can specify the configuration for the sixth one with this parameter. The sixth nmt predictor inherits all previous settings except for the ones in this parameter.
--nmt_path6= Overrides --nmt_path for the sixth nmt
--nmt_engine6= Overrides --nmt_engine for the sixth nmt
--t2t_model6= Overrides --t2t_model for the sixth t2t predictor
--t2t_problem6=
 Overrides --t2t_problem for the sixth t2t predictor
--t2t_hparams_set6=
 Overrides --t2t_hparams_set for the sixth t2t predictor
--t2t_checkpoint_dir6=
 Overrides --t2t_checkpoint_dir for the sixth t2t predictor
--pred_src_vocab_size6=0
 Overrides --pred_src_vocab_size for the sixth t2t predictor
--pred_trg_vocab_size6=0
 Overrides --pred_trg_vocab_size for the sixth t2t predictor
--rnnlm_config6=
 If the --predictors string contains more than five rnnlm predictors, you can specify the configuration for the sixth one with this parameter. The sixth rnnlm predictor inherits all previous settings except for the ones in this parameter.
--rnnlm_path6= Overrides --rnnlm_path for the sixth rnnlm
--src_test6= Overrides --src_test for the sixth src
--altsrc_test6=
 Overrides --altsrc_test for the sixth altsrc
--word2char_map6=
 Overrides --word2char_map for the sixth word2char
--fsttok_path6=
 Overrides --fsttok_path for the sixth fsttok
--src_idxmap6= Overrides --src_idxmap for the sixth indexmap
--trg_idxmap6= Overrides --trg_idxmap for the sixth indexmap
--fst_path6= Overrides --fst_path for the sixth fst predictor
--forcedlst_sparse_feat6=
 Overrides --forcedlst_sparse_feat for the sixth forcedlst predictor
--ngramc_path6=
 Overrides --ngramc_path for the sixth ngramc
--ngramc_order6=0
 Overrides --ngramc_order for the sixth ngramc
--nmt_config7= If the --predictors string contains more than six nmt predictors, you can specify the configuration for the seventh one with this parameter. The seventh nmt predictor inherits all previous settings except for the ones in this parameter.
--nmt_path7= Overrides --nmt_path for the seventh nmt
--nmt_engine7= Overrides --nmt_engine for the seventh nmt
--t2t_model7= Overrides --t2t_model for the seventh t2t predictor
--t2t_problem7=
 Overrides --t2t_problem for the seventh t2t predictor
--t2t_hparams_set7=
 Overrides --t2t_hparams_set for the seventh t2t predictor
--t2t_checkpoint_dir7=
 Overrides --t2t_checkpoint_dir for the seventh t2t predictor
--pred_src_vocab_size7=0
 Overrides --pred_src_vocab_size for the seventh t2t predictor
--pred_trg_vocab_size7=0
 Overrides --pred_trg_vocab_size for the seventh t2t predictor
--rnnlm_config7=
 If the --predictors string contains more than six rnnlm predictors, you can specify the configuration for the seventh one with this parameter. The seventh rnnlm predictor inherits all previous settings except for the ones in this parameter.
--rnnlm_path7= Overrides --rnnlm_path for the seventh rnnlm
--src_test7= Overrides --src_test for the seventh src
--altsrc_test7=
 Overrides --altsrc_test for the seventh altsrc
--word2char_map7=
 Overrides --word2char_map for the seventh word2char
--fsttok_path7=
 Overrides --fsttok_path for the seventh fsttok
--src_idxmap7= Overrides --src_idxmap for the seventh indexmap
--trg_idxmap7= Overrides --trg_idxmap for the seventh indexmap
--fst_path7= Overrides --fst_path for the seventh fst predictor
--forcedlst_sparse_feat7=
 Overrides --forcedlst_sparse_feat for the seventh forcedlst predictor
--ngramc_path7=
 Overrides --ngramc_path for the seventh ngramc
--ngramc_order7=0
 Overrides --ngramc_order for the seventh ngramc
--nmt_config8= If the --predictors string contains more than seven nmt predictors, you can specify the configuration for the eighth one with this parameter. The eighth nmt predictor inherits all previous settings except for the ones in this parameter.
--nmt_path8= Overrides --nmt_path for the eighth nmt
--nmt_engine8= Overrides --nmt_engine for the eighth nmt
--t2t_model8= Overrides --t2t_model for the eighth t2t predictor
--t2t_problem8=
 Overrides --t2t_problem for the eighth t2t predictor
--t2t_hparams_set8=
 Overrides --t2t_hparams_set for the eighth t2t predictor
--t2t_checkpoint_dir8=
 Overrides --t2t_checkpoint_dir for the eighth t2t predictor
--pred_src_vocab_size8=0
 Overrides --pred_src_vocab_size for the eighth t2t predictor
--pred_trg_vocab_size8=0
 Overrides --pred_trg_vocab_size for the eighth t2t predictor
--rnnlm_config8=
 If the --predictors string contains more than seven rnnlm predictors, you can specify the configuration for the eighth one with this parameter. The eighth rnnlm predictor inherits all previous settings except for the ones in this parameter.
--rnnlm_path8= Overrides --rnnlm_path for the eighth rnnlm
--src_test8= Overrides --src_test for the eighth src
--altsrc_test8=
 Overrides --altsrc_test for the eighth altsrc
--word2char_map8=
 Overrides --word2char_map for the eighth word2char
--fsttok_path8=
 Overrides --fsttok_path for the eighth fsttok
--src_idxmap8= Overrides --src_idxmap for the eighth indexmap
--trg_idxmap8= Overrides --trg_idxmap for the eighth indexmap
--fst_path8= Overrides --fst_path for the eighth fst predictor
--forcedlst_sparse_feat8=
 Overrides --forcedlst_sparse_feat for the eighth forcedlst predictor
--ngramc_path8=
 Overrides --ngramc_path for the eighth ngramc
--ngramc_order8=0
 Overrides --ngramc_order for the eighth ngramc
--nmt_config9= If the --predictors string contains more than eight nmt predictors, you can specify the configuration for the ninth one with this parameter. The ninth nmt predictor inherits all previous settings except for the ones in this parameter.
--nmt_path9= Overrides --nmt_path for the ninth nmt
--nmt_engine9= Overrides --nmt_engine for the ninth nmt
--t2t_model9= Overrides --t2t_model for the ninth t2t predictor
--t2t_problem9=
 Overrides --t2t_problem for the ninth t2t predictor
--t2t_hparams_set9=
 Overrides --t2t_hparams_set for the ninth t2t predictor
--t2t_checkpoint_dir9=
 Overrides --t2t_checkpoint_dir for the ninth t2t predictor
--pred_src_vocab_size9=0
 Overrides --pred_src_vocab_size for the ninth t2t predictor
--pred_trg_vocab_size9=0
 Overrides --pred_trg_vocab_size for the ninth t2t predictor
--rnnlm_config9=
 If the --predictors string contains more than eight rnnlm predictors, you can specify the configuration for the ninth one with this parameter. The ninth rnnlm predictor inherits all previous settings except for the ones in this parameter.
--rnnlm_path9= Overrides --rnnlm_path for the ninth rnnlm
--src_test9= Overrides --src_test for the ninth src
--altsrc_test9=
 Overrides --altsrc_test for the ninth altsrc
--word2char_map9=
 Overrides --word2char_map for the ninth word2char
--fsttok_path9=
 Overrides --fsttok_path for the ninth fsttok
--src_idxmap9= Overrides --src_idxmap for the ninth indexmap
--trg_idxmap9= Overrides --trg_idxmap for the ninth indexmap
--fst_path9= Overrides --fst_path for the ninth fst predictor
--forcedlst_sparse_feat9=
 Overrides --forcedlst_sparse_feat for the ninth forcedlst predictor
--ngramc_path9=
 Overrides --ngramc_path for the ninth ngramc
--ngramc_order9=0
 Overrides --ngramc_order for the ninth ngramc
--nmt_config10=
 If the --predictors string contains more than nine nmt predictors, you can specify the configuration for the tenth one with this parameter. The tenth nmt predictor inherits all previous settings except for the ones in this parameter.
--nmt_path10= Overrides --nmt_path for the tenth nmt
--nmt_engine10=
 Overrides --nmt_engine for the tenth nmt
--t2t_model10= Overrides --t2t_model for the tenth t2t predictor
--t2t_problem10=
 Overrides --t2t_problem for the tenth t2t predictor
--t2t_hparams_set10=
 Overrides --t2t_hparams_set for the tenth t2t predictor
--t2t_checkpoint_dir10=
 Overrides --t2t_checkpoint_dir for the tenth t2t predictor
--pred_src_vocab_size10=0
 Overrides --pred_src_vocab_size for the tenth t2t predictor
--pred_trg_vocab_size10=0
 Overrides --pred_trg_vocab_size for the tenth t2t predictor
--rnnlm_config10=
 If the --predictors string contains more than nine rnnlm predictors, you can specify the configuration for the tenth one with this parameter. The tenth rnnlm predictor inherits all previous settings except for the ones in this parameter.
--rnnlm_path10=
 Overrides --rnnlm_path for the tenth rnnlm
--src_test10= Overrides --src_test for the tenth src
--altsrc_test10=
 Overrides --altsrc_test for the tenth altsrc
--word2char_map10=
 Overrides --word2char_map for the tenth word2char
--fsttok_path10=
 Overrides --fsttok_path for the tenth fsttok
--src_idxmap10=
 Overrides --src_idxmap for the tenth indexmap
--trg_idxmap10=
 Overrides --trg_idxmap for the tenth indexmap
--fst_path10= Overrides --fst_path for the tenth fst predictor
--forcedlst_sparse_feat10=
 Overrides --forcedlst_sparse_feat for the tenth forcedlst predictor
--ngramc_path10=
 Overrides --ngramc_path for the tenth ngramc
--ngramc_order10=0
 Overrides --ngramc_order for the tenth ngramc
--nmt_config11=
 If the --predictors string contains more than ten nmt predictors, you can specify the configuration for the eleventh one with this parameter. The eleventh nmt predictor inherits all previous settings except for the ones in this parameter.
--nmt_path11= Overrides --nmt_path for the eleventh nmt
--nmt_engine11=
 Overrides --nmt_engine for the eleventh nmt
--t2t_model11= Overrides --t2t_model for the eleventh t2t predictor
--t2t_problem11=
 Overrides --t2t_problem for the eleventh t2t predictor
--t2t_hparams_set11=
 Overrides --t2t_hparams_set for the eleventh t2t predictor
--t2t_checkpoint_dir11=
 Overrides --t2t_checkpoint_dir for the eleventh t2t predictor
--pred_src_vocab_size11=0
 Overrides --pred_src_vocab_size for the eleventh t2t predictor
--pred_trg_vocab_size11=0
 Overrides --pred_trg_vocab_size for the eleventh t2t predictor
--rnnlm_config11=
 If the --predictors string contains more than ten rnnlm predictors, you can specify the configuration for the eleventh one with this parameter. The eleventh rnnlm predictor inherits all previous settings except for the ones in this parameter.
--rnnlm_path11=
 Overrides --rnnlm_path for the eleventh rnnlm
--src_test11= Overrides --src_test for the eleventh src
--altsrc_test11=
 Overrides --altsrc_test for the eleventh altsrc
--word2char_map11=
 Overrides --word2char_map for the eleventh word2char
--fsttok_path11=
 Overrides --fsttok_path for the eleventh fsttok
--src_idxmap11=
 Overrides --src_idxmap for the eleventh indexmap
--trg_idxmap11=
 Overrides --trg_idxmap for the eleventh indexmap
--fst_path11= Overrides --fst_path for the eleventh fst predictor
--forcedlst_sparse_feat11=
 Overrides --forcedlst_sparse_feat for the eleventh forcedlst predictor
--ngramc_path11=
 Overrides --ngramc_path for the eleventh ngramc
--ngramc_order11=0
 Overrides --ngramc_order for the eleventh ngramc
--nmt_config12=
 If the --predictors string contains more than eleven nmt predictors, you can specify the configuration for the twelfth one with this parameter. The twelfth nmt predictor inherits all previous settings except for the ones in this parameter.
--nmt_path12= Overrides --nmt_path for the twelfth nmt
--nmt_engine12=
 Overrides --nmt_engine for the twelfth nmt
--t2t_model12= Overrides --t2t_model for the twelfth t2t predictor
--t2t_problem12=
 Overrides --t2t_problem for the twelfth t2t predictor
--t2t_hparams_set12=
 Overrides --t2t_hparams_set for the twelfth t2t predictor
--t2t_checkpoint_dir12=
 Overrides --t2t_checkpoint_dir for the twelfth t2t predictor
--pred_src_vocab_size12=0
 Overrides --pred_src_vocab_size for the twelfth t2t predictor
--pred_trg_vocab_size12=0
 Overrides --pred_trg_vocab_size for the twelfth t2t predictor
--rnnlm_config12=
 If the --predictors string contains more than eleven rnnlm predictors, you can specify the configuration for the twelfth one with this parameter. The twelfth rnnlm predictor inherits all previous settings except for the ones in this parameter.
--rnnlm_path12=
 Overrides --rnnlm_path for the twelfth rnnlm
--src_test12= Overrides --src_test for the twelfth src
--altsrc_test12=
 Overrides --altsrc_test for the twelfth altsrc
--word2char_map12=
 Overrides --word2char_map for the twelfth word2char
--fsttok_path12=
 Overrides --fsttok_path for the twelfth fsttok
--src_idxmap12=
 Overrides --src_idxmap for the twelfth indexmap
--trg_idxmap12=
 Overrides --trg_idxmap for the twelfth indexmap
--fst_path12= Overrides --fst_path for the twelfth fst predictor
--forcedlst_sparse_feat12=
 Overrides --forcedlst_sparse_feat for the twelfth forcedlst predictor
--ngramc_path12=
 Overrides --ngramc_path for the twelfth ngramc
--ngramc_order12=0
 Overrides --ngramc_order for the twelfth ngramc
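
As an illustrative sketch of these per-predictor overrides (all paths and weights below are placeholders), an ensemble of three t2t predictors can be configured by listing the predictor three times and overriding the checkpoint directory for the second and third instance:

python decode.py --predictors t2t,t2t,t2t \
                 --t2t_checkpoint_dir /path/to/model1 \
                 --t2t_checkpoint_dir2 /path/to/model2 \
                 --t2t_checkpoint_dir3 /path/to/model3 \
                 --predictor_weights 0.4,0.3,0.3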

Batch Decoding (Blocks only)

This is a fast decoder for pure NMT which does not process sentences sequentially. It is optimized for GPU decoding. For maximum decoding speed we recommend the Theano flags lib.cnmem=1,allow_gc=False and the most recent versions of cuDNN and Theano. This decoder is not implemented for the TensorFlow engine.
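
For example, a GPU batch decoding run might be invoked along the following lines (the Theano flag string and the test set path are only illustrative):

THEANO_FLAGS="device=gpu,floatX=float32,lib.cnmem=1,allow_gc=False" \
    python batch_decode.py --src_test test.ids.en --beam 5 --enc_max_words 5000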

usage: batch_decode.py [-h] [--src_test SRC_TEST] [--range RANGE]
                       [--enc_max_words ENC_MAX_WORDS] [--min_jobs MIN_JOBS]
                       [--max_tasks_per_job MAX_TASKS_PER_JOB]
                       [--max_tasks_per_state_update_job MAX_TASKS_PER_STATE_UPDATE_JOB]
                       [--max_rows_per_job MAX_ROWS_PER_JOB]
                       [--min_tasks_per_bucket MIN_TASKS_PER_BUCKET]
                       [--min_bucket_tolerance MIN_BUCKET_TOLERANCE]
                       [--beam BEAM] [--enc_nhids ENC_NHIDS]
                       [--enc_layers ENC_LAYERS]
                       [--store_full_main_loop STORE_FULL_MAIN_LOOP]
                       [--dec_share_weights DEC_SHARE_WEIGHTS]
                       [--src_mono_data SRC_MONO_DATA]
                       [--weight_scale WEIGHT_SCALE]
                       [--memory_size MEMORY_SIZE]
                       [--sort_k_batches SORT_K_BATCHES] [--val_set VAL_SET]
                       [--dec_init DEC_INIT]
                       [--weight_noise_rec WEIGHT_NOISE_REC]
                       [--src_vocab_size SRC_VOCAB_SIZE]
                       [--save_freq SAVE_FREQ]
                       [--fix_embeddings FIX_EMBEDDINGS]
                       [--val_set_grndtruth VAL_SET_GRNDTRUTH]
                       [--enc_embed ENC_EMBED] [--dec_nhids DEC_NHIDS]
                       [--finish_after FINISH_AFTER] [--saveto SAVETO]
                       [--att_nhids ATT_NHIDS] [--memory MEMORY]
                       [--trg_data TRG_DATA]
                       [--dec_attention_sources DEC_ATTENTION_SOURCES]
                       [--step_clipping STEP_CLIPPING]
                       [--annotations ANNOTATIONS]
                       [--enc_skip_connections ENC_SKIP_CONNECTIONS]
                       [--val_set_out VAL_SET_OUT] [--val_burn_in VAL_BURN_IN]
                       [--step_rule STEP_RULE] [--attention ATTENTION]
                       [--trg_vocab_size TRG_VOCAB_SIZE]
                       [--batch_size BATCH_SIZE] [--dec_layers DEC_LAYERS]
                       [--src_sparse_feat_map SRC_SPARSE_FEAT_MAP]
                       [--enc_share_weights ENC_SHARE_WEIGHTS]
                       [--output_val_set OUTPUT_VAL_SET]
                       [--dec_readout_sources DEC_READOUT_SOURCES]
                       [--src_data SRC_DATA]
                       [--trg_sparse_feat_map TRG_SPARSE_FEAT_MAP]
                       [--dropout DROPOUT] [--bleu_val_freq BLEU_VAL_FREQ]
                       [--maxout_nhids MAXOUT_NHIDS]
                       [--trg_mono_data TRG_MONO_DATA]
                       [--normalized_bleu NORMALIZED_BLEU] [--reload RELOAD]
                       [--beam_size BEAM_SIZE] [--dec_embed DEC_EMBED]
                       [--bleu_script BLEU_SCRIPT] [--seq_len SEQ_LEN]
                       [--weight_noise_ff WEIGHT_NOISE_FF]
optional arguments
--src_test=test_en
 Path to source test set. This is expected to be a plain text file with one source sentence in each line. Words need to be indexed, i.e. use word IDs instead of their string representations.
--range= Defines the range of sentences to be processed. The syntax is equal to HiFST's printstrings and lmert's idxrange parameter: <start-idx>:<end-idx> (both inclusive, starting with 1). E.g. 2:5 means: skip the first sentence and process the next 4 sentences
--enc_max_words=5000
 Maximum number of words in an encoder batch. These batches compute source side annotations. Encoder batches are clustered by source sentence length, so smaller batches are possible.
--min_jobs=2 The CPU scheduler starts to construct small jobs when the total number of jobs in the pipelines is below this threshold. This prevents the computation thread from being idle, at the cost of smaller batches
--max_tasks_per_job=450
 The maximum number of tasks in a single decoder batch. Larger batches can exploit GPU parallelism more efficiently, but limit the flexibility of the CPU scheduler
--max_tasks_per_state_update_job=100
 Maximum number of tasks in a state update batch. Larger batches are more efficient to compute on the GPU, but delaying state updates for too long may lead to smaller forward pass jobs.
--max_rows_per_job=20
 Maximum number of entries in a forward pass batch. Note that each task in the batch gets at least one entry, so this parameter applies only if fewer tasks than this threshold are left.
--min_tasks_per_bucket=100
 Minimum number of tasks in a bucket. Large buckets give the CPU scheduler more flexibility, but more padding may be required on the source side, leading to more wasted computation.
--min_bucket_tolerance=8
 Minimum padding width in a bucket. Increasing this leads to larger buckets, more flexible scheduling, and larger batches, but potentially more wasteful state update computation due to padding.
--beam=5 Size of the beam.
--enc_nhids=1000
 Number of hidden units in encoder GRU
--enc_layers=1 Number of encoder layers
--store_full_main_loop=False
 Old style archives (not recommended)
--dec_share_weights=True
 Whether to share weights in deep decoders
--src_mono_data=./data/mono.ids.en
 Source language monolingual data (for use see --mono_data_integration)
--weight_scale=0.01
 Std of weight initialization
--memory_size=500
 Size of external memory structure
--sort_k_batches=12
 This many batches will be read ahead and sorted
--val_set=./data/dev.ids.en
 Validation set source file
--dec_init=last
 Decoder state initialisation: last, average, constant
--weight_noise_rec=False
 Weight noise flag for recurrent layers
--src_vocab_size=30003
 Source vocab size, including special tokens
--save_freq=750
 Save model after this many updates
--fix_embeddings=False
 Fix embeddings during training
--val_set_grndtruth=./data/dev.ids.fr
 Validation set gold file
--enc_embed=620
 Dimension of the word embedding matrix in encoder
--dec_nhids=1000
 Number of hidden units in decoder GRU
--finish_after=1000000
 Maximum number of updates
--saveto=./train
 Where to save model, same as ‘prefix’ in groundhog
--att_nhids=-1 Dimensionality of attention match vector (-1 to use dec_nhids)
--memory=none External memory: none, stack
--trg_data=./data/train.ids.shuf.fr
 Target dataset
--dec_attention_sources=s
 Sources used by attention: f for feedback, s for decoder states
--step_clipping=1.0
 Gradient clipping threshold
--annotations=direct
 Annotation strategy (comma-separated): direct, hierarchical
--enc_skip_connections=False
 Add skip connection in deep encoders
--val_set_out=./train/validation_out.txt
 Validation output file
--val_burn_in=80000
 Start bleu validation after this many updates
--step_rule=AdaDelta
 Optimization step rule
--attention=content
 Attention mechanism: none, content, nbest-<n>, coverage-<n>, tree, content-<n>
--trg_vocab_size=30003
 Target vocab size, including special tokens
--batch_size=80
 Batch size
--dec_layers=1 Number of decoder layers (NOT IMPLEMENTED for != 1)
--src_sparse_feat_map=
 Mapping files for using sparse feature word representations on the source side
--enc_share_weights=True
 Whether to share weights in deep encoders
--output_val_set=True
 Print validation output to file
--dec_readout_sources=sfa
 Sources used by readout network: f for feedback, s for decoder states, a for attention (context vector)
--src_data=./data/train.ids.shuf.en
 Source dataset
--trg_sparse_feat_map=
 Mapping files for using sparse feature word representations on the target side
--dropout=1.0 Dropout ratio, applied only after readout maxout
--bleu_val_freq=6000
 Validate bleu after this many updates
--maxout_nhids=-1
 Dimensionality of maxout output layer (-1 to use dec_nhids)
--trg_mono_data=./data/mono.ids.fr
 Target language monolingual data (for use see --mono_data_integration)
--normalized_bleu=True
 Length normalization IN TRAINING
--reload=True Reload the model from files if they exist
--beam_size=12 Beam-size for decoding DURING TRAINING
--dec_embed=620
 Dimension of the word embedding matrix in decoder
--bleu_script=perl ../scripts/multi-bleu.perl %s <
 BLEU script used during training for model selection
--seq_len=50 Sequences longer than this will be discarded
--weight_noise_ff=0.0
 Weight noise flag for feed forward layers

Training (Blocks only)

The training script follows the NMT training example in Blocks, but adds options for reshuffling the training data between epochs and for fixing the word embeddings, which can be useful in later training stages.
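
A later training stage that reshuffles the data and keeps the word embeddings fixed might be started roughly as follows (a sketch; the data paths simply repeat the defaults listed below):

python train.py --reshuffle \
                --fix_embeddings True \
                --src_data ./data/train.ids.shuf.en \
                --trg_data ./data/train.ids.shuf.fr \
                --saveto ./train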

usage: train.py [-h] [--bokeh] [--reshuffle] [--slim_iteration_state]
                [--reset_epoch] [--mono_data_integration {none}]
                [--loss {default,gleu}]
                [--add_mono_dummy_data ADD_MONO_DUMMY_DATA]
                [--backtrans_nmt_config BACKTRANS_NMT_CONFIG]
                [--backtrans_reload_frequency BACKTRANS_RELOAD_FREQUENCY]
                [--backtrans_store BACKTRANS_STORE]
                [--backtrans_max_same_word BACKTRANS_MAX_SAME_WORD]
                [--learning_rate LEARNING_RATE] [--prune_every PRUNE_EVERY]
                [--prune_reset_every PRUNE_RESET_EVERY]
                [--prune_n_steps PRUNE_N_STEPS] [--prune_layers PRUNE_LAYERS]
                [--prune_layout_path PRUNE_LAYOUT_PATH]
                [--sampling_freq SAMPLING_FREQ] [--hook_samples HOOK_SAMPLES]
                [--enc_nhids ENC_NHIDS] [--enc_layers ENC_LAYERS]
                [--store_full_main_loop STORE_FULL_MAIN_LOOP]
                [--dec_share_weights DEC_SHARE_WEIGHTS]
                [--src_mono_data SRC_MONO_DATA] [--weight_scale WEIGHT_SCALE]
                [--memory_size MEMORY_SIZE] [--sort_k_batches SORT_K_BATCHES]
                [--val_set VAL_SET] [--dec_init DEC_INIT]
                [--weight_noise_rec WEIGHT_NOISE_REC]
                [--src_vocab_size SRC_VOCAB_SIZE] [--save_freq SAVE_FREQ]
                [--fix_embeddings FIX_EMBEDDINGS]
                [--val_set_grndtruth VAL_SET_GRNDTRUTH]
                [--enc_embed ENC_EMBED] [--dec_nhids DEC_NHIDS]
                [--finish_after FINISH_AFTER] [--saveto SAVETO]
                [--att_nhids ATT_NHIDS] [--memory MEMORY]
                [--trg_data TRG_DATA]
                [--dec_attention_sources DEC_ATTENTION_SOURCES]
                [--step_clipping STEP_CLIPPING] [--annotations ANNOTATIONS]
                [--enc_skip_connections ENC_SKIP_CONNECTIONS]
                [--val_set_out VAL_SET_OUT] [--val_burn_in VAL_BURN_IN]
                [--step_rule STEP_RULE] [--attention ATTENTION]
                [--trg_vocab_size TRG_VOCAB_SIZE] [--batch_size BATCH_SIZE]
                [--dec_layers DEC_LAYERS]
                [--src_sparse_feat_map SRC_SPARSE_FEAT_MAP]
                [--enc_share_weights ENC_SHARE_WEIGHTS]
                [--output_val_set OUTPUT_VAL_SET]
                [--dec_readout_sources DEC_READOUT_SOURCES]
                [--src_data SRC_DATA]
                [--trg_sparse_feat_map TRG_SPARSE_FEAT_MAP]
                [--dropout DROPOUT] [--bleu_val_freq BLEU_VAL_FREQ]
                [--maxout_nhids MAXOUT_NHIDS] [--trg_mono_data TRG_MONO_DATA]
                [--normalized_bleu NORMALIZED_BLEU] [--reload RELOAD]
                [--beam_size BEAM_SIZE] [--dec_embed DEC_EMBED]
                [--bleu_script BLEU_SCRIPT] [--seq_len SEQ_LEN]
                [--weight_noise_ff WEIGHT_NOISE_FF]
optional arguments
--bokeh=False Use bokeh server for plotting
--reshuffle=False
 Reshuffle before each epoch
--slim_iteration_state=False
 By default, the iteration state stores the data stream and the main loop epoch iterator. Enabling this option stores only the epoch iterator. This results in a much smaller iteration state, but the data stream is reset after reloading. Normally, you can use slim iteration states if your data stream does reshuffling.
--reset_epoch=False
 Set epoch_started in the main loop status to false. This is sometimes required if you change training parameters such as --mono_data_integration.
--mono_data_integration=none
 This parameter specifies how to use monolingual data. Currently, we only support using the target data.
 * ‘none’: Do not use monolingual data
 Possible choices: none
--loss=default
 Training loss function.
 * ‘default’: Standard loss function: squared error with target feature maps, otherwise cross entropy
 * ‘gleu’: Reinforcement learning objective function as proposed by Wu et al., 2016 (Google’s NMT)
 Possible choices: default, gleu
--add_mono_dummy_data=True
 If the method specified with mono_data_integration uses monolingual data, it usually combines synthetic and dummy source sentences. Set this to false to disable dummy source sentences.
--backtrans_nmt_config=
 A string describing the configuration of the back-translating NMT system. Syntax is equal to nmt_config2 in decode.py: Comma separated list of name-value pairs, where name is one of the NMT configuration parameters. E.g. saveto=train.back,src_vocab_size=50000,trg_vocab_size=50000
--backtrans_reload_frequency=0
 The back-translating NMT model is reloaded every n updates. This is useful if the back-translating NMT system is currently trained by itself with the same policy. This enables us to train two NMT systems in opposite translation directions and benefit from gains in the other system immediately. Set to 0 to disable reloading
--backtrans_store=True
 Write the back-translated sentences to the file system.
--backtrans_max_same_word=0.3
 Used for sanity check of the backtranslation. If the most frequent word in the backtranslated sentence has relative frequency higher than this, discard this sentence pair
--learning_rate=0.002
 Learning rate for AdaGrad and Adam
--prune_every=-1
 Prune model every n iterations. Pruning is disabled if this is < 1
--prune_reset_every=-1
 Reset pruning statistics every n iterations. If set to -1, use --prune_every
--prune_n_steps=10
 Number of pruning steps until the target layer sizes should be reached
--prune_layers=encfwdgru:1000,encbwdgru:1000,decgru:1000
 A comma-separated list of <layer>:<size> pairs. <layer> is one of ‘encfwdgru’, ‘encbwdgru’, ‘decgru’, ‘decmaxout’ and names a layer which should be shrunk to <size> during training. Pruned neurons are marked by setting all in- and output connections to zero. See the example invocation at the end of this options list.
--prune_layout_path=prune.layout
 Points to a file which defines which weight matrices are connected to which prunable layers. The rows/columns of these matrices are set to zero for all removed neurons. The format of this file is <layer> <in|out> <mat_name> <dim> <start-idx>=0.0, where <layer> is one of the layer names specified via --prune_layers. Set <start-idx> to 0.5 to add an offset of half the matrix dimension to the indices.
--sampling_freq=13
 NOT USED, just to prevent old code from breaking
--hook_samples=0
 NOT USED, just to prevent old code from breaking
--enc_nhids=1000
 Number of hidden units in encoder GRU
--enc_layers=1 Number of encoder layers
--store_full_main_loop=False
 Old style archives (not recommended)
--dec_share_weights=True
 Whether to share weights in deep decoders
--src_mono_data=./data/mono.ids.en
 Source language monolingual data (for use see --mono_data_integration)
--weight_scale=0.01
 Std of weight initialization
--memory_size=500
 Size of external memory structure
--sort_k_batches=12
 This many batches will be read ahead and sorted
--val_set=./data/dev.ids.en
 Validation set source file
--dec_init=last
 Decoder state initialisation: last, average, constant
--weight_noise_rec=False
 Weight noise flag for recurrent layers
--src_vocab_size=30003
 Source vocab size, including special tokens
--save_freq=750
 Save model after this many updates
--fix_embeddings=False
 Fix embeddings during training
--val_set_grndtruth=./data/dev.ids.fr
 Validation set gold file
--enc_embed=620
 Dimension of the word embedding matrix in encoder
--dec_nhids=1000
 Number of hidden units in decoder GRU
--finish_after=1000000
 Maximum number of updates
--saveto=./train
 Where to save model, same as ‘prefix’ in groundhog
--att_nhids=-1 Dimensionality of attention match vector (-1 to use dec_nhids)
--memory=none External memory: none, stack
--trg_data=./data/train.ids.shuf.fr
 Target dataset
--dec_attention_sources=s
 Sources used by attention: f for feedback, s for decoder states
--step_clipping=1.0
 Gradient clipping threshold
--annotations=direct
 Annotation strategy (comma-separated): direct, hierarchical
--enc_skip_connections=False
 Add skip connection in deep encoders
--val_set_out=./train/validation_out.txt
 Validation output file
--val_burn_in=80000
 Start bleu validation after this many updates
--step_rule=AdaDelta
 Optimization step rule
--attention=content
 Attention mechanism: none, content, nbest-<n>, coverage-<n>, tree, content-<n>
--trg_vocab_size=30003
 Target vocab size, including special tokens
--batch_size=80
 Batch size
--dec_layers=1 Number of decoder layers (NOT IMPLEMENTED for != 1)
--src_sparse_feat_map=
 Mapping files for using sparse feature word representations on the source side
--enc_share_weights=True
 Whether to share weights in deep encoders
--output_val_set=True
 Print validation output to file
--dec_readout_sources=sfa
 Sources used by readout network: f for feedback, s for decoder states, a for attention (context vector)
--src_data=./data/train.ids.shuf.en
 Source dataset
--trg_sparse_feat_map=
 Mapping files for using sparse feature word representations on the target side
--dropout=1.0 Dropout ratio, applied only after readout maxout
--bleu_val_freq=6000
 Validate bleu after this many updates
--maxout_nhids=-1
 Dimensionality of maxout output layer (-1 to use dec_nhids)
--trg_mono_data=./data/mono.ids.fr
 Target language monolingual data (for use see --mono_data_integration)
--normalized_bleu=True
 Length normalization IN TRAINING
--reload=True Reload the model from files if they exist
--beam_size=12 Beam-size for decoding DURING TRAINING
--dec_embed=620
 Dimension of the word embedding matrix in decoder
--bleu_script=perl ../scripts/multi-bleu.perl %s <
 BLEU script used during training for model selection
--seq_len=50 Sequences longer than this will be discarded
--weight_noise_ff=0.0
 Weight noise flag for feed forward layers
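
As referenced under --prune_layers above, a pruning schedule could be configured roughly as follows (the target layer sizes and iteration counts are purely illustrative):

python train.py --prune_every 10000 \
                --prune_n_steps 10 \
                --prune_layers encfwdgru:800,encbwdgru:800,decgru:800 \
                --prune_layout_path prune.layout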

Alignment (Blocks only)

Only available for the Blocks (Theano) NMT engine. align.py supports two different neural word alignment models, both of which utilize the concept of attention in NMT.
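
For example, alignments from the neural alignment model could be written in both plain text and numpy format with an invocation like the following (the output path is a placeholder):

python align.py --alignment_model nam \
                --iterations 50 \
                --outputs csv,npy \
                --output_path align-out.%s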

usage: align.py [-h] [--iterations ITERATIONS]
                [--nmt_model_selector {params,bleu,time}]
                [--alignment_model {nam,nmt}] [--output_path OUTPUT_PATH]
                [--outputs OUTPUTS] [--enc_nhids ENC_NHIDS]
                [--enc_layers ENC_LAYERS]
                [--store_full_main_loop STORE_FULL_MAIN_LOOP]
                [--dec_share_weights DEC_SHARE_WEIGHTS]
                [--src_mono_data SRC_MONO_DATA] [--weight_scale WEIGHT_SCALE]
                [--memory_size MEMORY_SIZE] [--sort_k_batches SORT_K_BATCHES]
                [--val_set VAL_SET] [--dec_init DEC_INIT]
                [--weight_noise_rec WEIGHT_NOISE_REC]
                [--src_vocab_size SRC_VOCAB_SIZE] [--save_freq SAVE_FREQ]
                [--fix_embeddings FIX_EMBEDDINGS]
                [--val_set_grndtruth VAL_SET_GRNDTRUTH]
                [--enc_embed ENC_EMBED] [--dec_nhids DEC_NHIDS]
                [--finish_after FINISH_AFTER] [--saveto SAVETO]
                [--att_nhids ATT_NHIDS] [--memory MEMORY]
                [--trg_data TRG_DATA]
                [--dec_attention_sources DEC_ATTENTION_SOURCES]
                [--step_clipping STEP_CLIPPING] [--annotations ANNOTATIONS]
                [--enc_skip_connections ENC_SKIP_CONNECTIONS]
                [--val_set_out VAL_SET_OUT] [--val_burn_in VAL_BURN_IN]
                [--step_rule STEP_RULE] [--attention ATTENTION]
                [--trg_vocab_size TRG_VOCAB_SIZE] [--batch_size BATCH_SIZE]
                [--dec_layers DEC_LAYERS]
                [--src_sparse_feat_map SRC_SPARSE_FEAT_MAP]
                [--enc_share_weights ENC_SHARE_WEIGHTS]
                [--output_val_set OUTPUT_VAL_SET]
                [--dec_readout_sources DEC_READOUT_SOURCES]
                [--src_data SRC_DATA]
                [--trg_sparse_feat_map TRG_SPARSE_FEAT_MAP]
                [--dropout DROPOUT] [--bleu_val_freq BLEU_VAL_FREQ]
                [--maxout_nhids MAXOUT_NHIDS] [--trg_mono_data TRG_MONO_DATA]
                [--normalized_bleu NORMALIZED_BLEU] [--reload RELOAD]
                [--beam_size BEAM_SIZE] [--dec_embed DEC_EMBED]
                [--bleu_script BLEU_SCRIPT] [--seq_len SEQ_LEN]
                [--weight_noise_ff WEIGHT_NOISE_FF]
optional arguments
--iterations=50
 Number of optimization iterations for each token
--nmt_model_selector=bleu
 NMT training normally creates several files in the ./train/ directory from which we can load the NMT model. Possible options:
 * ‘params’: Load parameters from params.npz. This is usually the most recent model.
 * ‘bleu’: Load from the best_bleu_params_* file with the best BLEU score.
 * ‘time’: Load from the most recent best_bleu_params_* file.
 Possible choices: params, bleu, time
--alignment_model=nam
 Defines the alignment model.
 * ‘nam’: Neural alignment model. Similar to NMT, but trains the alignment weights explicitly for each sentence pair instead of using the NMT attention model.
 * ‘nmt’: Standard NMT attention model following Bahdanau et al., 2015.
 Possible choices: nam, nmt
--output_path=sgnmt-out.%s
 Path to the output files generated by SGNMT. You can use the placeholder %s for the format specifier.
--outputs= Comma-separated list of output formats:
 * ‘csv’: Plain text file with alignment matrix
 * ‘npy’: Alignment matrices in numpy’s npy format
 * ‘align’: Usual (Pharaoh) alignment format.
--enc_nhids=1000
 Number of hidden units in encoder GRU
--enc_layers=1 Number of encoder layers
--store_full_main_loop=False
 Old style archives (not recommended)
--dec_share_weights=True
 Whether to share weights in deep decoders
--src_mono_data=./data/mono.ids.en
 Source language monolingual data (for use see --mono_data_integration)
--weight_scale=0.01
 Std of weight initialization
--memory_size=500
 Size of external memory structure
--sort_k_batches=12
 This many batches will be read ahead and sorted
--val_set=./data/dev.ids.en
 Validation set source file
--dec_init=last
 Decoder state initialisation: last, average, constant
--weight_noise_rec=False
 Weight noise flag for recurrent layers
--src_vocab_size=30003
 Source vocab size, including special tokens
--save_freq=750
 Save model after this many updates
--fix_embeddings=False
 Fix embeddings during training
--val_set_grndtruth=./data/dev.ids.fr
 Validation set gold file
--enc_embed=620
 Dimension of the word embedding matrix in encoder
--dec_nhids=1000
 Number of hidden units in decoder GRU
--finish_after=1000000
 Maximum number of updates
--saveto=./train
 Where to save model, same as ‘prefix’ in groundhog
--att_nhids=-1 Dimensionality of attention match vector (-1 to use dec_nhids)
--memory=none External memory: none, stack
--trg_data=./data/train.ids.shuf.fr
 Target dataset
--dec_attention_sources=s
 Sources used by attention: f for feedback, s for decoder states
--step_clipping=1.0
 Gradient clipping threshold
--annotations=direct
 Annotation strategy (comma-separated): direct, hierarchical
--enc_skip_connections=False
 Add skip connection in deep encoders
--val_set_out=./train/validation_out.txt
 Validation output file
--val_burn_in=80000
 Start bleu validation after this many updates
--step_rule=AdaDelta
 Optimization step rule
--attention=content
 Attention mechanism: none, content, nbest-<n>, coverage-<n>, tree, content-<n>
--trg_vocab_size=30003
 Target vocab size, including special tokens
--batch_size=80
 Batch size
--dec_layers=1 Number of decoder layers (NOT IMPLEMENTED for != 1)
--src_sparse_feat_map=
 Mapping files for using sparse feature word representations on the source side
--enc_share_weights=True
 Whether to share weights in deep encoders
--output_val_set=True
 Print validation output to file
--dec_readout_sources=sfa
 Sources used by readout network: f for feedback, s for decoder states, a for attention (context vector)
--src_data=./data/train.ids.shuf.en
 Source dataset
--trg_sparse_feat_map=
 Mapping files for using sparse feature word representations on the target side
--dropout=1.0 Dropout ratio, applied only after readout maxout
--bleu_val_freq=6000
 Validate bleu after this many updates
--maxout_nhids=-1
 Dimensionality of maxout output layer (-1 to use dec_nhids)
--trg_mono_data=./data/mono.ids.fr
 Target language monolingual data (for use see --mono_data_integration)
--normalized_bleu=True
 Length normalization IN TRAINING
--reload=True Reload the model from files if they exist
--beam_size=12 Beam-size for decoding DURING TRAINING
--dec_embed=620
 Dimension of the word embedding matrix in decoder
--bleu_script=perl ../scripts/multi-bleu.perl %s <
 BLEU script used during training for model selection
--seq_len=50 Sequences longer than this will be discarded
--weight_noise_ff=0.0
 Weight noise flag for feed forward layers