#! /usr/bin/env python
# -*- coding: UTF-8 -*-
"""
Created on 6/30/17
Project : noah_smt_pipeline
Filename: smt_training_pipline.py
Author: c00412415
"""
import sys
import os
import shutil
import codecs
import argparse
import subprocess
import logging
import json
import traceback

from json_define import *
import rule_filter
import smt_training_tools
import train_language_model
# step modules referenced below but missing from the original import list
import check_valid_pair
import charset_uniform
import name_entity_recognize
import tokenize
import truecasing
import plain_text_extract
import alignment
import rule_extract
logging.basicConfig(stream=sys.stderr, level=logging.INFO)
INIT_MODE = 'init'
TRAIN_MODE = 'train'
PREP_MODE = 'preprocess'
MERGE_MODE = 'merge'
DEV_PREP_MODE = 'devprep'
TUNING_MODE = 'tuning'
TEST_MODE = 'test'
BIN_MODEL_MODE = 'binary_model'
LM_MODE = 'lm'
TRAINING_STEPS_HELP = ('1: CHECK_VALID_STEP; 2: CHARSET_UNIFICATION_STEP; '
                       '3: NE_RECOGNITION_STEP; '
                       '4: TOKENIZATION_STEP; '
                       '5: TRUECASING_STEP; '
                       '6: PLAIN_TEXT_EXTRACTION_STEP; '
                       '7: ALIGNMENT_STEP; '
                       '8: SYMMETRIZATION_STEP; '
                       '9: RULE_EXTRACTION_STEP; '
                       '10: RULE_FILTERING_STEP')
def check_init_stat(local_work_dir):
    if not os.path.isdir(local_work_dir):
        logging.error(local_work_dir + ' does not exist, work directory not initialized correctly.')
        return False
if not os.path.isdir(os.path.join(local_work_dir, MT_CFG_FILE_FOLDER)):
        logging.error('config folder does not exist, work directory not initialized correctly.')
return False
# check pre-process related file
for _step in pre_step_name_list:
tgt_file = os.path.join(local_work_dir, MT_CFG_FILE_FOLDER, _step + '_conf.json')
if not os.path.exists(tgt_file):
            logging.error(tgt_file + ' does not exist, work directory not initialized correctly')
return False
# check train-process related file
for _step in align_and_rule_step_name_list:
tgt_file = os.path.join(local_work_dir, MT_CFG_FILE_FOLDER, _step + '_conf.json')
if not os.path.exists(tgt_file):
            logging.error(tgt_file + ' does not exist, work directory not initialized correctly')
return False
return True
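# For reference, check_init_stat expects a layout like the following under the
# local work dir (the folder name and step names come from MT_CFG_FILE_FOLDER
# and the step name lists in json_define; the shape shown here is illustrative):
#   <local_work_dir>/<MT_CFG_FILE_FOLDER>/<step_name>_conf.json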
def init_work_path(_dir, _mt_dir):
    _abs_dir = os.path.abspath(_dir)
    _mt_dir = os.path.abspath(_mt_dir)
    if _abs_dir != _dir:
        logging.warning('the input path is a relative path')
if os.path.isdir(_abs_dir):
        logging.info(_abs_dir + " already exists, using this dir")
else:
logging.info("create path: " + _abs_dir)
try:
os.mkdir(_abs_dir)
        except OSError:
            logging.error(traceback.format_exc())
            logging.error("failed to create dir in init process, exiting")
exit(-1)
work_cfg_path = os.path.join(_abs_dir, MT_CFG_FILE_FOLDER)
if os.path.isdir(work_cfg_path):
        logging.info(work_cfg_path + ' exists, removing it')
try:
shutil.rmtree(work_cfg_path)
except OSError:
            logging.error(traceback.format_exc())
            logging.error("failed to remove dir (permission denied), exiting")
exit(-1)
# copy config file from bin/init_cfg to work_path/init_cfg
shutil.copytree(os.path.join(_mt_dir, MT_CFG_FILE_FOLDER),
work_cfg_path)
def pre_proc(work_dir, mt_dir, _corpus, traindir, filtered=True, **kwargs):
"""
    run pre-process: check_valid -> charset_unification -> NE recognition
    -> tokenization -> truecasing -> plain text extraction
:param work_dir:
:param mt_dir:
:param _corpus:
:param traindir:
    :param filtered: whether to filter sentences or not
:param kwargs:
:return:
"""
is_monolingual = kwargs['is_mono']
src_lang_type = kwargs['src']
tgt_lang_type = kwargs['tgt']
# if not check_init_stat(work_dir):
# logging.info("init process didn't run success before, procedure exit")
# return False
if os.path.exists(_corpus):
        logging.info(_corpus + " exists on local disk. ")
else:
tmp_cmd = HADOOP_LS_CMD % _corpus
proc_ = subprocess.Popen(tmp_cmd, stdin=sys.stdin, stderr=sys.stderr, stdout=sys.stdout, shell=True)
proc_.communicate()
proc_.wait()
        if proc_.returncode == 0:
            logging.info(_corpus + " exists on hdfs. ")
        else:
            logging.info(_corpus + " does not exist on hdfs. ")
            return False
_copy_files(work_dir, mt_dir, pre_step_name_list)
init_cfg(work_dir, mt_dir, _corpus, traindir, pre_step_name_list,
is_mono=is_monolingual,
src=src_lang_type,
tgt=tgt_lang_type)
    import time
    t1 = time.time()
    logging.info("checking valid trans pairs ...")
if not check_valid_pair.pipeline_train_proc(os.path.join(work_dir,
MT_CFG_FILE_FOLDER, EAI_TRAIN_CHECK_VALID_CONF),
                                                 filtered):
        logging.error("trans pair validity check failed, exiting.")
return False
t2 = time.time()
logging.info("check cost %f s." % (t2 - t1))
t1 = time.time()
logging.info("charset_unification ing...")
if not charset_uniform.pipeline_train_proc(os.path.join(work_dir,
MT_CFG_FILE_FOLDER, EAI_TRAIN_CHARSET_UNIF_CONF)):
logging.error("charset uniform fail, exit.")
return False
t2 = time.time()
logging.info("charset unification cost %f s." % (t2 - t1))
t1 = time.time()
logging.info("ne extracting ing...")
if not name_entity_recognize.pipeline_train_proc(os.path.join(work_dir,
MT_CFG_FILE_FOLDER,
EAI_TRAIN_NE_REC_CONF)):
logging.error("ne recognize fail, exit.")
return False
t2 = time.time()
logging.info("name entity cost %f s." % (t2 - t1))
t1 = time.time()
logging.info("tokenization ing...")
if not tokenize.pipeline_train_proc(os.path.join(work_dir, MT_CFG_FILE_FOLDER, EAI_TRAIN_TOKEN_CONF)):
logging.error("tokenization fail, exit.")
return False
t2 = time.time()
logging.info("tokenize cost %f s." % (t2 - t1))
t1 = time.time()
logging.info("truecasing ing ...")
if not truecasing.truecasing_train_pipeline(os.path.join(work_dir, MT_CFG_FILE_FOLDER, EAI_TRAIN_TRUECASE_CONF)):
logging.error("truecasing fail, exit.")
return False
t2 = time.time()
logging.info("truecasing cost %f s." % (t2 - t1))
t1 = time.time()
    if not plain_text_extract.pipeline_train_proc(os.path.join(work_dir, MT_CFG_FILE_FOLDER,
                                                  EAI_TRAIN_PLAIN_TEXT_CONF), filtered):
        logging.error("plain text extraction failed, exiting.")
return False
t2 = time.time()
logging.info("plaintext extract cost %f s." % (t2 - t1))
return True
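# A minimal usage sketch for pre_proc, assuming hypothetical local/HDFS paths
# and language codes (valid codes are whatever EAI_MT_SUPPORT_LANG from
# json_define contains):
#
#   pre_proc('/home/user/smt_work', '/opt/mt_bin',
#            'hdfs:///user/mt/corpus.txt', 'hdfs:///user/mt/train',
#            filtered=True, is_mono=False, src='zh', tgt='en')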
def align_and_rule_extract_proc(work_dir, mt_dir, _extract_output, traindir, **kwargs):
is_monolingual = kwargs['is_mono']
src_lang_type = kwargs['src']
tgt_lang_type = kwargs['tgt']
# if not check_init_stat(work_dir):
# logging.info("init process didn't run success before, procedure exit")
# return False
_copy_files(work_dir, mt_dir, align_and_rule_step_name_list)
init_cfg(work_dir, mt_dir, _extract_output, traindir, align_and_rule_step_name_list,
is_mono=is_monolingual,
src=src_lang_type,
tgt=tgt_lang_type)
import time
t1 = time.time()
if not alignment.alignment_pipeline(os.path.join(work_dir, MT_CFG_FILE_FOLDER,
EAI_TRAIN_ALIGN_CONF)):
        logging.error('alignment failed, exiting.')
return False
t2 = time.time()
logging.info("aligment cost %f s." % (t2 - t1))
t1 = time.time()
if not rule_extract.pipeline_train_proc(work_dir, mt_dir):
        logging.error('rule extraction failed, exiting.')
return False
t2 = time.time()
logging.info("rule extract cost %f s." % (t2 - t1))
return True
def filter_rule_proc(work_dir, mt_dir, _extract_output, traindir, **kwargs):
is_monolingual = kwargs['is_mono']
src_lang_type = kwargs['src']
tgt_lang_type = kwargs['tgt']
# if not check_init_stat(work_dir):
# logging.info("init process didn't run success before, procedure exit")
# return False
_copy_files(work_dir, mt_dir, rule_filter_step_name_list)
init_cfg(work_dir, mt_dir, _extract_output, traindir, rule_filter_step_name_list,
is_mono=is_monolingual,
src=src_lang_type,
tgt=tgt_lang_type)
import time
t1 = time.time()
if not rule_filter.pipeline_train_proc(work_dir, traindir):
        logging.error('rule filtering failed, exiting.')
return False
t2 = time.time()
logging.info("filter rule cost %f s." % (t2 - t1))
return True
def pipeline_proc(work_dir, mt_dir, corpus, traindir, **kwargs):
"""
    read stream format: source corpus\ttarget corpus
    check_valid -> charset_uniform -> NE recognize -> tokenize -> truecase -> plaintext extract
    -> alignment -> rule extraction -> rule filtering
:param work_dir: local work path
:param mt_dir: local mt_bin path
:param corpus:
:param traindir:
:param kwargs:
    :return: True on success, False on failure
"""
is_monolingual = kwargs['is_mono']
src_lang_type = kwargs['src']
tgt_lang_type = kwargs['tgt']
if pre_proc(work_dir, mt_dir, corpus, traindir, is_mono=is_monolingual,
src=src_lang_type, tgt=tgt_lang_type):
logging.info("run train data pre process successfully.")
else:
logging.error("run train data pre process fail.")
return False
if align_and_rule_extract_proc(work_dir, mt_dir, os.path.join(traindir, 'alignment'), traindir,
is_mono=is_monolingual, src=src_lang_type, tgt=tgt_lang_type):
logging.info("run alignment and rule extraction successfully.")
else:
logging.error("run alignment and rule extraction fail.")
return False
    # rule table filtering
if filter_rule_proc(work_dir, mt_dir, corpus, traindir,
is_mono=is_monolingual, src=src_lang_type, tgt=tgt_lang_type):
logging.info("run rule filter successfully.")
else:
logging.error("run rule filter fail.")
return True
def check_hdfs_train_dir(hdfs_train_dir):
    # check if hdfs_train_dir is on hdfs; create it when missing
    hdfs_cmd = smt_training_tools.HADOOP_LS_CMD % hdfs_train_dir
    check_proc = subprocess.Popen(hdfs_cmd, stdin=sys.stdin, stderr=sys.stderr, stdout=sys.stdout, shell=True)
    check_proc.communicate()
    check_proc.wait()
    if check_proc.returncode == 0:
        logging.info(hdfs_train_dir + " exists on hdfs. ")
    else:
        logging.info(hdfs_train_dir + " does not exist on hdfs. ")
        mkdir_cmd = smt_training_tools.HADOO_MKDIR_CMD % hdfs_train_dir
        mkdir_proc = subprocess.Popen(mkdir_cmd, stdin=sys.stdin, stderr=sys.stderr, stdout=sys.stdout,
                                      shell=True)
        mkdir_proc.communicate()
        mkdir_proc.wait()
        if mkdir_proc.returncode == 0:
            logging.info(hdfs_train_dir + ' created on hdfs')
        else:
            logging.error('create hdfs dir : ' + hdfs_train_dir + ' failed!!!')
            return False
    return True
def merge_local_bilingal_to_hdfs(_work_dir, _hdfs_train_dir, _local_src, _local_tgt):
    if not os.path.isdir(_work_dir):
        logging.info('local work dir :' + _work_dir + ' does not exist, creating')
        os.mkdir(_work_dir)
    local_corpus = os.path.join(_work_dir, smt_training_tools.DEFAULT_CORPUS_NAME)
    if rule_filter.prep_bilingual_data(_local_src, _local_tgt, local_corpus, _hdfs_train_dir):
        logging.info('merge {} and {} to hdfs succeeded'.format(_local_src, _local_tgt))
        return True
    else:
        logging.info('merge {} and {} to hdfs failed'.format(_local_src, _local_tgt))
        return False
def perform_init(args):
    try:
        init_path = os.path.abspath(args.local)
        mt_path = os.path.abspath(args.mtdir)
        logging.info("init local work path to " + init_path)
        init_work_path(init_path, mt_path)
        exit(0)
    except AttributeError, e:
        logging.error("AttributeError: " + e.message)
        exit(-1)
def perform_preprocess(args):
    try:
        training_cfg_file = args.config
        do_filter = args.filter
        start_step = args.start
        end_step = args.end
        train_truecasing = False
        logging.info('perform preprocess with filter = ' + str(do_filter))
        if not os.path.exists(training_cfg_file):
            logging.error('missing cfg file : ' + training_cfg_file)
            exit(-1)
        with codecs.open(training_cfg_file, 'rb', encoding=EAI_TRAIN_ENCODING) as fp:
            train_cfg = json.load(fp)
work_dir = os.path.abspath(train_cfg[EAI_TRAIN_WORKDIR])
mt_dir = os.path.abspath(train_cfg[EAI_TRAIN_MTDIR])
hdfs_train_dir = train_cfg[EAI_TRAIN_HDFS_WORD_FOLDER]
corpus_file = train_cfg[EAI_TRAIN_CORPUS]
is_monolingual = train_cfg[EAI_MONOLINGUAL]
src_lang_type = train_cfg[EAI_SOURCE_LANGUAGE_TYPE]
tgt_lang_type = train_cfg[EAI_TARGET_LANGUAGE_TYPE]
dev_src_raw = ''
# train_cfg[EAI_TRAIN_DEV_SRC_RAW]
dev_ref_raw_list = []
# train_cfg[EAI_TRAIN_DEV_REF_RAW]
tst_src_raw = ''
# train_cfg[EAI_TRAIN_TST_SRC_RAW]
tst_ref_raw_list = []
# train_cfg[EAI_TRAIN_TST_REF_RAW]
# smt_training_tools.CHECK_VALID_STEP
# smt_training_tools.PLAIN_TEXT_EXTRACTION_STEP
# smt_training_tools.CHECK_VALID_STEP
        if is_monolingual:
            logging.info("This procedure processes a monolingual corpus, "
                         "the language type is " + src_lang_type)
        else:
            logging.info("This procedure processes a bilingual corpus, "
                         "the source language type is " + src_lang_type +
                         ", the target language type is " + tgt_lang_type)
if src_lang_type not in EAI_MT_SUPPORT_LANG:
logging.error("unsupported language type " + src_lang_type)
exit(-1)
elif tgt_lang_type not in EAI_MT_SUPPORT_LANG:
logging.error("unsupported language type " + tgt_lang_type)
exit(-1)
elif tgt_lang_type == src_lang_type:
logging.error("source language type is same as target language type.")
exit(-1)
if not check_hdfs_train_dir(hdfs_train_dir):
return False
tmp_cmd = smt_training_tools.HADOOP_LS_CMD % corpus_file
proc_ = subprocess.Popen(tmp_cmd, stdin=sys.stdin, stderr=sys.stderr, stdout=sys.stdout, shell=True)
proc_.communicate()
proc_.wait()
if proc_.returncode == 0:
            logging.info(corpus_file + ' exists on hdfs. ')
else:
            logging.info(corpus_file + ' does not exist on hdfs. ')
if is_monolingual:
if EAI_SOURCE_CORPUS in train_cfg:
src_cps_local = train_cfg[EAI_SOURCE_CORPUS]
put_cmd = smt_training_tools.HADOOP_PUT_CMD % (src_cps_local, corpus_file)
logging.info('run: ' + put_cmd)
put_proc_ = subprocess.Popen(put_cmd, stdin=sys.stdin, stderr=sys.stderr, stdout=sys.stdout,
shell=True)
put_proc_.communicate()
put_proc_.wait()
if put_proc_.returncode != 0:
exit(-1)
else:
if EAI_SOURCE_CORPUS in train_cfg and EAI_TARGET_CORPUS in train_cfg:
src_cps_local = train_cfg[EAI_SOURCE_CORPUS]
tgt_cps_local = train_cfg[EAI_TARGET_CORPUS]
if not merge_local_bilingal_to_hdfs(work_dir, hdfs_train_dir, src_cps_local, tgt_cps_local):
logging.error('fail to merge ' + src_cps_local + ' and ' + tgt_cps_local +
' to ' + hdfs_train_dir)
exit(-1)
else:
logging.error('missing corpus on hdfs and local.')
exit(-1)
if not smt_training_tools.init_preprocess_cfg(work_dir, mt_dir, hdfs_train_dir, corpus_file, is_monolingual,
src_lang_type, tgt_lang_type, dev_src_raw, dev_ref_raw_list,
tst_src_raw, tst_ref_raw_list):
return False
if not smt_training_tools.pre_alignment_steps(work_dir, is_monolingual, do_filter, train_truecasing,
start_step, end_step):
return False
return True
except StandardError:
        logging.error(traceback.format_exc())
exit(-1)
def pipline_steps(_work_dir, _mt_dir, _hdfs_train_dir, _corpus_file, _is_monolingual, _src_lang_type,
                  _tgt_lang_type, _dev_src_raw, _dev_ref_raw_list, _tst_src_raw, _tst_ref_raw_list,
                  _do_filter, train_drt, _start_step, _end_step):
    if not check_hdfs_train_dir(_hdfs_train_dir):
        return False
    if not smt_training_tools.init_cfgs(_work_dir, _mt_dir, _hdfs_train_dir, _corpus_file, _is_monolingual,
                                        _src_lang_type, _tgt_lang_type, _dev_src_raw, _dev_ref_raw_list,
                                        _tst_src_raw, _tst_ref_raw_list, train_drt):
        return False
train_truecasing = True
if not smt_training_tools.pre_alignment_steps(_work_dir, _is_monolingual, _do_filter, train_truecasing,
_start_step, _end_step):
return False
if not smt_training_tools.alignment_rule_extraction_steps(_work_dir, _mt_dir, _do_filter, train_drt,
_start_step, _end_step):
return False
if not smt_training_tools.ruletable_filtering_steps(_work_dir, _mt_dir, _hdfs_train_dir, train_drt,
_start_step, _end_step):
return False
return True
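# A hedged mapping of the stage helpers above to the numbered steps in
# TRAINING_STEPS_HELP (inferred from the step names, not verified against
# smt_training_tools):
#   pre_alignment_steps             -> steps 1-6
#   alignment_rule_extraction_steps -> steps 7-9
#   ruletable_filtering_steps       -> step 10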
def perform_training(args):
    try:
        training_cfg = args.config
        do_filter = not args.nofilter
        if not os.path.exists(training_cfg):
            logging.error('missing cfg file : ' + training_cfg)
            exit(-1)
        with codecs.open(training_cfg, 'rb', encoding=EAI_TRAIN_ENCODING) as fp:
            train_cfg = json.load(fp)
work_dir = os.path.abspath(train_cfg[EAI_TRAIN_WORKDIR])
mt_dir = os.path.abspath(train_cfg[EAI_TRAIN_MTDIR])
hdfs_train_dir = train_cfg[EAI_TRAIN_HDFS_WORD_FOLDER]
corpus_file = train_cfg[EAI_TRAIN_CORPUS]
is_monolingual = False
src_lang_type = train_cfg[EAI_SOURCE_LANGUAGE_TYPE]
tgt_lang_type = train_cfg[EAI_TARGET_LANGUAGE_TYPE]
dev_src_raw = train_cfg[EAI_TRAIN_DEV_SRC_RAW]
dev_ref_raw_list = train_cfg[EAI_TRAIN_DEV_REF_RAW]
tst_src_raw = train_cfg[EAI_TRAIN_TST_SRC_RAW]
tst_ref_raw_list = train_cfg[EAI_TRAIN_TST_REF_RAW]
train_drt = train_cfg[EAI_TRAIN_DIRECTION]
start_step = args.start
end_step = args.end
if is_monolingual:
logging.info("These procedure is assigned to process monolingual corpus, "
"the language type is " + src_lang_type)
else:
logging.info("These procedure is assigned to process bi-lingual corpus, "
"the source language type is " + src_lang_type +
", the target language type is " + tgt_lang_type)
if src_lang_type not in EAI_MT_SUPPORT_LANG:
logging.error("unsupported language type " + src_lang_type)
exit(-1)
elif tgt_lang_type not in EAI_MT_SUPPORT_LANG:
logging.error("unsupported language type " + tgt_lang_type)
exit(-1)
elif tgt_lang_type == src_lang_type:
logging.error("source language type is same as target language type.")
exit(-1)
tmp_cmd = smt_training_tools.HADOOP_LS_CMD % corpus_file
proc_ = subprocess.Popen(tmp_cmd, stdin=sys.stdin, stderr=sys.stderr, stdout=sys.stdout, shell=True)
proc_.communicate()
proc_.wait()
if proc_.returncode == 0:
            logging.info(corpus_file + " exists on hdfs. ")
else:
            logging.info(corpus_file + " does not exist on hdfs. ")
if EAI_SOURCE_CORPUS in train_cfg and EAI_TARGET_CORPUS in train_cfg:
src_cps_local = train_cfg[EAI_SOURCE_CORPUS]
tgt_cps_local = train_cfg[EAI_TARGET_CORPUS]
if not merge_local_bilingal_to_hdfs(work_dir, hdfs_train_dir, src_cps_local, tgt_cps_local):
logging.error('fail to merge ' + src_cps_local + ' and ' + tgt_cps_local +
' to ' + hdfs_train_dir)
exit(-1)
else:
logging.error('missing corpus on hdfs and local.')
exit(-1)
pipline_steps(work_dir, mt_dir, hdfs_train_dir, corpus_file, is_monolingual,
src_lang_type, tgt_lang_type, dev_src_raw, dev_ref_raw_list,
tst_src_raw, tst_ref_raw_list, do_filter, train_drt, start_step, end_step)
except StandardError:
        logging.error(traceback.format_exc())
exit(-1)
def perform_tuning(args):
    try:
        training_cfg = args.config
        with codecs.open(training_cfg, 'rb', encoding=EAI_TRAIN_ENCODING) as fp:
            train_cfg = json.load(fp)
        work_dir = os.path.abspath(train_cfg[EAI_TRAIN_WORKDIR])
        mt_dir = os.path.abspath(train_cfg[EAI_TRAIN_MTDIR])
        hdfs_train_dir = train_cfg[EAI_TRAIN_HDFS_WORD_FOLDER]
        corpus_file = train_cfg[EAI_TRAIN_CORPUS]
        src_lang_type = train_cfg[EAI_SOURCE_LANGUAGE_TYPE]
        tgt_lang_type = train_cfg[EAI_TARGET_LANGUAGE_TYPE]
        dev_src_raw = train_cfg[EAI_TRAIN_DEV_SRC_RAW]
        dev_ref_raw_list = train_cfg[EAI_TRAIN_DEV_REF_RAW]
        tst_src_raw = train_cfg[EAI_TRAIN_TST_SRC_RAW]
        tst_ref_raw_list = train_cfg[EAI_TRAIN_TST_REF_RAW]
        train_drt = train_cfg[EAI_TRAIN_DIRECTION]
        if EAI_TRAIN_ARPA_LM in train_cfg:
            arpa_lm = train_cfg[EAI_TRAIN_ARPA_LM]
        else:
            arpa_lm = ''
        if not smt_training_tools.tuning_steps(work_dir, mt_dir, corpus_file, hdfs_train_dir,
                                               src_lang_type, tgt_lang_type, dev_src_raw, tst_src_raw,
                                               dev_ref_raw_list, tst_ref_raw_list, arpa_lm, train_drt):
            logging.error('tuning failed!!!')
            exit(-1)
        exit(0)
    except StandardError:
        logging.error(traceback.format_exc())
        exit(-1)
def perform_lm_training(args):
    training_cfg = args.config
    with codecs.open(training_cfg, 'rb', encoding=EAI_TRAIN_ENCODING) as fp:
        train_cfg = json.load(fp)
    work_dir = os.path.abspath(train_cfg[EAI_TRAIN_WORKDIR])
    mt_dir = os.path.abspath(train_cfg[EAI_TRAIN_MTDIR])
    hdfs_train_dir = train_cfg[EAI_TRAIN_HDFS_WORD_FOLDER]
    corpus_file = train_cfg[EAI_TRAIN_CORPUS]
    src_lang_type = train_cfg[EAI_SOURCE_LANGUAGE_TYPE]
    tgt_lang_type = train_cfg[EAI_TARGET_LANGUAGE_TYPE]
    drt = train_cfg[EAI_TRAIN_DIRECTION]
    step_name_list = [EAI_TRAIN_LANGUAGE_MODEL]
    if not smt_training_tools.init_cfg(work_dir, mt_dir, corpus_file, hdfs_train_dir, step_name_list,
                                       is_mono=False, src=src_lang_type, tgt=tgt_lang_type,
                                       dev_src_raw='', tst_src_raw='', dev_ref_list=[], tst_ref_list=[],
                                       direction=drt):
        logging.error('writing lm config failed!!!')
        return False
    lm_cfg_file = os.path.join(work_dir, MT_CFG_FILE_FOLDER, EAI_TRAIN_LANGUAGE_MODEL_CONF)
    if not train_language_model.lm_training_pipeline(lm_cfg_file):
        exit(-1)
    return
def perform_binary_model(args):
    try:
        binary_ruletable_cfg = args.config
        with codecs.open(binary_ruletable_cfg, 'rb', encoding=EAI_TRAIN_ENCODING) as fp:
            binary_cfg = json.load(fp)
        work_dir = os.path.abspath(binary_cfg[EAI_TRAIN_WORKDIR])
        mt_dir = os.path.abspath(binary_cfg[EAI_TRAIN_MTDIR])
        hdfs_train_dir = binary_cfg[EAI_TRAIN_HDFS_WORD_FOLDER]
        corpus_file = binary_cfg[EAI_TRAIN_CORPUS]
        src_lang_type = binary_cfg[EAI_SOURCE_LANGUAGE_TYPE]
        tgt_lang_type = binary_cfg[EAI_TARGET_LANGUAGE_TYPE]
        if not smt_training_tools.build_binary_ruletable_steps(work_dir, mt_dir, corpus_file, hdfs_train_dir,
                                                               src_lang_type, tgt_lang_type):
            exit(-1)
    except StandardError:
        logging.error(traceback.format_exc())
        exit(-1)
    return
def perform_merge(args):
    if not merge_local_bilingal_to_hdfs(args.local, args.traindir, args.src, args.tgt):
        exit(-1)
    exit(0)
def main(args):
    if INIT_MODE == args.mode:
        perform_init(args)
    elif PREP_MODE == args.mode:
        perform_preprocess(args)
    elif TRAIN_MODE == args.mode:
        perform_training(args)
    elif TUNING_MODE == args.mode:
        perform_tuning(args)
    elif LM_MODE == args.mode:
        perform_lm_training(args)
    elif BIN_MODEL_MODE == args.mode:
        perform_binary_model(args)
    elif MERGE_MODE == args.mode:
        perform_merge(args)
if __name__ == "__main__":
    arg_parser = argparse.ArgumentParser(description="parse args from the command line")
    subparser = arg_parser.add_subparsers(dest='mode')
init_parser = subparser.add_parser(INIT_MODE, help="init work path, --init=/path/to/work")
merge_parser = subparser.add_parser(MERGE_MODE, help='merge src and tgt training data to hdfs.')
pipe_parser = subparser.add_parser(TRAIN_MODE, help="trigger train pipeline")
prep_parser = subparser.add_parser(PREP_MODE, help='trigger pre procedure')
dev_prep_parser = subparser.add_parser(DEV_PREP_MODE, help='prepare tuning data(dev and tst)')
tuning_parser = subparser.add_parser(TUNING_MODE, help='perform tuning')
binary_model_parser = subparser.add_parser(BIN_MODEL_MODE, help='build binary model')
    lm_training_parser = subparser.add_parser(LM_MODE, help='train language model')
# init arg parser
init_parser.add_argument('--local', type=str, required=True, help="local work path")
init_parser.add_argument('--mtdir', type=str, required=True, help='mt_bin path')
# merge parser
merge_parser.add_argument('--src', type=str, required=True, help='src training data')
merge_parser.add_argument('--tgt', type=str, required=True, help='tgt training data')
merge_parser.add_argument('--traindir', required=True, help="train path : HDFS path")
merge_parser.add_argument('--local', type=str, required=True, help="local work path")
# train arg parser
pipe_parser.add_argument('--config', type=str, required=True,
help='config for training, including dev and tst data')
pipe_parser.add_argument('--start', type=int, required=True,
help=('the start step, could be any of : ' + TRAINING_STEPS_HELP))
pipe_parser.add_argument('--end', type=int, required=True,
help='the end step')
    pipe_parser.add_argument('--no-filter', dest='nofilter', action='store_true',
                             help='disable corpus filtering; filtering is performed by default')
# prep arg parser
prep_parser.add_argument('--config', type=str, required=True,
help='config for dev and tst data, or monolingual data')
prep_parser.add_argument('--start', type=int, required=True,
help='the start step')
prep_parser.add_argument('--end', type=int, required=True,
help='the end step')
    prep_parser.add_argument('--filter', dest='filter', action='store_true',
                             help='enable corpus filtering; no filtering is performed by default')
tuning_parser.add_argument('--config', type=str, required=True,
help='config for training, including dev and tst data')
binary_model_parser.add_argument('--config', type=str, required=True,
help='config for training, including the local and HDFS work dir')
lm_training_parser.add_argument('--config', type=str, required=True,
help='config for training, including the local and HDFS work dir')
opts = arg_parser.parse_args()
main(opts)
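# Example invocations (paths and config file names are hypothetical; the
# sub-commands and flags match the parsers defined above):
#
#   python smt_training_pipline.py init --local /path/to/work --mtdir /path/to/mt_bin
#   python smt_training_pipline.py merge --src src.txt --tgt tgt.txt \
#       --traindir hdfs:///user/mt/train --local /path/to/work
#   python smt_training_pipline.py preprocess --config prep_cfg.json --start 1 --end 6 --filter
#   python smt_training_pipline.py train --config train_cfg.json --start 1 --end 10
#   python smt_training_pipline.py tuning --config train_cfg.json
#   python smt_training_pipline.py lm --config train_cfg.json
#   python smt_training_pipline.py binary_model --config binary_cfg.json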