
#! /usr/bin/env python

# -*- coding: UTF-8 -*-

""" Created on 6/30/17 Project : noah_smt_pipeline Filename: smt_training_pipline.py Author: c00412415 """ import sys import os import shutil import codecs import argparse import subprocess import logging import json import traceback from json_define import * import rule_filter import smt_training_tools import train_language_model

logging.basicConfig(stream=sys.stderr, level=logging.INFO)

INIT_MODE = 'init'
TRAIN_MODE = 'train'
PREP_MODE = 'preprocess'
MERGE_MODE = 'merge'
DEV_PREP_MODE = 'devprep'
TUNING_MODE = 'tuning'
TEST_MODE = 'test'
BIN_MODEL_MODE = 'binary_model'
LM_MODE = 'lm'

TRAINING_STEPS_HELP = ('1: CHECK_VALID_STEP; 2: CHARSET_UNIFICATION_STEP; '
                       '3: NE_RECOGNITION_STEP; '
                       '4: TOKENIZATION_STEP; '
                       '5: TRUECASING_STEP; '
                       '6: PLAIN_TEXT_EXTRACTION_STEP; '
                       '7: ALIGNMENT_STEP; '
                       '8: SYMMETRIZATION_STEP; '
                       '9: RULE_EXTRACTION_STEP; '
                       '10: RULE_FILTERING_STEP')
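
# --start/--end on the train and preprocess sub-commands select a contiguous
# range of the numbered steps above. A sketch (the config filename here is
# hypothetical):
#
#     python smt_training_pipline.py train --config train_cfg.json --start 4 --end 6
#
# would run only TOKENIZATION_STEP through PLAIN_TEXT_EXTRACTION_STEP.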

def check_init_stat(local_work_dir):
    if not os.path.isdir(local_work_dir):
        logging.error(local_work_dir + ' does not exist; work directory was not initialized correctly.')
        return False

    if not os.path.isdir(os.path.join(local_work_dir, MT_CFG_FILE_FOLDER)):
        logging.error('config folder does not exist; work directory was not initialized correctly.')
        return False

    # check pre-process related config files
    for _step in pre_step_name_list:
        tgt_file = os.path.join(local_work_dir, MT_CFG_FILE_FOLDER, _step + '_conf.json')
        if not os.path.exists(tgt_file):
            logging.error(tgt_file + ' does not exist; work directory was not initialized correctly.')
            return False

    # check train-process related config files
    for _step in align_and_rule_step_name_list:
        tgt_file = os.path.join(local_work_dir, MT_CFG_FILE_FOLDER, _step + '_conf.json')
        if not os.path.exists(tgt_file):
            logging.error(tgt_file + ' does not exist; work directory was not initialized correctly.')
            return False

    return True

def init_work_path(_dir, _mt_dir):
    _abs_dir = os.path.abspath(_dir)
    _mt_dir = os.path.abspath(_mt_dir)
    if _abs_dir != _dir:
        logging.warning('the input path is a relative path')

    if os.path.isdir(_abs_dir):
        logging.info(_abs_dir + ' already exists, using this dir')
    else:
        logging.info('create path: ' + _abs_dir)
        try:
            os.mkdir(_abs_dir)
        except OSError:
            logging.error(traceback.format_exc())
            logging.error('failed to create dir during init, exiting')
            exit(-1)

    work_cfg_path = os.path.join(_abs_dir, MT_CFG_FILE_FOLDER)
    if os.path.isdir(work_cfg_path):
        logging.info(work_cfg_path + ' exists, removing it')
        try:
            shutil.rmtree(work_cfg_path)
        except OSError:
            logging.error(traceback.format_exc())
            logging.error('remove permission denied, exiting')
            exit(-1)
    # copy config files from <mt_dir>/init_cfg to <work_path>/init_cfg
    shutil.copytree(os.path.join(_mt_dir, MT_CFG_FILE_FOLDER),
                    work_cfg_path)

def pre_proc(work_dir, mt_dir, _corpus, traindir, filterd=True, **kwargs):
    """
    Run the pre-process steps: check_valid -> charset unification ->
    NE recognition -> tokenization -> truecasing -> plain-text extraction.

    :param work_dir: local work path
    :param mt_dir: local mt_bin path
    :param _corpus: corpus file (local or HDFS)
    :param traindir: HDFS training directory
    :param filterd: whether to filter sentences or not
    :param kwargs: is_mono / src / tgt
    :return: True on success, False otherwise
    """
    is_monolingual = kwargs['is_mono']
    src_lang_type = kwargs['src']
    tgt_lang_type = kwargs['tgt']
    # if not check_init_stat(work_dir):
    #     logging.info("init process didn't run successfully before, procedure exits")
    #     return False
    if os.path.exists(_corpus):
        logging.info(_corpus + ' exists on local disk.')
    else:
        tmp_cmd = HADOOP_LS_CMD % _corpus
        proc_ = subprocess.Popen(tmp_cmd, stdin=sys.stdin, stderr=sys.stderr, stdout=sys.stdout, shell=True)
        proc_.communicate()
        proc_.wait()
        if proc_.returncode == 0:
            logging.info(_corpus + ' exists on hdfs.')
        else:
            logging.info(_corpus + ' does not exist on hdfs.')
            return False

    _copy_files(work_dir, mt_dir, pre_step_name_list)
    init_cfg(work_dir, mt_dir, _corpus, traindir, pre_step_name_list,
             is_mono=is_monolingual,
             src=src_lang_type,
             tgt=tgt_lang_type)

    t1 = time.time()
    logging.info('checking validity of translation pairs ...')
    if not check_valid_pair.pipeline_train_proc(os.path.join(work_dir, MT_CFG_FILE_FOLDER,
                                                             EAI_TRAIN_CHECK_VALID_CONF),
                                                filterd):
        logging.error('translation pair validity check failed, exiting.')
        return False
    t2 = time.time()
    logging.info('check cost %f s.' % (t2 - t1))

    t1 = time.time()
    logging.info('unifying charset ...')
    if not charset_uniform.pipeline_train_proc(os.path.join(work_dir, MT_CFG_FILE_FOLDER,
                                                            EAI_TRAIN_CHARSET_UNIF_CONF)):
        logging.error('charset unification failed, exiting.')
        return False
    t2 = time.time()
    logging.info('charset unification cost %f s.' % (t2 - t1))

    t1 = time.time()
    logging.info('extracting named entities ...')
    if not name_entity_recognize.pipeline_train_proc(os.path.join(work_dir, MT_CFG_FILE_FOLDER,
                                                                  EAI_TRAIN_NE_REC_CONF)):
        logging.error('NE recognition failed, exiting.')
        return False
    t2 = time.time()
    logging.info('named entity recognition cost %f s.' % (t2 - t1))

    t1 = time.time()
    logging.info('tokenizing ...')
    if not tokenize.pipeline_train_proc(os.path.join(work_dir, MT_CFG_FILE_FOLDER, EAI_TRAIN_TOKEN_CONF)):
        logging.error('tokenization failed, exiting.')
        return False
    t2 = time.time()
    logging.info('tokenization cost %f s.' % (t2 - t1))

    t1 = time.time()
    logging.info('truecasing ...')
    if not truecasing.truecasing_train_pipeline(os.path.join(work_dir, MT_CFG_FILE_FOLDER,
                                                             EAI_TRAIN_TRUECASE_CONF)):
        logging.error('truecasing failed, exiting.')
        return False
    t2 = time.time()
    logging.info('truecasing cost %f s.' % (t2 - t1))

    t1 = time.time()
    if not plain_text_extract.pipeline_train_proc(os.path.join(work_dir, MT_CFG_FILE_FOLDER,
                                                               EAI_TRAIN_PLAIN_TEXT_CONF), filterd):
        logging.error('plain-text extraction failed, exiting.')
        return False
    t2 = time.time()
    logging.info('plain-text extraction cost %f s.' % (t2 - t1))

    return True
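
# A sketch of calling pre_proc directly (all paths and language codes here are
# hypothetical, for illustration only):
#
#     pre_proc('/home/user/smt_work', '/opt/mt_bin',
#              'hdfs://cluster/corpus/train.txt', 'hdfs://cluster/train',
#              filterd=True, is_mono=False, src='en', tgt='zh')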

def align_and_rule_extract_proc(work_dir, mt_dir, _extract_output, traindir, **kwargs):
    is_monolingual = kwargs['is_mono']
    src_lang_type = kwargs['src']
    tgt_lang_type = kwargs['tgt']
    # if not check_init_stat(work_dir):
    #     logging.info("init process didn't run successfully before, procedure exits")
    #     return False
    _copy_files(work_dir, mt_dir, align_and_rule_step_name_list)
    init_cfg(work_dir, mt_dir, _extract_output, traindir, align_and_rule_step_name_list,
             is_mono=is_monolingual,
             src=src_lang_type,
             tgt=tgt_lang_type)

    t1 = time.time()
    if not alignment.alignment_pipeline(os.path.join(work_dir, MT_CFG_FILE_FOLDER, EAI_TRAIN_ALIGN_CONF)):
        logging.error('alignment failed, exiting.')
        return False
    t2 = time.time()
    logging.info('alignment cost %f s.' % (t2 - t1))

    t1 = time.time()
    if not rule_extract.pipeline_train_proc(work_dir, mt_dir):
        logging.error('rule extraction failed, exiting.')
        return False
    t2 = time.time()
    logging.info('rule extraction cost %f s.' % (t2 - t1))

    return True

def filter_rule_proc(work_dir, mt_dir, _extract_output, traindir, **kwargs):
    is_monolingual = kwargs['is_mono']
    src_lang_type = kwargs['src']
    tgt_lang_type = kwargs['tgt']
    # if not check_init_stat(work_dir):
    #     logging.info("init process didn't run successfully before, procedure exits")
    #     return False
    _copy_files(work_dir, mt_dir, rule_filter_step_name_list)
    init_cfg(work_dir, mt_dir, _extract_output, traindir, rule_filter_step_name_list,
             is_mono=is_monolingual,
             src=src_lang_type,
             tgt=tgt_lang_type)

    t1 = time.time()
    if not rule_filter.pipeline_train_proc(work_dir, traindir):
        logging.error('rule filtering failed, exiting.')
        return False
    t2 = time.time()
    logging.info('rule filtering cost %f s.' % (t2 - t1))

    return True

def pipeline_proc(work_dir, mt_dir, corpus, traindir, **kwargs):
    """
    Read stream format: source corpus\ttarget corpus.
    Pipeline: charset_uniform -> NE recognize -> tokenize -> truecase -> plaintext extract ...

    :param work_dir: local work path
    :param mt_dir: local mt_bin path
    :param corpus: corpus file (local or HDFS)
    :param traindir: HDFS training directory
    :param kwargs: is_mono / src / tgt
    :return: True for running successfully, False for running unsuccessfully
    """
    is_monolingual = kwargs['is_mono']
    src_lang_type = kwargs['src']
    tgt_lang_type = kwargs['tgt']
    if pre_proc(work_dir, mt_dir, corpus, traindir, is_mono=is_monolingual,
                src=src_lang_type, tgt=tgt_lang_type):
        logging.info('ran train data pre-process successfully.')
    else:
        logging.error('train data pre-process failed.')
        return False

    if align_and_rule_extract_proc(work_dir, mt_dir, os.path.join(traindir, 'alignment'), traindir,
                                   is_mono=is_monolingual, src=src_lang_type, tgt=tgt_lang_type):
        logging.info('ran alignment and rule extraction successfully.')
    else:
        logging.error('alignment and rule extraction failed.')
        return False

    if filter_rule_proc(work_dir, mt_dir, corpus, traindir,
                        is_mono=is_monolingual, src=src_lang_type, tgt=tgt_lang_type):
        logging.info('ran rule filter successfully.')
    else:
        logging.error('rule filter failed.')
        return False

    return True

def check_hdfs_train_dir(hdfs_train_dir):
    # check whether hdfs_train_dir exists on HDFS; create it if not
    hdfs_cmd = smt_training_tools.HADOOP_LS_CMD % hdfs_train_dir
    check_proc = subprocess.Popen(hdfs_cmd, stdin=sys.stdin, stderr=sys.stderr, stdout=sys.stdout, shell=True)
    check_proc.communicate()
    check_proc.wait()
    if check_proc.returncode == 0:
        logging.info(hdfs_train_dir + ' exists on hdfs.')
    else:
        logging.info(hdfs_train_dir + ' does not exist on hdfs.')
        mkdir_cmd = smt_training_tools.HADOOP_MKDIR_CMD % hdfs_train_dir
        mkdir_proc = subprocess.Popen(mkdir_cmd, stdin=sys.stdin, stderr=sys.stderr, stdout=sys.stdout, shell=True)
        mkdir_proc.communicate()
        mkdir_proc.wait()
        if mkdir_proc.returncode == 0:
            logging.info(hdfs_train_dir + ' created on hdfs')
        else:
            logging.error('failed to create hdfs dir: ' + hdfs_train_dir)
            return False
    return True

def merge_local_bilingal_to_hdfs(_work_dir, _hdfs_train_dir, _local_src, _local_tgt):
    if not os.path.isdir(_work_dir):
        logging.info('local work dir: ' + _work_dir + ' does not exist, creating it')
        os.mkdir(_work_dir)
    local_corpus = os.path.join(_work_dir, smt_training_tools.DEFAULT_CORPUS_NAME)
    if rule_filter.prep_bilingual_data(_local_src, _local_tgt, local_corpus, _hdfs_train_dir):
        logging.info('merged {} and {} to hdfs successfully'.format(_local_src, _local_tgt))
        return True
    else:
        logging.info('merging {} and {} to hdfs failed'.format(_local_src, _local_tgt))
        return False
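
# merge_local_bilingal_to_hdfs glues a local source file and a local target
# file into the tab-separated bilingual stream the pipeline expects
# ("source corpus\ttarget corpus", see pipeline_proc's docstring). One merged
# line might look like this (sentences invented for illustration):
#
#     this is a test sentence .\tceci est une phrase de test .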

def perform_init(args):
    try:
        init_path = os.path.abspath(args.local)
        mt_path = os.path.abspath(args.mtdir)
        logging.info('init local work path to ' + init_path)
        init_work_path(init_path, mt_path)
        exit(0)
    except AttributeError as e:
        logging.error('AttributeError: ' + e.message)
        exit(-1)

def perform_preprocess(args):
    try:
        training_cfg_file = args.config
        do_filter = args.filter
        start_step = args.start
        end_step = args.end
        train_truecasing = False
        logging.info('perform preprocess with filter = ' + str(do_filter))
        if not os.path.exists(training_cfg_file):
            logging.error('missing cfg file: ' + training_cfg_file)
            exit(-1)
        with codecs.open(training_cfg_file, 'rb', encoding=EAI_TRAIN_ENCODING) as fp:
            train_cfg = json.load(fp)

            work_dir = os.path.abspath(train_cfg[EAI_TRAIN_WORKDIR])
            mt_dir = os.path.abspath(train_cfg[EAI_TRAIN_MTDIR])
            hdfs_train_dir = train_cfg[EAI_TRAIN_HDFS_WORD_FOLDER]
            corpus_file = train_cfg[EAI_TRAIN_CORPUS]
            is_monolingual = train_cfg[EAI_MONOLINGUAL]
            src_lang_type = train_cfg[EAI_SOURCE_LANGUAGE_TYPE]
            tgt_lang_type = train_cfg[EAI_TARGET_LANGUAGE_TYPE]
            dev_src_raw = ''  # train_cfg[EAI_TRAIN_DEV_SRC_RAW]
            dev_ref_raw_list = []  # train_cfg[EAI_TRAIN_DEV_REF_RAW]
            tst_src_raw = ''  # train_cfg[EAI_TRAIN_TST_SRC_RAW]
            tst_ref_raw_list = []  # train_cfg[EAI_TRAIN_TST_REF_RAW]
            if is_monolingual:
                logging.info('this procedure processes a monolingual corpus, '
                             'the language type is ' + src_lang_type)
            else:
                logging.info('this procedure processes a bilingual corpus, '
                             'the source language type is ' + src_lang_type +
                             ', the target language type is ' + tgt_lang_type)

            if src_lang_type not in EAI_MT_SUPPORT_LANG:
                logging.error('unsupported language type ' + src_lang_type)
                exit(-1)
            elif tgt_lang_type not in EAI_MT_SUPPORT_LANG:
                logging.error('unsupported language type ' + tgt_lang_type)
                exit(-1)
            elif tgt_lang_type == src_lang_type:
                logging.error('source language type is the same as target language type.')
                exit(-1)

            if not check_hdfs_train_dir(hdfs_train_dir):
                return False
            tmp_cmd = smt_training_tools.HADOOP_LS_CMD % corpus_file
            proc_ = subprocess.Popen(tmp_cmd, stdin=sys.stdin, stderr=sys.stderr, stdout=sys.stdout, shell=True)
            proc_.communicate()
            proc_.wait()
            if proc_.returncode == 0:
                logging.info(corpus_file + ' exists on hdfs.')
            else:
                logging.info(corpus_file + ' does not exist on hdfs.')
                if is_monolingual:
                    if EAI_SOURCE_CORPUS in train_cfg:
                        src_cps_local = train_cfg[EAI_SOURCE_CORPUS]
                        put_cmd = smt_training_tools.HADOOP_PUT_CMD % (src_cps_local, corpus_file)
                        logging.info('run: ' + put_cmd)
                        put_proc_ = subprocess.Popen(put_cmd, stdin=sys.stdin, stderr=sys.stderr, stdout=sys.stdout,
                                                     shell=True)
                        put_proc_.communicate()
                        put_proc_.wait()
                        if put_proc_.returncode != 0:
                            exit(-1)
                else:
                    if EAI_SOURCE_CORPUS in train_cfg and EAI_TARGET_CORPUS in train_cfg:
                        src_cps_local = train_cfg[EAI_SOURCE_CORPUS]
                        tgt_cps_local = train_cfg[EAI_TARGET_CORPUS]
                        if not merge_local_bilingal_to_hdfs(work_dir, hdfs_train_dir, src_cps_local, tgt_cps_local):
                            logging.error('failed to merge ' + src_cps_local + ' and ' + tgt_cps_local +
                                          ' to ' + hdfs_train_dir)
                            exit(-1)
                    else:
                        logging.error('missing corpus on hdfs and local.')
                        exit(-1)

            if not smt_training_tools.init_preprocess_cfg(work_dir, mt_dir, hdfs_train_dir, corpus_file,
                                                          is_monolingual, src_lang_type, tgt_lang_type,
                                                          dev_src_raw, dev_ref_raw_list,
                                                          tst_src_raw, tst_ref_raw_list):
                return False

            if not smt_training_tools.pre_alignment_steps(work_dir, is_monolingual, do_filter, train_truecasing,
                                                          start_step, end_step):
                return False
            return True
    except StandardError:
        logging.error(traceback.format_exc())
        exit(-1)
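
# The training config is a JSON file; the exact key strings are the constants
# defined in json_define (not shown in this listing), so this is only an
# outline of what the file must provide:
#   - EAI_TRAIN_WORKDIR / EAI_TRAIN_MTDIR: local work and mt_bin paths
#   - EAI_TRAIN_HDFS_WORD_FOLDER / EAI_TRAIN_CORPUS: HDFS train dir and corpus
#   - EAI_MONOLINGUAL, EAI_SOURCE_LANGUAGE_TYPE, EAI_TARGET_LANGUAGE_TYPE
#   - optionally EAI_SOURCE_CORPUS / EAI_TARGET_CORPUS for local corpora
#   - for full training/tuning: EAI_TRAIN_DEV_SRC_RAW, EAI_TRAIN_DEV_REF_RAW,
#     EAI_TRAIN_TST_SRC_RAW, EAI_TRAIN_TST_REF_RAW, EAI_TRAIN_DIRECTION,
#     and optionally EAI_TRAIN_ARPA_LM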

def pipline_steps(_work_dir, _mt_dir, _hdfs_train_dir, _corpus_file, _is_monolingual,
                  _src_lang_type, _tgt_lang_type, _dev_src_raw, _dev_ref_raw_list,
                  _tst_src_raw, _tst_ref_raw_list, _do_filter, train_drt, _start_step, _end_step):
    if not check_hdfs_train_dir(_hdfs_train_dir):
        return False
    if not smt_training_tools.init_cfgs(_work_dir, _mt_dir, _hdfs_train_dir, _corpus_file, _is_monolingual,
                                        _src_lang_type, _tgt_lang_type, _dev_src_raw, _dev_ref_raw_list,
                                        _tst_src_raw, _tst_ref_raw_list, train_drt):
        return False

    train_truecasing = True
    if not smt_training_tools.pre_alignment_steps(_work_dir, _is_monolingual, _do_filter, train_truecasing,
                                                  _start_step, _end_step):
        return False

    if not smt_training_tools.alignment_rule_extraction_steps(_work_dir, _mt_dir, _do_filter, train_drt,
                                                              _start_step, _end_step):
        return False

    if not smt_training_tools.ruletable_filtering_steps(_work_dir, _mt_dir, _hdfs_train_dir, train_drt,
                                                        _start_step, _end_step):
        return False
    return True

def perform_training(args):
    try:
        training_cfg = args.config
        do_filter = not args.nofilter
        if not os.path.exists(training_cfg):
            logging.error('missing cfg file: ' + training_cfg)
            exit(-1)
        with codecs.open(training_cfg, 'rb', encoding=EAI_TRAIN_ENCODING) as fp:
            train_cfg = json.load(fp)

            work_dir = os.path.abspath(train_cfg[EAI_TRAIN_WORKDIR])
            mt_dir = os.path.abspath(train_cfg[EAI_TRAIN_MTDIR])
            hdfs_train_dir = train_cfg[EAI_TRAIN_HDFS_WORD_FOLDER]
            corpus_file = train_cfg[EAI_TRAIN_CORPUS]
            is_monolingual = False
            src_lang_type = train_cfg[EAI_SOURCE_LANGUAGE_TYPE]
            tgt_lang_type = train_cfg[EAI_TARGET_LANGUAGE_TYPE]
            dev_src_raw = train_cfg[EAI_TRAIN_DEV_SRC_RAW]
            dev_ref_raw_list = train_cfg[EAI_TRAIN_DEV_REF_RAW]
            tst_src_raw = train_cfg[EAI_TRAIN_TST_SRC_RAW]
            tst_ref_raw_list = train_cfg[EAI_TRAIN_TST_REF_RAW]
            train_drt = train_cfg[EAI_TRAIN_DIRECTION]
            start_step = args.start
            end_step = args.end
            if is_monolingual:
                logging.info('this procedure processes a monolingual corpus, '
                             'the language type is ' + src_lang_type)
            else:
                logging.info('this procedure processes a bilingual corpus, '
                             'the source language type is ' + src_lang_type +
                             ', the target language type is ' + tgt_lang_type)

            if src_lang_type not in EAI_MT_SUPPORT_LANG:
                logging.error('unsupported language type ' + src_lang_type)
                exit(-1)
            elif tgt_lang_type not in EAI_MT_SUPPORT_LANG:
                logging.error('unsupported language type ' + tgt_lang_type)
                exit(-1)
            elif tgt_lang_type == src_lang_type:
                logging.error('source language type is the same as target language type.')
                exit(-1)

            tmp_cmd = smt_training_tools.HADOOP_LS_CMD % corpus_file
            proc_ = subprocess.Popen(tmp_cmd, stdin=sys.stdin, stderr=sys.stderr, stdout=sys.stdout, shell=True)
            proc_.communicate()
            proc_.wait()
            if proc_.returncode == 0:
                logging.info(corpus_file + ' exists on hdfs.')
            else:
                logging.info(corpus_file + ' does not exist on hdfs.')
                if EAI_SOURCE_CORPUS in train_cfg and EAI_TARGET_CORPUS in train_cfg:
                    src_cps_local = train_cfg[EAI_SOURCE_CORPUS]
                    tgt_cps_local = train_cfg[EAI_TARGET_CORPUS]
                    if not merge_local_bilingal_to_hdfs(work_dir, hdfs_train_dir, src_cps_local, tgt_cps_local):
                        logging.error('failed to merge ' + src_cps_local + ' and ' + tgt_cps_local +
                                      ' to ' + hdfs_train_dir)
                        exit(-1)
                else:
                    logging.error('missing corpus on hdfs and local.')
                    exit(-1)

            pipline_steps(work_dir, mt_dir, hdfs_train_dir, corpus_file, is_monolingual,
                          src_lang_type, tgt_lang_type, dev_src_raw, dev_ref_raw_list,
                          tst_src_raw, tst_ref_raw_list, do_filter, train_drt, start_step, end_step)

    except StandardError:
        logging.error(traceback.format_exc())
        exit(-1)

def perform_tuning(args):
    try:
        training_cfg = args.config
        with codecs.open(training_cfg, 'rb', encoding=EAI_TRAIN_ENCODING) as fp:
            train_cfg = json.load(fp)
            work_dir = os.path.abspath(train_cfg[EAI_TRAIN_WORKDIR])
            mt_dir = os.path.abspath(train_cfg[EAI_TRAIN_MTDIR])
            hdfs_train_dir = train_cfg[EAI_TRAIN_HDFS_WORD_FOLDER]
            corpus_file = train_cfg[EAI_TRAIN_CORPUS]
            src_lang_type = train_cfg[EAI_SOURCE_LANGUAGE_TYPE]
            tgt_lang_type = train_cfg[EAI_TARGET_LANGUAGE_TYPE]
            dev_src_raw = train_cfg[EAI_TRAIN_DEV_SRC_RAW]
            dev_ref_raw_list = train_cfg[EAI_TRAIN_DEV_REF_RAW]
            tst_src_raw = train_cfg[EAI_TRAIN_TST_SRC_RAW]
            tst_ref_raw_list = train_cfg[EAI_TRAIN_TST_REF_RAW]
            train_drt = train_cfg[EAI_TRAIN_DIRECTION]
            if EAI_TRAIN_ARPA_LM in train_cfg:
                arpa_lm = train_cfg[EAI_TRAIN_ARPA_LM]
            else:
                arpa_lm = ''
            if not smt_training_tools.tuning_steps(work_dir, mt_dir, corpus_file, hdfs_train_dir,
                                                   src_lang_type, tgt_lang_type, dev_src_raw, tst_src_raw,
                                                   dev_ref_raw_list, tst_ref_raw_list, arpa_lm, train_drt):
                logging.error('tuning failed!')
                exit(-1)
            exit(0)
    except StandardError:
        logging.error(traceback.format_exc())
        exit(-1)

def perform_lm_training(args):
    training_cfg = args.config
    with codecs.open(training_cfg, 'rb', encoding=EAI_TRAIN_ENCODING) as fp:
        train_cfg = json.load(fp)
        work_dir = os.path.abspath(train_cfg[EAI_TRAIN_WORKDIR])
        mt_dir = os.path.abspath(train_cfg[EAI_TRAIN_MTDIR])
        hdfs_train_dir = train_cfg[EAI_TRAIN_HDFS_WORD_FOLDER]
        corpus_file = train_cfg[EAI_TRAIN_CORPUS]
        src_lang_type = train_cfg[EAI_SOURCE_LANGUAGE_TYPE]
        tgt_lang_type = train_cfg[EAI_TARGET_LANGUAGE_TYPE]
        drt = train_cfg[EAI_TRAIN_DIRECTION]
        step_name_list = [EAI_TRAIN_LANGUAGE_MODEL]
        if not smt_training_tools.init_cfg(work_dir, mt_dir, corpus_file, hdfs_train_dir, step_name_list,
                                           is_mono=False, src=src_lang_type, tgt=tgt_lang_type,
                                           dev_src_raw='', tst_src_raw='', dev_ref_list=[], tst_ref_list=[],
                                           direction=drt):
            logging.error('writing language model config failed!')
            return False
        lm_cfg_file = os.path.join(work_dir, MT_CFG_FILE_FOLDER, EAI_TRAIN_LANGUAGE_MODEL_CONF)
        if not train_language_model.lm_training_pipeline(lm_cfg_file):
            exit(-1)
    return

def perform_binary_model(args):
    try:
        binary_ruletable_cfg = args.config
        with codecs.open(binary_ruletable_cfg, 'rb', encoding=EAI_TRAIN_ENCODING) as fp:
            binary_cfg = json.load(fp)
            work_dir = os.path.abspath(binary_cfg[EAI_TRAIN_WORKDIR])
            mt_dir = os.path.abspath(binary_cfg[EAI_TRAIN_MTDIR])
            hdfs_train_dir = binary_cfg[EAI_TRAIN_HDFS_WORD_FOLDER]
            corpus_file = binary_cfg[EAI_TRAIN_CORPUS]
            src_lang_type = binary_cfg[EAI_SOURCE_LANGUAGE_TYPE]
            tgt_lang_type = binary_cfg[EAI_TARGET_LANGUAGE_TYPE]
            if not smt_training_tools.build_binary_ruletable_steps(work_dir, mt_dir, corpus_file, hdfs_train_dir,
                                                                   src_lang_type, tgt_lang_type):
                exit(-1)
    except StandardError:
        logging.error(traceback.format_exc())
        exit(-1)
    return

def perform_merge(args):
    if not merge_local_bilingal_to_hdfs(args.local, args.traindir, args.src, args.tgt):
        exit(-1)
    exit(0)

def main(args):
    if INIT_MODE == args.mode:
        perform_init(args)
    elif PREP_MODE == args.mode:
        perform_preprocess(args)
    elif TRAIN_MODE == args.mode:
        perform_training(args)
    elif TUNING_MODE == args.mode:
        perform_tuning(args)
    elif LM_MODE == args.mode:
        perform_lm_training(args)
    elif BIN_MODEL_MODE == args.mode:
        perform_binary_model(args)
    elif MERGE_MODE == args.mode:
        perform_merge(args)

if __name__ == '__main__':
    arg_parser = argparse.ArgumentParser(description='parse pipeline arguments from the command line')
    subparser = arg_parser.add_subparsers(dest='mode')

    init_parser = subparser.add_parser(INIT_MODE, help='init work path, --init=/path/to/work')
    merge_parser = subparser.add_parser(MERGE_MODE, help='merge src and tgt training data to hdfs.')
    pipe_parser = subparser.add_parser(TRAIN_MODE, help='trigger the training pipeline')
    prep_parser = subparser.add_parser(PREP_MODE, help='trigger the pre-process procedure')
    dev_prep_parser = subparser.add_parser(DEV_PREP_MODE, help='prepare tuning data (dev and tst)')
    tuning_parser = subparser.add_parser(TUNING_MODE, help='perform tuning')
    binary_model_parser = subparser.add_parser(BIN_MODEL_MODE, help='build binary model')
    lm_training_parser = subparser.add_parser(LM_MODE, help='train language model')

    # init arg parser
    init_parser.add_argument('--local', type=str, required=True, help='local work path')
    init_parser.add_argument('--mtdir', type=str, required=True, help='mt_bin path')

    # merge arg parser
    merge_parser.add_argument('--src', type=str, required=True, help='src training data')
    merge_parser.add_argument('--tgt', type=str, required=True, help='tgt training data')
    merge_parser.add_argument('--traindir', required=True, help='train path: HDFS path')
    merge_parser.add_argument('--local', type=str, required=True, help='local work path')

    # train arg parser
    pipe_parser.add_argument('--config', type=str, required=True,
                             help='config for training, including dev and tst data')
    pipe_parser.add_argument('--start', type=int, required=True,
                             help='the start step, can be any of: ' + TRAINING_STEPS_HELP)
    pipe_parser.add_argument('--end', type=int, required=True,
                             help='the end step')
    pipe_parser.add_argument('--no-filter', dest='nofilter', action='store_true',
                             help='skip corpus filtering; default is false (do filtering)')

    # prep arg parser
    prep_parser.add_argument('--config', type=str, required=True,
                             help='config for dev and tst data, or monolingual data')
    prep_parser.add_argument('--start', type=int, required=True,
                             help='the start step')
    prep_parser.add_argument('--end', type=int, required=True,
                             help='the end step')
    prep_parser.add_argument('--filter', dest='filter', action='store_true',
                             help='filter the corpus; default is false (no filtering)')

    tuning_parser.add_argument('--config', type=str, required=True,
                               help='config for training, including dev and tst data')
    binary_model_parser.add_argument('--config', type=str, required=True,
                                     help='config for training, including the local and HDFS work dir')
    lm_training_parser.add_argument('--config', type=str, required=True,
                                    help='config for training, including the local and HDFS work dir')
    opts = arg_parser.parse_args()
    main(opts)
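
# Example invocations of the sub-commands defined above (a sketch; every path
# and filename below is hypothetical):
#
#     python smt_training_pipline.py init --local /home/user/smt_work --mtdir /opt/mt_bin
#     python smt_training_pipline.py merge --src src.txt --tgt tgt.txt \
#         --traindir hdfs://cluster/train --local /home/user/smt_work
#     python smt_training_pipline.py preprocess --config train_cfg.json --start 1 --end 6 --filter
#     python smt_training_pipline.py train --config train_cfg.json --start 1 --end 10
#     python smt_training_pipline.py tuning --config train_cfg.json
#     python smt_training_pipline.py lm --config train_cfg.json
#     python smt_training_pipline.py binary_model --config train_cfg.json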