test - luckystar1992/ERM GitHub Wiki

# -*- coding: utf-8 -*-

# jieba is not used anywhere in this snippet. The import is kept for parity
# with the original page, but a missing optional package must not abort the demo.
try:
    import jieba
except ImportError:
    jieba = None

# Sample text: one '#...#' label followed by bracketed emoji names, with stray
# brackets mixed in to exercise the scanner.
emoji = '#这是一个标签#你好[高兴]][[测试],'

# Positions of the opening / closing bracket of every detected emoji.
emoji_left, emoji_right = [], []
# Scanner state: has an unmatched '[' been seen / has its ']' been seen.
emoji_find_left, emoji_find_right = False, False

# Positions of every '#' found in the text.
label_index = []

for index, word in enumerate(emoji):
    if word == '[':
        if emoji_find_left:
            # A second '[' before any ']' restarts the candidate emoji here.
            emoji_left[-1] = index
        else:
            emoji_left.append(index)
            emoji_find_left = True
    elif word == ']' and emoji_find_left:
        # A ']' only counts when an opening bracket is pending; stray ones
        # fall through and are ignored.
        if emoji_find_right:
            emoji_right[-1] = index
        else:
            emoji_right.append(index)
            emoji_find_right = True
    elif word == "#":
        label_index.append(index)

    # The reset has to run on every iteration: once a '[...]' pair is
    # complete, clear both flags so the next '[' starts a new emoji instead
    # of overwriting the one just recorded.
    if emoji_find_left and emoji_find_right:
        emoji_find_left, emoji_find_right = False, False

# zip() drops a dangling '[' that never got its closing bracket.
for (_left, _right) in zip(emoji_left, emoji_right):
    print(emoji[_left:_right + 1])

# Labels are delimited by consecutive pairs of '#'. Pairing via slices means
# an unmatched trailing '#' is skipped instead of raising IndexError.
for (_start, _end) in zip(label_index[0::2], label_index[1::2]):
    print(_start)
    print(_end)
    print(emoji[_start:_end + 1])

# -*- coding: utf-8 -*-
from collections import Counter


class Corpus(object):
    """Vocabulary and frequency table built from a space-separated corpus file.

    Prerequisite: the corpus file must already be filtered/cleaned; this class
    only splits each line on single spaces and does no further normalisation.

    Attributes set by ``__init__``:
        dictionary: set of distinct words found in the file.
        counter: ``collections.Counter`` mapping word -> occurrence count.
        word2index: dict mapping word -> integer index.
        index2word: dict mapping integer index -> word (inverse of word2index).
    """

    def __init__(self, corpus_file):
        """Read ``corpus_file`` (UTF-8) and build the vocabulary tables.

        Args:
            corpus_file: path of the corpus file; each line holds words
                separated by single spaces.

        Raises:
            OSError: if the file cannot be opened.
        """
        self._corpus_file = corpus_file
        words = []
        with open(self._corpus_file, encoding="utf8") as f:
            for line in f:
                # Blank lines and doubled spaces yield empty tokens; drop them
                # so "" never becomes a vocabulary entry.
                words.extend(word for word in line.strip().split(" ") if word)
        self.dictionary = set(words)
        self.counter = Counter(words)
        # Number words in first-occurrence order (Counter keeps insertion
        # order) so indices are reproducible between runs; enumerating the
        # set would follow its hash-dependent iteration order.
        self.word2index = {word: index for index, word in enumerate(self.counter)}
        self.index2word = {index: word for word, index in self.word2index.items()}

    def seperate(self):
        """Placeholder for corpus separation.

        Currently only opens the corpus file and walks over its lines without
        producing anything; returns None.
        """
        with open(self._corpus_file, encoding="utf8") as f:
            for _ in f:
                pass

    @property
    def w2i(self):
        """Short alias for the word -> index dict."""
        return self.word2index

    @property
    def i2w(self):
        """Short alias for the index -> word dict."""
        return self.index2word

    def sen2i(self, sentence):
        """Return the list of indices for the items of ``sentence``.

        ``sentence`` is iterated directly, so a plain string is looked up
        character by character while a list is looked up item by item.

        Raises:
            KeyError: if an item is not in the vocabulary.
        """
        return [self.word2index[word] for word in sentence]

    def i2sen(self, index_list):
        """Return the words for ``index_list`` joined without a separator.

        Raises:
            KeyError: if an index is not in the vocabulary.
        """
        return "".join(self.index2word[index] for index in index_list)

    def saveMeta(self, meta_file="counter.txt"):
        """Append the word frequency table to ``meta_file``.

        One ``word:count`` line is written per word, most frequent first.
        The file is opened in append mode, so repeated calls add further
        copies of the table rather than replacing it.

        Args:
            meta_file: output path. The previous hard-coded path was the empty
                string, which can never be opened.

        Raises:
            OSError: if ``meta_file`` cannot be opened for writing.
        """
        with open(meta_file, 'a', encoding="utf8") as f:
            f.writelines(
                '%s:%s\n' % (word, count)
                for word, count in self.counter.most_common()
            )

# Script entry point: build a Corpus from 'corpus.txt' in the working directory.
if __name__ == "__main__":
    corpus = Corpus('corpus.txt')
    # Disabled usage examples.
    # NOTE: Corpus defines saveMeta, not saveCouter, so the third call would
    # need renaming before it can be enabled.
    # print(corpus.w2i['你'])
    # print(corpus.i2w[1])
    # corpus.saveCouter("counter.txt")
    #
    # print(corpus.sen2i('测试,你好,你好'))
    # print(corpus.i2sen([1,2,3,4,2,3]))