test - luckystar1992/ERM GitHub Wiki
# -*- coding: utf-8 -*-
import jieba

# Sample text: one "#...#" label, two bracketed emoji names and some stray
# brackets ("]]", "[[") that the scan below has to tolerate.
emoji = '#这是一个标签#你好[高兴]][[测试],'

emoji_left, emoji_right = [], []  # positions of paired '[' and ']'
emoji_find_left = False           # True while a '[' is waiting for its ']'
label_index = []                  # positions of every '#'

for index, word in enumerate(emoji):
    if word == '[':
        if emoji_find_left:
            # A newer '[' supersedes the pending one, so "[[x]" yields "[x]".
            emoji_left[-1] = index
        else:
            emoji_left.append(index)
            emoji_find_left = True
    elif word == ']' and emoji_find_left:
        # The first ']' after a pending '[' closes the pair. A ']' with no
        # pending '[' does not satisfy this condition and is ignored.
        emoji_right.append(index)
        emoji_find_left = False
    elif word == "#":
        label_index.append(index)

# zip() stops at the shorter list, so a trailing unclosed '[' is dropped.
for _left, _right in zip(emoji_left, emoji_right):
    print(emoji[_left:_right + 1])

# Pair the '#' positions as (opening, closing). Zipping the even and odd
# entries leaves an unmatched trailing '#' out instead of raising IndexError.
for _start, _end in zip(label_index[::2], label_index[1::2]):
    print(_start)
    print(_end)
    print(emoji[_start:_end + 1])
# -*- coding: utf-8 -*-
from collections import Counter


class Corpus(object):
    """Vocabulary and word-frequency statistics for one tokenized corpus file.

    Prerequisite: the corpus file must already be filtered/cleaned. It is read
    as UTF-8 text in which the words of each line are separated by a single
    space.
    """

    def __init__(self, corpus_file):
        """Read ``corpus_file`` and build the vocabulary, counts and index maps.

        Args:
            corpus_file: Path to the UTF-8 corpus file.

        Raises:
            OSError: If the corpus file cannot be opened.
        """
        self._corpus_file = corpus_file
        words = []
        with open(self._corpus_file, encoding="utf8") as f:
            for line in f:
                # Blank lines and doubled spaces make split(" ") produce
                # empty strings; those are not words, so drop them.
                words.extend(word for word in line.strip().split(" ") if word)
        self.dictionary = set(words)
        self.counter = Counter(words)
        # Number the words in order of first appearance (Counter keeps
        # insertion order). Enumerating the set directly would hand out
        # different indices on every run because str hashing is randomized.
        self.index2word = dict(enumerate(self.counter))
        self.word2index = {word: index for index, word in self.index2word.items()}

    def seperate(self):
        """Separate the corpus (placeholder).

        Currently this only reads through the corpus file; no processing is
        done and nothing is returned.
        """
        with open(self._corpus_file, encoding="utf8") as f:
            for _line in f:
                pass

    @property
    def w2i(self):
        """The word -> index mapping."""
        return self.word2index

    @property
    def i2w(self):
        """The index -> word mapping."""
        return self.index2word

    def sen2i(self, sentence):
        """Return the list of indices for the items of ``sentence``.

        Args:
            sentence: Iterable of vocabulary words. A plain string is iterated
                character by character.

        Raises:
            KeyError: If an item is not in the vocabulary.
        """
        return [self.word2index[word] for word in sentence]

    def i2sen(self, index_list):
        """Return the sentence for ``index_list``, joined without separators.

        Args:
            index_list: Iterable of vocabulary indices.

        Raises:
            KeyError: If an index is not in the vocabulary.
        """
        return "".join(self.index2word[index] for index in index_list)

    def saveMeta(self, meta_file="counter.txt"):
        """Write the word counts to ``meta_file``, most frequent first.

        One ``word:count`` pair is written per line.

        Args:
            meta_file: Destination path; the file is overwritten if it
                already exists.
        """
        with open(meta_file, "w", encoding="utf8") as f:
            f.writelines(
                "%s:%s\n" % (word, count)
                for word, count in self.counter.most_common()
            )
# Script entry point: build a Corpus from 'corpus.txt' in the working directory.
if __name__ == "__main__":
    corpus = Corpus('corpus.txt')
    # Usage examples, left disabled.
    # NOTE(review): Corpus defines no saveCouter method in this file;
    # saveMeta looks like the intended call -- confirm before enabling.
    # print(corpus.w2i['你'])
    # print(corpus.i2w[1])
    # corpus.saveCouter("counter.txt")
    #
    # print(corpus.sen2i('测试,你好,你好'))
    # print(corpus.i2sen([1,2,3,4,2,3]))