Data Deduplication with LSH - peter-xbs/CommonCodes GitHub Wiki
A practical workflow for LSH-based deduplication
1. Define the iterator
```python
def case_iter():
    # kept_lines: a list of JSON records (each with a 'content' field) to deduplicate.
    for idx, js in enumerate(kept_lines):
        doc = js['content'].strip()
        yield {"id": idx, "doc": doc, "js": js}
```
2. Import the dependencies
```python
import time  # needed for the timing in step 4
from hashlib import sha1
import numpy as np
from datasketch.minhash import MinHash
from datasketch.weighted_minhash import WeightedMinHashGenerator
from datasketch.lsh import MinHashLSH
```
3. Define the helper functions
```python
def build_db2(iterator, num_perm):
    # Index every document; pairs whose estimated Jaccard similarity
    # exceeds 0.5 end up as candidate duplicates.
    lsh = MinHashLSH(threshold=0.5, num_perm=num_perm)
    for item in iterator:
        doc_id, doc_text = item['id'], item['doc']
        cur = MinHash(num_perm=num_perm)
        # Character shingles over the first 50 characters of the document.
        for s in doc_text[:50]:
            cur.update(s.encode('utf8'))
        lsh.insert(doc_id, cur)
    return lsh
```
```python
def query_db2(item, lsh, num_perm):
    # Build the MinHash of one document and look up its near-duplicates,
    # excluding the document itself.
    doc_id, doc_text = item['id'], item['doc']
    cur = MinHash(num_perm=num_perm)
    for s in doc_text[:50]:
        cur.update(s.encode('utf8'))
    res = lsh.query(cur)
    res = [x for x in res if x != doc_id]
    return {'query': doc_id, 'sim': res}
```
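For a quick sanity check, `query_db2` can be run on a single record once the index from `build_db2` exists; the ids in `'sim'` are the other documents that land in the same LSH buckets. A usage sketch, assuming `lsh` and `kept_lines` are already defined as above:

```python
# Assumes `lsh` was built with build_db2(case_iter(), num_perm=16), as in step 4.
sample = {"id": 0, "doc": kept_lines[0]['content'].strip()}
hit = query_db2(sample, lsh, num_perm=16)
print(hit)  # e.g. {'query': 0, 'sim': [17, 4203]} -- ids of near-duplicate documents
```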
```python
def dedup2(lsh, itera, outlist, num_perm=16):
    # Greedy deduplication: keep the first occurrence of each near-duplicate
    # group and mark all of its LSH neighbours as already seen.
    # num_perm must match the value used in build_db2.
    kept_set = set()
    cnt = 0
    for item in itera:
        cnt += 1
        if item['id'] in kept_set:
            continue
        out = query_db2(item, lsh, num_perm=num_perm)
        kept_set.add(item['id'])
        kept_set.update(out['sim'])
        outlist.append(item)
    print('proc finished: whole_ids: {}; dedup_ids: {}'.format(cnt, len(outlist)))
```
4. Run and time it
```python
st = time.time()
lsh = build_db2(case_iter(), num_perm=16)
ed = time.time()
print('index build time: {:.2f}s'.format(ed - st))

topic_list1 = []
dedup2(lsh, case_iter(), topic_list1)
```
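If the surviving records need to be persisted rather than used in memory, `topic_list1` can be written back out as JSON Lines. A minimal sketch; the output path `dedup.jsonl` is hypothetical:

```python
import json

# 'dedup.jsonl' is a hypothetical output path; one original JSON record per line.
with open('dedup.jsonl', 'w', encoding='utf8') as f:
    for item in topic_list1:
        f.write(json.dumps(item['js'], ensure_ascii=False) + '\n')
```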
5. Analysis and conclusions
The final deduplicated records end up in `topic_list1` and can be used directly. Overall, corpora on the order of 500K documents finish in acceptable time; for substantially larger datasets the runtime can become prohibitive. In that case, a practical option is to run a fast clustering pass first and then apply the deduplication inside each cluster, as sketched below.
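A minimal sketch of that cluster-then-dedup idea, assuming scikit-learn is available; the character-level `HashingVectorizer` features, `MiniBatchKMeans`, and the number of clusters are all assumptions for illustration, not part of the original pipeline:

```python
from collections import defaultdict

from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_extraction.text import HashingVectorizer

def clustered_dedup(records, n_clusters=100, num_perm=16):
    # records: a list of {"id": ..., "doc": ..., "js": ...} dicts, e.g. list(case_iter()).
    docs = [r['doc'] for r in records]
    # Cheap, vocabulary-free character features for the rough clustering pass.
    X = HashingVectorizer(analyzer='char', n_features=2**16).fit_transform(docs)
    labels = MiniBatchKMeans(n_clusters=n_clusters).fit_predict(X)

    buckets = defaultdict(list)
    for rec, label in zip(records, labels):
        buckets[label].append(rec)

    deduped = []
    for bucket in buckets.values():
        # Reuse the functions above on each (much smaller) cluster.
        lsh = build_db2(iter(bucket), num_perm=num_perm)
        dedup2(lsh, iter(bucket), deduped, num_perm=num_perm)
    return deduped

# Usage: deduped = clustered_dedup(list(case_iter()), n_clusters=200)
```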