Data Deduplication Strategy Based on LSH - peter-xbs/CommonCodes GitHub Wiki

A practical workflow for LSH-based deduplication

1. Define the iterator

def case_iter():
    # kept_lines: the in-memory corpus, a list of dicts each carrying a 'content' field
    for idx, js in enumerate(kept_lines):
        doc = js['content'].strip()
        yield {"id": idx, "doc": doc, "js": js}
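The iterator above assumes `kept_lines` is already loaded in memory. A minimal sketch of one way to build it, assuming the records live in a JSONL file (the file name below is illustrative only, not part of the original pipeline):

import json

kept_lines = []
with open('corpus.jsonl', encoding='utf8') as f:  # hypothetical input path
    for line in f:
        kept_lines.append(json.loads(line))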

2. Import dependencies

import time

from datasketch.minhash import MinHash
from datasketch.lsh import MinHashLSH
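Before wiring things together, a minimal self-contained sketch of the two datasketch primitives used below: MinHash estimates the Jaccard similarity between token sets, and MinHashLSH retrieves indexed keys whose estimated similarity exceeds the threshold. The strings here are illustrative only.

m1, m2 = MinHash(num_perm=16), MinHash(num_perm=16)
for ch in "acute upper respiratory infection":
    m1.update(ch.encode('utf8'))
for ch in "acute upper respiratory infection with cough":
    m2.update(ch.encode('utf8'))
print(m1.jaccard(m2))        # estimated Jaccard similarity of the two character sets

lsh_demo = MinHashLSH(threshold=0.5, num_perm=16)
lsh_demo.insert('doc1', m1)
print(lsh_demo.query(m2))    # ['doc1'] if the estimated similarity is >= 0.5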

3. Define the helper functions

def build_db2(iterator, num_perm):
    """Index a MinHash signature for every document in an LSH structure."""
    lsh = MinHashLSH(threshold=0.5, num_perm=num_perm)
    for item in iterator:
        doc_id, doc_text = item['id'], item['doc']
        cur = MinHash(num_perm=num_perm)
        # character-level shingles over the first 50 characters of the document
        for s in doc_text[:50]:
            cur.update(s.encode('utf8'))
        lsh.insert(doc_id, cur)
    return lsh

def query_db2(item, lsh, num_perm):
    """Return the ids of indexed documents similar to this item, excluding itself."""
    doc_id, doc_text = item['id'], item['doc']
    cur = MinHash(num_perm=num_perm)
    for s in doc_text[:50]:
        cur.update(s.encode('utf8'))
    res = lsh.query(cur)
    res = [x for x in res if x != doc_id]
    return {'query': doc_id, 'sim': res}

def dedup2(lsh, itera, outlist, num_perm=16):
    """Keep the first occurrence of each near-duplicate group in outlist."""
    kept_set = set()  # ids already kept, or already covered by a kept document
    cnt = 0
    for item in itera:
        cnt += 1
        if item['id'] in kept_set:
            continue
        # num_perm must match the value used when building the LSH index
        out = query_db2(item, lsh, num_perm=num_perm)
        kept_set.add(item['id'])
        kept_set.update(out['sim'])
        outlist.append(item)
    print('proc finished: whole_ids: {}; dedup_ids: {}'.format(cnt, len(outlist)))
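An optional standalone check of the query function, assuming the index has been built over the same iterator: it returns the ids the LSH index considers near-duplicates of the given item.

lsh = build_db2(case_iter(), num_perm=16)
first = next(case_iter())
print(query_db2(first, lsh, num_perm=16))   # e.g. {'query': 0, 'sim': [...]}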

4. Run & timing

st = time.time()
lsh = build_db2(case_iter(), num_perm=16)
ed = time.time()
print('build time: {:.1f}s'.format(ed - st))

topic_list1 = []
dedup2(lsh, case_iter(), topic_list1)

5. Analysis & conclusion

The deduplicated results end up in topic_list1 and can be used directly. At the 500K-document scale the whole process finishes in acceptable time; with substantially more data, runtime may become prohibitive. For much larger corpora, a practical option is to run a fast clustering step first and then apply the dedup routine inside each cluster.
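A minimal sketch of that cluster-then-dedup idea, assuming a very coarse bucketing key (first character plus a length band) stands in for the fast clustering step; any real clustering scheme could replace it. Each bucket gets its own LSH index, so index size and query cost stay bounded.

from collections import defaultdict

def dedup_by_bucket(num_perm=16):
    # hypothetical bucketing: first character + length band as a coarse cluster key
    buckets = defaultdict(list)
    for item in case_iter():
        key = (item['doc'][:1], len(item['doc']) // 200)
        buckets[key].append(item)
    deduped = []
    for items in buckets.values():
        lsh = build_db2(iter(items), num_perm=num_perm)
        dedup2(lsh, iter(items), deduped, num_perm=num_perm)
    return deduped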