[python] filtering malformed utf8 character sequence(string) - dsindex/blog GitHub Wiki

  • malformed utf8
# filtering malformed utf8 sequence
#
# Python 2 snippet (note the `except Exception, e` syntax). It uses
# `continue`, so it is presumably the body of a loop over input lines;
# the enclosing loop, `line` and the `sys` import are not shown here.
#
# Round-trip the byte string through UTF-8. The 'ignore' error handler
# drops any bytes that cannot be decoded instead of raising.
try : n_line = line.decode('utf-8','ignore').encode('utf-8')
except Exception, e :
    # Unexpected failure: write the error and the offending line to stderr,
    # then move on to the next iteration.
    sys.stderr.write(str(e) + '\t' + line + '\n')
    continue
# If the round trip changed the data, some bytes were dropped, i.e. the
# input was not well-formed UTF-8: report it and move on.
if n_line != line :
    sys.stderr.write('malformed utf8 sequence : %s\n' % (line))
    continue
  • invalid utf8mb4 (detecting 4-byte UTF-8 sequences)
def check_invalid_utf8mb4(s):
    """Return True if *s* contains a UTF-8 sequence of four or more bytes.

    The input is scanned one encoded character at a time, using the lead
    byte to decide how many bytes to skip. As soon as a lead byte >= 0xF0
    (the start of a 4-byte sequence, or an invalid longer form) is seen,
    the function returns True. Otherwise it returns False, including for
    empty input.

    *s* may be a UTF-8 encoded byte string (``str`` on Python 2, ``bytes``
    or ``bytearray`` on Python 3) or text (``unicode`` on Python 2, ``str``
    on Python 3). Text is encoded to UTF-8 first so that the byte-length
    rules below apply to it as well; text that cannot be encoded raises
    ``UnicodeEncodeError``.

    NOTE: byte input is assumed to be well-formed UTF-8. Continuation bytes
    are not validated; any byte in 0x80-0xDF found at a character boundary
    is simply treated as the start of a 2-byte sequence.
    NOTE(review): presumably used to find text that does not fit a 3-byte
    "utf8" database column -- confirm against the caller.
    """
    if not isinstance(s, (bytes, bytearray)):
        # Text: the checks below are about encoded bytes, not code points.
        s = s.encode('utf-8')
    # bytearray yields integers on both Python 2 and Python 3, so no ord()
    # call (which fails on Python 3 bytes elements) is needed.
    data = bytearray(s)
    i = 0
    j = len(data)
    while i < j:
        k = data[i]
        if k <= 127:
            # 1-byte sequence (ASCII)
            i += 1
        elif k < 224:
            # 2-byte sequence
            i += 2
        elif k < 240:
            # 3-byte sequence
            i += 3
        else:
            # 4-byte (or longer, invalid) sequence
            return True
    return False