[python] filtering malformed utf8 character sequence(string) - dsindex/blog GitHub Wiki
# filtering malformed utf8 sequence
try : n_line = line.decode('utf-8','ignore').encode('utf-8')
except Exception, e :
sys.stderr.write(str(e) + '\t' + line + '\n')
continue
if n_line != line :
sys.stderr.write('malformed utf8 sequence : %s\n' % (line))
continue
def check_invalid_utf8mb4(s):
    """Return True if *s* contains a UTF-8 sequence led by a byte >= 0xF0.

    The scan walks the UTF-8 bytes of *s*, reading only the lead byte of
    each sequence and skipping ahead by the width that byte announces
    (1, 2 or 3 bytes).  A lead byte of 0xF0 or above announces a sequence
    of four or more bytes and makes the function return True.

    Args:
        s: a UTF-8 encoded byte string (``str`` on Python 2, ``bytes`` or
            ``bytearray`` on Python 3), or a text string, which is encoded
            to UTF-8 before scanning.

    Returns:
        True if a 4-byte (or longer) lead byte is found, otherwise False.
        An empty input returns False.

    Note:
        Continuation bytes are never validated, so the input is assumed to
        be well-formed UTF-8 already.
        NOTE(review): presumably used to flag text that a 3-byte-only store
        (e.g. MySQL's legacy ``utf8`` charset) cannot hold -- confirm
        against callers.
    """
    if not isinstance(s, (bytes, bytearray)):
        # Text input: scan its UTF-8 bytes rather than its code points.
        # 'replace' keeps a lone surrogate from raising; a surrogate lies in
        # the BMP, so it can never be a 4-byte character anyway.
        s = s.encode('utf-8', 'replace')

    # bytearray yields ints on both Python 2 and Python 3, whereas indexing
    # a Python 3 bytes object already yields ints and breaks ord().
    data = bytearray(s)
    pos = 0
    size = len(data)
    while pos < size:
        lead = data[pos]
        if lead <= 0x7F:
            # 1-byte (ASCII)
            pos += 1
        elif lead < 0xE0:
            # 2-byte sequence
            pos += 2
        elif lead < 0xF0:
            # 3-byte sequence
            pos += 3
        else:
            # 4-byte or longer sequence
            return True
    return False