# Count how often each non-ASCII token (for example a Chinese character)
# occurs in a text file and write one "token,count" line per token,
# least frequent first.

INPUT_PATH = 'C:/Users/Administrator/Desktop/wordstata.txt'
OUTPUT_PATH = 'result.txt'
# The original script converted tokens with .encode('gb18030') before
# writing them, so the report is written in that encoding.
OUTPUT_ENCODING = 'gb18030'
# Number of characters per counted token. The original compiled 2/4/6/8-byte
# patterns (1-4 double-byte characters) but only used the first one.
NGRAM_SIZE = 1

# A run of consecutive non-ASCII characters. This is the str equivalent of
# the original byte pattern '[\x80-\xff]+', which only makes sense on
# undecoded double-byte data.
_NON_ASCII_RUN = re.compile(r'[^\x00-\x7f]+')


def decode_text(raw: bytes) -> str:
    """Decode the bytes of a text file into str, honouring a leading BOM.

    A file whose first byte is 0xff (as in the reported traceback) carries a
    UTF-16 byte-order mark and cannot be decoded as UTF-8.

    Args:
        raw: Complete file contents.

    Returns:
        The decoded text without any byte-order mark.

    Raises:
        UnicodeDecodeError: If the data is valid in none of the attempted
            encodings.
    """
    if raw.startswith((b'\xff\xfe', b'\xfe\xff')):
        # The 'utf-16' codec consumes the BOM and picks the byte order.
        return raw.decode('utf-16')
    if raw.startswith(b'\xef\xbb\xbf'):
        return raw.decode('utf-8-sig')
    try:
        return raw.decode('utf-8')
    except UnicodeDecodeError:
        # No BOM and not UTF-8: fall back to the encoding the original
        # script was written around.
        return raw.decode('gb18030')


def count_ngrams(text: str, size: int = 1) -> dict:
    """Count non-overlapping tokens of *size* characters in non-ASCII runs.

    Each maximal run of non-ASCII characters is cut from its start into
    consecutive chunks of *size* characters; a shorter leftover at the end of
    a run is ignored (the same result a fixed-width ``findall`` gives).

    Args:
        text: Decoded text to analyse.
        size: Characters per token; must be at least 1.

    Returns:
        A dict mapping each token to its number of occurrences.

    Raises:
        ValueError: If *size* is smaller than 1.
    """
    if size < 1:
        raise ValueError('size must be at least 1')
    counts = {}
    for run in _NON_ASCII_RUN.findall(text):
        for start in range(0, len(run) - size + 1, size):
            token = run[start:start + size]
            # .get() supplies the initial 0; the original only incremented
            # keys that already existed, so nothing was ever counted.
            counts[token] = counts.get(token, 0) + 1
    return counts


def main() -> None:
    """Read INPUT_PATH, count its tokens and write the report to OUTPUT_PATH."""
    with open(INPUT_PATH, 'rb') as src:
        text = decode_text(src.read())
    # Ascending by count, as in the original sort.
    ranked = sorted(count_ngrams(text, NGRAM_SIZE).items(),
                    key=lambda item: item[1])
    with open(OUTPUT_PATH, 'w', encoding=OUTPUT_ENCODING) as out:
        out.writelines('{},{}\n'.format(token, count)
                       for token, count in ranked)


if __name__ == '__main__':
    main()
Traceback (most recent call last):
File "C:/Users/Administrator/Desktop/wordstatabytxt.py", line 10, in <module>
txet = text.decode('utf-8')
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte