- 帖子
- 10
- 精华
- 0
- 积分
- 34
- 阅读权限
- 10
- 注册时间
- 2017-12-14
- 最后登录
- 2018-7-26
|
# -*- coding: utf-8 -*-
import sys,re
import importlib
importlib.reload(sys)
text = open('C:/Users/Administrator/Desktop/wordstata.txt','rb').read()
wfile=open('result.txt','w')
txet = text.decode('utf-8')
r = re.compile('[\x80-\xff]+')
m = r.findall(text)
dict={}
z1 = re.compile('[\x80-\xff]{2}')
z2 = re.compile('[\x80-\xff]{4}')
z3 = re.compile('[\x80-\xff]{6}')
z4 = re.compile('[\x80-\xff]{8}')
for i in m:
x = i.encode('gb18030')
i = z1.findall(x)
for j in i:
if(j in dict):
dict[j]+=1
dict=sorted(dict.items(), key=lambda d:d[1])
for a,b in dict:
if b>0:
wfile.write(a+','+str(b)+'\n')
Traceback (most recent call last):
File "C:/Users/Administrator/Desktop/wordstatabytxt.py", line 10, in <module>
txet = text.decode('utf-8')
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte
|
|