import re
# 正则对字符串的清洗
# Characters treated as noise: ASCII letters, digits and punctuation, plus
# common full-width / CJK punctuation marks.
# NOTE(review): the original pattern literal was corrupted (an unescaped quote
# terminated the string and an unescaped ']' closed the character class early),
# so the CJK punctuation set below is reconstructed from the stated intent
# ("strip special symbols, punctuation, English letters, digits") -- extend or
# trim it to match the corpus.
_NOISE_RE = re.compile(
    r"""[a-zA-Z0-9!"#$%&'()*+,\-./:;<=>?@\[\\\]^_`{|}~"""
    r"""’,。、;:?!…—☆★【】《》()“”‘]+"""
)

# Any run of whitespace (spaces, tabs, newlines, ...).
_WHITESPACE_RE = re.compile(r"\s+")


def textParse(str_doc):
    """Clean a raw document string with regular expressions.

    Every run of ASCII letters, digits and punctuation marks (ASCII and
    common full-width/CJK ones) is replaced by a single space, then every
    run of whitespace is collapsed to a single space.

    Args:
        str_doc: The raw text to clean.

    Returns:
        The cleaned text. Leading/trailing single spaces are kept when the
        input started/ended with removed characters.
    """
    # Replace noise runs with a space (rather than deleting them) so the text
    # on either side of the removed run does not get glued together.
    str_doc = _NOISE_RE.sub(" ", str_doc)
    # Collapsing whitespace also turns newlines into spaces, so no separate
    # newline-removal step is needed afterwards (the original tried
    # replace('n', ''), which targeted the letter "n" and was a no-op here).
    str_doc = _WHITESPACE_RE.sub(" ", str_doc)
    return str_doc
def readFile(path):
    """Read a UTF-8 text file and return its entire content.

    Args:
        path: Path of the file to read.

    Returns:
        The full file content as a single string.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    # The with-statement closes the file automatically, even on error.
    with open(path, 'r', encoding='utf-8') as f:
        return f.read()
if __name__ == '__main__':
    # 1. Read the raw text of one sample document.
    path = r'../dataSet/CSCMNews/体育/0.txt'
    str_doc = readFile(path)
    # 2. Clean the text and show the result.
    mystr = textParse(str_doc)
    print(mystr)
# Note kept from the original file: everything above is the in-class example code.
# 1. 使用 with open(...) 打开文件时,with 代码块结束后文件会自动关闭,不需要我们手动调用 close()。