import re

# Text cleaning with regular expressions.
#
# Characters treated as noise by textParse: ASCII letters and digits, ASCII
# punctuation, and common full-width / CJK punctuation.  re.escape() makes
# every symbol literal inside the character class, so '-', ']', '^' and '\'
# cannot form accidental ranges or close the class early.
_ASCII_PUNCTUATION = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
# NOTE(review): the full-width entries are an assumption -- the pasted pattern
# contained duplicated ':' and ';', which presumably were full-width variants
# before the text was normalised.  Adjust this set to match the corpus.
_CJK_PUNCTUATION = "，。？！：；、☆★“”‘’（）《》【】…—"
_NOISE_RE = re.compile(
    "[a-zA-Z0-9"
    + re.escape(_ASCII_PUNCTUATION)
    + re.escape(_CJK_PUNCTUATION)
    + "]+"
)
_WHITESPACE_RE = re.compile(r"\s+")


def textParse(str_doc: str) -> str:
    """Remove letters, digits and punctuation from *str_doc*.

    Every run of noise characters (ASCII letters, digits, ASCII and common
    CJK punctuation) is replaced by a single space; afterwards every run of
    whitespace, including newlines, is collapsed to a single space.  Leading
    and trailing spaces are not stripped.

    Args:
        str_doc: The raw text to clean.

    Returns:
        The cleaned text.
    """
    str_doc = _NOISE_RE.sub(" ", str_doc)
    # Newlines count as whitespace, so collapsing whitespace also removes
    # them; a separate newline replacement would never find anything.
    return _WHITESPACE_RE.sub(" ", str_doc)


def readFile(path: str) -> str:
    """Return the full content of the UTF-8 encoded text file at *path*.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # The with-statement closes the file even if reading fails.
    with open(path, "r", encoding="utf-8") as f:
        return f.read()


if __name__ == '__main__':
    # 1. Read the text.
    path = r'../dataSet/CSCMNews/体育/0.txt'
    str_doc = readFile(path)
    # 2. Clean the data.
    mystr = textParse(str_doc)
    print(mystr)
以上是课堂代码
1. 使用 with open(...) 打开文件时，离开 with 代码块后文件会自动关闭，不需要我们手动调用 close()。