# 1. Extract stop words and training text from files
def read_data():
    """Read the stop-word list and the corpus, returning a flat token list.

    Loads stop words from ``data/stop_words.txt`` (one word per line),
    then walks ``data/materials`` for ``.txt`` files, strips newlines and
    spaces from each line, segments it with jieba, and keeps every token
    that is not a stop word.

    Returns:
        list[str]: all non-stop-word tokens, in file/line order.
    """
    # Read stop words, one per line; rstrip is safe even if the last
    # line has no trailing newline (the original line[:-1] would have
    # eaten its final character).
    with open("data/stop_words.txt", "r", encoding="utf-8") as fStopWords:
        stop_words = {line.rstrip("\n") for line in fStopWords}
    print("停用词读取完毕,共{n}个词".format(n=len(stop_words)))

    # Collect every .txt file under the corpus folder.
    s_folder_path = "data/materials"
    ls_files = []
    for root, dirs, files in os.walk(s_folder_path):
        for file in files:
            if file.endswith(".txt"):
                ls_files.append(os.path.join(root, file))

    raw_word_list = []
    for item in ls_files:
        with open(item, "r", encoding="utf-8") as f:
            # Iterating the file object avoids the original readline loop,
            # which re-read via a typo'd variable (`lin`) and never advanced.
            for line in f:
                # Drop newlines and spaces before segmentation.
                line = line.replace("\n", "").replace(" ", "")
                # Skip empty lines.
                if line:
                    for token in jieba.cut(line, cut_all=False):
                        # BUG FIX: the original tested `not in raw_words`
                        # (always False for a token taken from raw_words),
                        # so nothing was ever kept. Filter stop words instead.
                        if token not in stop_words:
                            raw_word_list.append(token)
    return raw_word_list
# Build the token list from the corpus and report how many tokens we got.
words = read_data()
print(f"Data size: {len(words)}")