# 2. Build the vocabulary; rare words are replaced by "UNK".
# Maximum number of vocabulary entries, counting the reserved "UNK" slot
# (build_dataset keeps the `vocabulary_size - 1` most frequent words).
vocabulary_size = 100_000
def build_dataset(arg_words, arg_vocabulary_size=None):
    """Encode a sequence of word tokens as integer ids over a bounded vocabulary.

    The ``arg_vocabulary_size - 1`` most frequent words each receive their own
    id, assigned in descending frequency order starting at 1. Id 0 is reserved
    for ``"UNK"``, which stands in for every word outside that set.

    Args:
        arg_words: Sequence of hashable word tokens. It is traversed twice
            (once to count, once to encode), so it must be re-iterable.
        arg_vocabulary_size: Maximum number of vocabulary entries including
            the ``"UNK"`` slot. When ``None``, the module-level
            ``vocabulary_size`` is used.

    Returns:
        A tuple ``(l_data, l_count, l_dictionary, l_reverse_dictionary)``:
            l_data: list of ids, one per element of ``arg_words``.
            l_count: ``[["UNK", unk_count], (word, frequency), ...]``.
            l_dictionary: mapping of word -> id.
            l_reverse_dictionary: mapping of id -> word.
    """
    if arg_vocabulary_size is None:
        # Fall back to the module-level setting so existing callers keep
        # their behaviour.
        arg_vocabulary_size = vocabulary_size

    # Vocabulary encoding. The UNK entry is a mutable list because its count
    # is only known after the encoding pass below; -1 is a placeholder.
    l_count = [["UNK", -1]]
    l_count.extend(
        collections.Counter(arg_words).most_common(arg_vocabulary_size - 1)
    )
    print("l_count:", len(l_count))

    l_dictionary = dict()
    for word, _ in l_count:
        # Keep the first id assigned to a word. Without this guard a literal
        # "UNK" token in the input would overwrite id 0 and make the next
        # word share an id with it.
        if word not in l_dictionary:
            l_dictionary[word] = len(l_dictionary)

    # Use the encoding to turn the string list `arg_words` into a list of ids.
    l_data = []
    unk_count = 0
    for word in arg_words:
        index = l_dictionary.get(word)
        if index is None:
            # Out-of-vocabulary word: map it to the UNK id.
            index = 0
            unk_count += 1
        l_data.append(index)
    l_count[0][1] = unk_count

    # Inverted dictionary: keys are ids, values are the words themselves.
    l_reverse_dictionary = {index: word for word, index in l_dictionary.items()}
    return l_data, l_count, l_dictionary, l_reverse_dictionary
# Delete `words` to save memory.
# NOTE(review): `words` is defined outside this view; make sure nothing after
# this point still needs it.
del words
# NOTE(review): presumably the read position into the encoded data for a later
# batch generator; it is not used in the visible code, so confirm.
data_index = 0