# Load the data set stored under 'Mnist_dataset' (one_hot=True presumably requests one-hot labels).
# NOTE(review): TensorFlow's tutorial helper is named `input_data.read_data_sets`; confirm that
# `read_sets` really exists on the `input_data` module in use.
mnist=input_data.read_sets('Mnist_dataset',one_hot=True)
#载入数据集
mnist=input_data.read_sets('
什么是nlp?
NLP是英文Natural Language Processing的缩写,中文翻译就是自然语言处理。它是一门交叉性学科,包含以下内容:
1、计算机科学
2、人工智能
3、语言学
什么是自然语言
·语言是人类交际的工具,是人类思维的载体
·人造语言:编程语言,包括c++,basic等
·自然语言:
·形式:口语、书面语、手语
·语种:汉语,英语、日语、法语......
语言学是研究语言规律的科学
语言的构成:
语言:词汇和语法
词汇:词和熟语
词:词素
语法:词法和句法
词法:构形法和构词法
句法:词组构造法和造句法
自然语言的特点:
·自然语言充满歧义,很难完全消解
·句法结构歧义
·咬死了猎人的狗
三个大学老师
·词义歧义
他说:“她这个人真有意思”。她说:“他这个人怪有意思的”。于是人们以为他们有了那种意思,并让他向她意思意思。他火了:“我根本没有那个意思”!她也生气了:“你们这么说是什么意思”? 事后有人说,“真有意思”。也有人说:“真没意思”。
import pickle
# Define the symbol table
def token_lookup():
    """Return the mapping from punctuation symbols to single-letter tokens.

    The symbols and tokens are paired by position, so both are kept in
    ordered lists. (Set literals have no defined iteration order, which would
    make the pairing arbitrary and different from run to run.)

    Returns:
        dict: symbol (str) -> token (str), one entry per symbol.
    """
    # NOTE(review): the original listed "," twice, leaving 12 symbols for 11
    # tokens. The duplicate is dropped here so every symbol lines up with its
    # token; confirm the second comma was not meant to be another character.
    symbols = ["。", ",", "“", "”", ";", "!", "?", "(", ")", "-", "\n"]
    tokens = ["P", "C", "Q", "T", "S", "E", "M", "I", "O", "D", "R"]
    return dict(zip(symbols, tokens))
# Save the preprocessed data to the binary file
def save_data(token,vocab_to_int,int_to_vocab):
    """Pickle ``(token, vocab_to_int, int_to_vocab)`` as one tuple.

    Args:
        token: first element of the stored tuple.
        vocab_to_int: second element of the stored tuple.
        int_to_vocab: third element of the stored tuple.

    Raises:
        OSError: if the target file cannot be opened for writing.
    """
    # pickle.dump needs a writable binary file object, not a path string.
    # The backslash is escaped so the path value is exactly what the original
    # literal produced ("\p" is not a valid escape sequence).
    with open("data\\preprocess.p", "wb") as out_file:
        pickle.dump((token, vocab_to_int, int_to_vocab), out_file)
# Load the saved preprocessing data back into memory
def load_data():
    """Unpickle and return the object stored in the preprocessing file.

    Returns:
        Whatever object was pickled into the file.

    Raises:
        OSError: if the file cannot be opened for reading.
    """
    # SECURITY: pickle.load can execute arbitrary code; only load files
    # produced by a trusted source.
    # The context manager closes the handle, which the original left open.
    with open("data\\preprocess.p", mode="rb") as in_file:
        return pickle.load(in_file)
# Save the model parameters to a binary file
def save_parameter(params):
    """Pickle ``params`` into the parameter file.

    Args:
        params: the object to store.

    Raises:
        OSError: if the target file cannot be opened for writing.
    """
    # The context manager flushes and closes the file deterministically.
    # The original left the handle open. The backslash is escaped so the
    # path value is unchanged ("\p" is not a valid escape sequence).
    with open("data\\params.p", "wb") as out_file:
        pickle.dump(params, out_file)
# Load the model parameters into memory
def load_parameter():
    """Unpickle and return the object stored in the parameter file.

    Returns:
        Whatever object was pickled into the file.

    Raises:
        OSError: if the file cannot be opened for reading.
    """
    # SECURITY: pickle.load can execute arbitrary code; only load files
    # produced by a trusted source.
    # The context manager closes the handle, which the original left open.
    with open("data\\params.p", mode="rb") as in_file:
        return pickle.load(in_file)
# 1. Extract the stop words and the training text from files
def read_data():
    """Read the corpus, segment it with jieba and drop stop words.

    Stop words are read one per line from ``data/stop_words.txt``. Every
    ``.txt`` file found under ``data/materials`` is read line by line; line
    breaks and spaces are removed, the line is segmented with
    ``jieba.cut(..., cut_all=False)`` and every segment that is not a stop
    word is kept.

    Returns:
        list: the remaining segments, in reading order.
    """
    # Read the stop words. Only the trailing newline is stripped, so a final
    # line without one keeps its last character.
    with open("data/stop_words.txt", "r", encoding="utf-8") as fStopWords:
        stop_words = {line.rstrip("\n") for line in fStopWords}
    print("停用词读取完毕,共{n}个词".format(n=len(stop_words)))

    # Collect every .txt file below the materials folder.
    s_folder_path = "data/materials"
    ls_files = []
    for root, dirs, files in os.walk(s_folder_path):
        for file in files:
            if file.endswith(".txt"):
                ls_files.append(os.path.join(root, file))

    # Read the text, segment it and remove the stop words.
    raw_word_list = []
    for item in ls_files:
        with open(item, "r", encoding="utf-8") as f:
            # Iterating the file always advances to the next line. The
            # original assigned the next line to a misspelled name and so
            # never left the first non-empty line.
            for line in f:
                line = line.replace("\n", "").replace(" ", "")
                # Skip lines that are empty after cleaning.
                if not line:
                    continue
                # Filter against stop_words. The original compared each
                # segment with the list it came from, which discarded
                # every segment.
                raw_word_list.extend(
                    _item
                    for _item in jieba.cut(line, cut_all=False)
                    if _item not in stop_words
                )
    return raw_word_list
# Load the segmented corpus into the module-level list `words` and report its length.
words = read_data()
print("Data size:",len(words))
# 5. Train the model
# Number of training steps
num_steps = 100001
with tf.compat.v1.Session(graph=graph) as session:
    init.run()
    average_loss = 0
    # Iterate over num_steps. The original looped over num_skips, which
    # would have run only that many steps.
    for step in range(num_steps):
        batch_inputs, batch_labels = generate_batch(batch_size, num_skips, skip_window)
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)
        average_loss += loss_val
        if step % 2000 == 0:
            if step > 0:
                # Average over the last 2000 steps.
                average_loss /= 2000
            print("average loss at step:", step, ":", average_loss)
            average_loss = 0
        if step % 10000 == 0:
            sim = similary.eval()
            top_k = 8
            # Report the neighbours of every validation word, one row of
            # `sim` per word. The original indexed reverse_dictionary with
            # the whole list and used an undefined row index.
            for i in range(len(valid_examples)):
                current_word = reverse_dictionary[valid_examples[i]]
                # Position 0 of the sorted row is the word itself, so skip it.
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                log_str = "Nearest to %s" % current_word
                for k in range(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log_str = "%s %s," % (log_str, close_word)
                print(log_str)
    # Evaluate while the session is still open.
    final_embeddings = normnalized_embeddings.eval()
# 6. Write the vectors
with open('output/word2vect.text', "w", encoding="utf-8") as fw2v:
    # Only ids that have a word in reverse_dictionary can be written; the
    # embedding matrix may have more rows than the corpus has distinct words.
    written_rows = min(final_embeddings.shape[0], len(reverse_dictionary))
    fw2v.write(str(written_rows) + " " + str(embedding_size) + "\n")
    for i in range(written_rows):
        sword = reverse_dictionary[i]
        svector = "".join(" " + str(value) for value in final_embeddings[i])
        # file.write takes a single string.
        fw2v.write(sword + svector + "\n")
# 4. Build the model
batch_size = 128      # number of (input, label) rows fed per training step
embedding_size = 100  # width of each embedding vector
skip_window = 1       # passed to generate_batch as the window size
num_skips = 2         # passed to generate_batch as the samples per center word
valid_window = 100    # not referenced in the visible code
num_sampled = 64      # forwarded to nce_loss as num_sampled
# Validation set
valid_word = ["说","实力","害怕","少林寺"]
# Derived from the list so the two can never disagree.
valid_size = len(valid_word)
valid_examples = [dictionary[li] for li in valid_word]
graph = tf.Graph()
with graph.as_default():
    # Input data
    train_inputs = tf.compat.v1.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.compat.v1.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
    # Embedding matrix
    embeddings = tf.Variable(tf.random.uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    # Select the rows of `embeddings` indexed by train_inputs
    embed = tf.nn.embedding_lookup(embeddings, train_inputs)
    # Weights and biases used by the NCE loss
    nce_weights = tf.Variable(tf.random.truncated_normal([vocabulary_size, embedding_size], stddev=1.0 / math.sqrt(embedding_size)))
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]), dtype=tf.float32)
    # Loss function. The keyword is `labels` (the original misspelled it),
    # and the looked-up embeddings must be passed as `inputs`.
    loss = tf.reduce_mean(
        tf.nn.nce_loss(
            weights=nce_weights,
            biases=nce_biases,
            labels=train_labels,
            inputs=embed,
            num_sampled=num_sampled,
            num_classes=vocabulary_size,
        )
    )
    # Optimizer
    optimizer = tf.compat.v1.train.GradientDescentOptimizer(1.0).minimize(loss)
    # Similarity between the validation words and every word, computed from
    # the learned embeddings. The L2 norm is the square root of the SUM of
    # squares; using the mean left the rows scaled by sqrt(embedding_size).
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
    normnalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(normnalized_embeddings, valid_dataset)
    similary = tf.matmul(valid_embeddings, normnalized_embeddings, transpose_b=True)
    init = tf.compat.v1.global_variables_initializer()
# Encode the corpus: build_dataset returns the id sequence, the count table and both lookup dicts.
data,count,dictionary,reverse_dictionary = build_dataset(arg_words=words)
# Delete `words` to save memory
del words
# Read cursor into `data`, advanced by generate_batch.
data_index = 0
# 3. Generate training batches for the skip-gram model
def generate_batch(arg_batch_size,arg_num_skips,arg_ski_windows):
    """Build one batch of (center word id, context word id) pairs.

    Reads the module-level sequence ``data`` starting at the module-level
    cursor ``data_index`` and advances the cursor, wrapping around at the
    end of ``data``.

    Args:
        arg_batch_size: number of pairs to produce; must be a multiple of
            ``arg_num_skips``.
        arg_num_skips: number of context words sampled per center word; at
            most ``2 * arg_ski_windows``.
        arg_ski_windows: number of positions on each side of the center
            word that may be sampled.

    Returns:
        tuple: ``(l_batch, l_labels)``, int32 arrays of shape
        ``(arg_batch_size,)`` and ``(arg_batch_size, 1)``.

    Raises:
        ValueError: if the arguments violate the constraints above.
    """
    global data_index
    if arg_batch_size % arg_num_skips != 0:
        # Otherwise the trailing rows would be returned uninitialised.
        raise ValueError("arg_batch_size must be a multiple of arg_num_skips")
    if arg_num_skips > 2 * arg_ski_windows:
        # Otherwise the sampling loop below could never find a free position.
        raise ValueError("arg_num_skips must not exceed 2 * arg_ski_windows")

    l_batch = np.ndarray(shape=arg_batch_size, dtype=np.int32)
    l_labels = np.ndarray(shape=(arg_batch_size, 1), dtype=np.int32)
    # The window holds the center word plus arg_ski_windows words per side.
    span = 2 * arg_ski_windows + 1
    buffer = collections.deque(maxlen=span)
    for _ in range(span):
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)

    for i in range(arg_batch_size // arg_num_skips):
        # The center word sits at index arg_ski_windows of the window and
        # must never be drawn as its own context.
        target = arg_ski_windows
        targets_to_avoid = [arg_ski_windows]
        for j in range(arg_num_skips):
            while target in targets_to_avoid:
                target = random.randint(0, span - 1)
            targets_to_avoid.append(target)
            # Input and label share one row index. The original indexed the
            # label row with arg_ski_windows instead of arg_num_skips.
            row = i * arg_num_skips + j
            l_batch[row] = buffer[arg_ski_windows]
            l_labels[row, 0] = buffer[target]
        # Slide the window one word forward (the deque drops the oldest).
        buffer.append(data[data_index])
        data_index = (data_index + 1) % len(data)
    return l_batch, l_labels
# Show a small example batch as "input id, input word -> label id, label word"
batch,lables = generate_batch(arg_batch_size = 8, arg_num_skips = 2, arg_ski_windows = 1)
for i in range(8):
    input_id = batch[i]
    label_id = lables[i,0]
    print(input_id, reverse_dictionary[input_id], "->", label_id, reverse_dictionary[label_id])
# 2. Build the vocabulary; words outside it are replaced by UNK
vocabulary_size = 100000
def build_dataset(arg_words):
    """Encode a word list as integer ids using the most frequent words.

    The vocabulary is "UNK" (id 0) followed by the ``vocabulary_size - 1``
    most common words of ``arg_words`` in descending frequency. Words
    outside the vocabulary are encoded as 0.

    Args:
        arg_words: iterable of words (read twice, so pass a list).

    Returns:
        tuple: ``(l_data, l_count, l_dictionary, l_reverse_dictionary)``:
            l_data: list of ids, one per input word;
            l_count: ``["UNK", n_unknown]`` followed by ``(word, count)`` pairs;
            l_dictionary: word -> id;
            l_reverse_dictionary: id -> word.
    """
    # Word counts. The UNK total is filled in once the corpus is encoded.
    l_count = [["UNK", -1]]
    l_count.extend(collections.Counter(arg_words).most_common(vocabulary_size - 1))
    print("l_count:", len(l_count))

    # Ids are assigned in l_count order, so UNK gets 0.
    l_dictionary = dict()
    for word, _ in l_count:
        l_dictionary[word] = len(l_dictionary)

    # Convert the word list into the id list.
    l_data = []
    unk_count = 0
    for word in arg_words:
        if word in l_dictionary:
            index = l_dictionary[word]
        else:
            index = 0
            unk_count += 1
        l_data.append(index)
    l_count[0][1] = unk_count

    # Inverted dictionary: keys are ids, values are the words.
    l_reverse_dictionary = {index: word for word, index in l_dictionary.items()}
    return l_data, l_count, l_dictionary, l_reverse_dictionary
# Delete `words` to save memory
del words
# Read cursor into the encoded corpus, starting at the first position.
data_index = 0
记忆的概念
开发准备(二)
开发准备(一)
Reduction_indices=0 :按行压缩
Reduction_indices=1 :按列压缩
图片好形象
豁然开朗。
导入 seq2seq,用它来计算损失函数loss
为词向量创建嵌入层,提升效率
LSTM模型构建的一些参数
RNN与LSTM
RNN对之前的数据有记忆,但不可能长期保持这些记忆,否则会带来数据分析和保存的问题。
LSTM是RNN的一种延伸,选择性记忆。使用Dropout把最该记忆的学习下来并保存
为什么有了BP、CNN,还需要RNN。
传统的,输入、输出独立。
RNN引入了“记忆”
Word2Vec,优化器,算法,Adam
Word2Vec,优化器,梯度下降,SGD
Word2Vec 损失函数,引入了NCE、NEG,解决了什么
Word2Vec, 损失函数Softmax,解决了什么问题,缺点
一、开发环境准备
1、下载Anaconda,国内镜像地址:https://mirrors.tuna.tsinghua.edu.cn/anaconda/archive/
2、进入Anaconda power shell,加入清华大学镜像源。
#conda config --add channels https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/free/
#conda config --set show_channel_urls yes
#conda create -n tensorflow python=3.7.3
3、安装使用virtualenv环境
#pip install virtualenv
#virtualenv venv_tensorflow
#cd venv_tensorflow
#cd Scripts
4、激活虚拟环境
#.\activate
5、激活虚拟环境后安装tensorflow
#pip install -i https://pypi.tuna.tsinghua.edu.cn/simple/ --upgrade tensorflow
skip-gram原理
CBOW原理
word2vec
语言的构成