import sys

import jieba
import numpy as np
import pandas as pd
from gensim.models.word2vec import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# sklearn.externals.joblib was removed in scikit-learn 0.23; prefer the
# standalone joblib package and fall back only on very old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
# Step 1: load the data files and tokenize the reviews.
def loadfile():
    """Load positive/negative review spreadsheets, tokenize each review
    with jieba, persist the train/test labels, and return the token lists.

    Returns:
        (x_train, x_test): arrays of token lists for the 80/20 split.
        Side effect: writes ./svm_data/y_train.npy and ./svm_data/y_test.npy.
    """
    # NOTE: the original passed `index=None`, which is not a valid
    # read_excel keyword (TypeError on modern pandas); `header=None`
    # alone is what was intended.
    neg = pd.read_excel('./data/neg.xls', header=None)
    pos = pd.read_excel('./data/pos.xls', header=None)
    cw = lambda x: list(jieba.cut(x))
    pos['words'] = pos[0].apply(cw)
    neg['words'] = neg[0].apply(cw)
    # Label convention: 1 = positive review, 0 = negative review.
    y = np.concatenate((np.ones(len(pos)), np.zeros(len(neg))))
    x_train, x_test, y_train, y_test = train_test_split(
        np.concatenate((pos['words'], neg['words'])), y,
        test_size=0.2, random_state=666)
    np.save('./svm_data/y_train.npy', y_train)
    np.save('./svm_data/y_test.npy', y_test)
    return x_train, x_test
# Step 2: represent a comment as the mean of its words' vectors.
# (`sentence` is one tokenized comment; out-of-vocabulary words are skipped.)
def buildWordVector(sentence, size, w2v_model):
    """Average the word vectors of every in-vocabulary word of `sentence`.

    Args:
        sentence: iterable of tokens (one tokenized comment).
        size: dimensionality of the word vectors.
        w2v_model: gensim Word2Vec model, KeyedVectors, or any mapping
            word -> vector that raises KeyError on a missing word.

    Returns:
        np.ndarray of shape (1, size): the mean vector, or all zeros when
        no word of the sentence is in the vocabulary.
    """
    # gensim >= 4 removed dict-style lookup on the model object itself;
    # the vectors live on `model.wv`. Fall back to the object itself so
    # plain mappings and old gensim models keep working.
    vectors = getattr(w2v_model, 'wv', w2v_model)
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in sentence:
        try:
            vec += vectors[word].reshape((1, size))
            count += 1.
        except KeyError:
            # Out-of-vocabulary word: ignore it.
            continue
    if count != 0:
        vec /= count  # mean over in-vocabulary words only
    return vec
# Step 3: train word2vec and build per-comment feature vectors.
def get_train_vecs(x_train, x_test):
    """Train a Word2Vec model on the training comments, convert every
    train/test comment to its mean word vector, and persist the vectors
    and the word2vec model under ./svm_data/.

    Args:
        x_train, x_test: sequences of token lists (from loadfile()).
    """
    n_dim = 100  # word-vector dimensionality
    # gensim 4.0 renamed `size` -> `vector_size`; support both versions.
    try:
        w2v_model = Word2Vec(vector_size=n_dim, min_count=10)
    except TypeError:
        w2v_model = Word2Vec(size=n_dim, min_count=10)
    w2v_model.build_vocab(x_train)
    # `model.iter` was removed in gensim 4.0 in favour of `model.epochs`.
    n_epochs = getattr(w2v_model, 'epochs', None) or getattr(w2v_model, 'iter', 5)
    # Train the word vectors on the training corpus.
    w2v_model.train(x_train, total_examples=w2v_model.corpus_count,
                    epochs=n_epochs)
    # Build and save the training-set comment vectors.
    train_vecs = np.concatenate(
        [buildWordVector(line, n_dim, w2v_model) for line in x_train])
    np.save('svm_data/train_vecs.npy', train_vecs)
    print("Train word_vector shape:", train_vecs.shape)
    # Continue training on the test corpus, as the original pipeline did.
    # NOTE(review): this leaks test data into the word-vector model —
    # consider dropping this step. The original also passed the *training*
    # corpus_count here; the actual test-corpus size is what train() expects.
    w2v_model.train(x_test, total_examples=len(x_test), epochs=n_epochs)
    # Build and save the test-set comment vectors.
    test_vecs = np.concatenate(
        [buildWordVector(line, n_dim, w2v_model) for line in x_test])
    np.save('svm_data/test_vecs.npy', test_vecs)
    print("Test word_vector shape:", test_vecs.shape)
    # Persist the word2vec model for prediction time.
    w2v_model.save('svm_data/w2v_model/w2v_model.pkl')
# Step 4: train the SVM classifier on the comment vectors.
def svm_train(train_vecs, y_train, test_vecs, y_test):
    """Fit an RBF-kernel SVM, save it to disk, and print its accuracy on
    the held-out test vectors.

    Args:
        train_vecs, test_vecs: arrays of shape (n_samples, n_dim).
        y_train, y_test: label arrays (1 = positive, 0 = negative).
    """
    import os
    clf = SVC(kernel='rbf', verbose=True)
    clf.fit(train_vecs, y_train)
    # joblib.dump fails if the target directory is missing — create it.
    os.makedirs('svm_data/svm_model', exist_ok=True)
    joblib.dump(clf, 'svm_data/svm_model/model.pkl')
    print('SVM score:', clf.score(test_vecs, y_test))
# Build the feature vector for a single tokenized sentence at predict time.
def get_predict_vecs(sentence, n_dim=100):
    """Load the saved word2vec model and embed one tokenized sentence.

    Args:
        sentence: list of tokens produced by jieba.
        n_dim: word-vector dimensionality; must match the trained model
            (default 100, the value used by get_train_vecs).

    Returns:
        np.ndarray of shape (1, n_dim): the sentence's mean word vector.
    """
    w2v_model = Word2Vec.load('svm_data/w2v_model/w2v_model.pkl')
    predict_vecs = buildWordVector(sentence, n_dim, w2v_model)
    return predict_vecs
# Step 5: classify one raw comment string.
def svm_predict(comment):
    """Tokenize `comment`, embed it, classify it with the saved SVM,
    and print the verdict.

    Args:
        comment: raw comment string.

    Returns:
        int: 1 for a positive prediction, 0 for negative (previously the
        function only printed; returning the label is backward compatible).
    """
    sentence = jieba.lcut(comment)
    predict_vecs = get_predict_vecs(sentence)
    clf = joblib.load('svm_data/svm_model/model.pkl')
    result = clf.predict(predict_vecs)
    label = int(result[0])
    if label == 1:
        print(comment, '\n AI预测结果: 好评')
    else:
        print(comment, '\n AI预测结果: 差评')
    return label
def get_data():
    """Reload the comment vectors and labels persisted under svm_data/.

    Returns:
        (train_vecs, y_train, test_vecs, y_test) as numpy arrays, in the
        order expected by svm_train().
    """
    paths = ('svm_data/train_vecs.npy',
             'svm_data/y_train.npy',
             'svm_data/test_vecs.npy',
             'svm_data/y_test.npy')
    train_vecs, y_train, test_vecs, y_test = (np.load(p) for p in paths)
    return train_vecs, y_train, test_vecs, y_test
if __name__ == '__main__':
    # --- training pipeline: tokenize, embed, fit and persist the SVM ---
    x_train, x_test = loadfile()
    get_train_vecs(x_train, x_test)
    train_vecs, y_train, test_vecs, y_test = get_data()
    svm_train(train_vecs, y_train, test_vecs, y_test)
    # --- demo: predict the sentiment of a single comment ---
    sample_comment = '牛逼的手机,从3米高的地方摔下去都没坏,质量非常好'
    svm_predict(sample_comment)
# NOTE(review): the line "打开App,阅读手记" was stray webpage text pasted
# into the source; it is not Python and made the file unparseable.