一,基础应用
from gensim import corpora
#创建数据集
def loadDataSet():
    """Build a toy keyword corpus with four labelled documents.

    Returns:
        tuple: (corpus, classVec) — corpus is a list of four token lists
        (sports, entertainment, education, finance) and classVec holds
        the matching category labels, index-aligned with corpus.
    """
    sports = ['中国足球','国际足球','篮球','NBA','综合体育','奥运','姚明','连败',
              '休斯敦','记者','队友','羽毛球','巨人','网球','高尔夫','棋牌','彩票',
              '欧冠','英超','西甲','意甲','德甲','中超','国足','足协杯','女足','赛车']
    entertainment = ['娱乐','时尚','女人','健康','旅行','公益','星座','音乐','电影','电视剧','综艺','音乐',
                     '明星','视频','杨颖','刘德华','杨幂','鹿晗','轩辕剑','周杰伦','婚纱','曝光','胡歌','西装',
                     '笔挺','风度翩翩','郭碧婷','自曝','喜欢','向佐','真实原因','张艺兴','老艺术家们','同台','辛芷蕾','搭档']
    education = ['高考','考研','中考','外语类考试','自考','公务员','资格考试','等级考试','成考','高校库','高校热度榜','分数线','找专家',
                 '专业库','专业热度榜','专业测评','同分考生取向','新闻','时评','报考','院校','专业分','数线','状元','专家','备考','作文',
                 '家长必读','心理','营养','资讯','招生','保研','复试','分数线','调剂','四六级','雅思','托福','GRE','GMAT','SAT']
    finance = ['股票','新股','港股','美股','基金','期货','外汇','黄金','债券','理财','银行','保险','专家','信托','科创板','专栏','博客','股市汇','会议','数据',
               '商品行情','外盘期货','商品持仓','现货报价','各国国债','期指行情','期指持仓','期指研究','行业指数','权重股票','期货名人',
               '专家坐堂','高清解盘','期货入门','各国国债','期市要闻','期货研究','机构评论','品种大全','现货黄金','现货白银','现货铂金','现货钯金','国债收益率']
    corpus = [sports, entertainment, education, finance]
    classVec = ['体育','娱乐','教育','财经']
    return corpus, classVec
#构建语料词典:去重操作
def gensim_Corpus(corpus=None):
    """Build a gensim dictionary from *corpus*, drop tokens that occur in
    only one document, and persist the result in two formats.

    Side effects: writes ../dataSet/files/mycorpus.dict (binary) and
    ../dataSet/files/mycorpus.text (plain text), then reloads each file
    as a round-trip sanity check.
    """
    # Map every unique token to an integer id.
    dictionary = corpora.Dictionary(corpus)
    print ('语料词典: \n',dictionary)
    # Collect ids of tokens whose document frequency is exactly 1
    # (hapax legomena — they carry no discriminative value here).
    once_ids = [tid for tid, df in dictionary.dfs.items() if df == 1]
    print (once_ids)
    # Drop those tokens from the dictionary.
    dictionary.filter_tokens(once_ids)
    print(dictionary)
    # Re-pack the remaining ids into a gap-free, contiguous range.
    dictionary.compactify()
    # Persist as gensim's native binary format and reload.
    dict_path = r'../dataSet/files/mycorpus.dict'
    dictionary.save(dict_path)
    mydict = corpora.Dictionary.load(dict_path)
    # Persist as human-readable text and reload.
    text_path = r'../dataSet/files/mycorpus.text'
    dictionary.save_as_text(text_path)
    mydict = corpora.Dictionary.load_from_text(text_path)
if __name__ == '__main__':
    # Demo: build the toy corpus, then construct and persist its dictionary.
    docs, labels = loadDataSet()
    gensim_Corpus(docs)
二,生成TFIDF模型
from gensim import corpora,models
#创建数据集
def loadDataSet():
    """Return the toy keyword corpus and its category labels.

    Returns:
        tuple: (corpus, classVec) — four token lists (sports,
        entertainment, education, finance) plus the aligned labels.
    """
    tiyu = ['中国足球','国际足球','篮球','NBA','综合体育','奥运','姚明','连败',
            '休斯敦','记者','队友','羽毛球','巨人','网球','高尔夫','棋牌','彩票',
            '欧冠','英超','西甲','意甲','德甲','中超','国足','足协杯','女足','赛车']
    yule = ['娱乐','时尚','女人','健康','旅行','公益','星座','音乐','电影','电视剧','综艺','音乐',
            '明星','视频','杨颖','刘德华','杨幂','鹿晗','轩辕剑','周杰伦','婚纱','曝光','胡歌','西装',
            '笔挺','风度翩翩','郭碧婷','自曝','喜欢','向佐','真实原因','张艺兴','老艺术家们','同台','辛芷蕾','搭档']
    jiaoyu = ['高考','考研','中考','外语类考试','自考','公务员','资格考试','等级考试','成考','高校库','高校热度榜','分数线','找专家',
              '专业库','专业热度榜','专业测评','同分考生取向','新闻','时评','报考','院校','专业分','数线','状元','专家','备考','作文',
              '家长必读','心理','营养','资讯','招生','保研','复试','分数线','调剂','四六级','雅思','托福','GRE','GMAT','SAT']
    caijing = ['股票','新股','港股','美股','基金','期货','外汇','黄金','债券','理财','银行','保险','专家','信托','科创板','专栏','博客','股市汇','会议','数据',
               '商品行情','外盘期货','商品持仓','现货报价','各国国债','期指行情','期指持仓','期指研究','行业指数','权重股票','期货名人',
               '专家坐堂','高清解盘','期货入门','各国国债','期市要闻','期货研究','机构评论','品种大全','现货黄金','现货白银','现货铂金','现货钯金','国债收益率']
    return [tiyu, yule, jiaoyu, caijing], ['体育','娱乐','教育','财经']
#统计特征词频
def gensim_Corpus(corpus=None):
    """Print the document frequency of every token in *corpus*.

    Builds a gensim dictionary and, for each token id, prints the token
    text and the number of documents it appears in.
    """
    dictionary = corpora.Dictionary(corpus)
    # dfs maps token id -> document frequency.
    for token_id, doc_freq in dictionary.dfs.items():
        print(dictionary[token_id], ':', doc_freq, '\n')
#计算TF-IDF
def gensim_tfidf(corpus = None, classVec = ' '):
    """Vectorize *corpus* with bag-of-words, fit a TF-IDF model, and
    group each document's TF-IDF vector under its category label.

    Args:
        corpus: list of token lists, one per document.
        classVec: sequence of category labels, index-aligned with corpus.

    Returns:
        dict: label -> list of TF-IDF vectors (each a list of
        (token_id, weight) pairs) for that category's documents.
    """
    dictionary = corpora.Dictionary(corpus)
    # Bag-of-words: each doc becomes [(token_id, count), ...]
    doc_bag_corpus = [dictionary.doc2bow(doc_cut) for doc_cut in corpus]
    print ('词袋模型实现文本向量化: \n',doc_bag_corpus)
    # Fit the TF-IDF weighting model from the dictionary's statistics.
    tfidf_model = models.TfidfModel(dictionary = dictionary)
    print ('生成tfidf模型: \n',tfidf_model)
    # BUG FIX: the original indexed classVec[i] with an undefined `i`
    # (NameError); enumerate pairs each document with its label.
    corpus_tfidf = {}
    for i, doc_bow in enumerate(doc_bag_corpus):
        file_tfidf = tfidf_model[doc_bow]  # apply TF-IDF weights to the bag
        corpus_tfidf.setdefault(classVec[i], []).append(file_tfidf)
    return corpus_tfidf
if __name__ == '__main__':
    # Demo: show raw document frequencies, then the TF-IDF grouping.
    docs, labels = loadDataSet()
    gensim_Corpus(docs)
    gensim_tfidf(docs, labels)
三,生成LSA模型
from gensim import corpora,models
from gensim.models.doc2vec import Doc2Vec,TaggedDocument
# Pickle: serialize/deserialize the model for on-disk storage (typo fixed: 序列化, not 序列号)
import pickle as pkl
#创建数据集
def loadDataSet():
    """Assemble the toy keyword corpus used by the LSA demo.

    Returns:
        tuple: (corpus, classVec) — a list of four token lists and the
        matching category labels ['体育','娱乐','教育','财经'].
    """
    corpus = []
    corpus.append(['中国足球','国际足球','篮球','NBA','综合体育','奥运','姚明','连败',
                   '休斯敦','记者','队友','羽毛球','巨人','网球','高尔夫','棋牌','彩票',
                   '欧冠','英超','西甲','意甲','德甲','中超','国足','足协杯','女足','赛车'])
    corpus.append(['娱乐','时尚','女人','健康','旅行','公益','星座','音乐','电影','电视剧','综艺','音乐',
                   '明星','视频','杨颖','刘德华','杨幂','鹿晗','轩辕剑','周杰伦','婚纱','曝光','胡歌','西装',
                   '笔挺','风度翩翩','郭碧婷','自曝','喜欢','向佐','真实原因','张艺兴','老艺术家们','同台','辛芷蕾','搭档'])
    corpus.append(['高考','考研','中考','外语类考试','自考','公务员','资格考试','等级考试','成考','高校库','高校热度榜','分数线','找专家',
                   '专业库','专业热度榜','专业测评','同分考生取向','新闻','时评','报考','院校','专业分','数线','状元','专家','备考','作文',
                   '家长必读','心理','营养','资讯','招生','保研','复试','分数线','调剂','四六级','雅思','托福','GRE','GMAT','SAT'])
    corpus.append(['股票','新股','港股','美股','基金','期货','外汇','黄金','债券','理财','银行','保险','专家','信托','科创板','专栏','博客','股市汇','会议','数据',
                   '商品行情','外盘期货','商品持仓','现货报价','各国国债','期指行情','期指持仓','期指研究','行业指数','权重股票','期货名人',
                   '专家坐堂','高清解盘','期货入门','各国国债','期市要闻','期货研究','机构评论','品种大全','现货黄金','现货白银','现货铂金','现货钯金','国债收益率'])
    classVec = ['体育','娱乐','教育','财经']
    return corpus, classVec
def gensim_lsa(corpus = None):
    """Build a TF-IDF representation of *corpus*, train an LSA/LSI model
    on it, project the documents into topic space, and pickle the model.

    Args:
        corpus: list of token lists, one per document.

    Side effects: writes the trained LsiModel to ../lsi_model.pkl.
    """
    dictionary = corpora.Dictionary(corpus)
    # 1. Convert each doc to bag-of-words, then to a TF-IDF vector.
    doc_bow_corpus = [dictionary.doc2bow(doc_cut) for doc_cut in corpus]
    tfidf_model = models.TfidfModel(dictionary = dictionary)
    tfidf_corpus = [tfidf_model[doc_bow] for doc_bow in doc_bow_corpus]
    print(tfidf_corpus)
    # 2. Train the LSA/LSI model (truncated SVD over the TF-IDF matrix).
    lsi_model = models.LsiModel(corpus = tfidf_corpus, id2word = dictionary, num_topics = 20)
    print ("LSA生成的模型: \n",lsi_model)
    # Project every document into the (up to) 20-dimensional topic space.
    lsi_corpus = [lsi_model[tfidf_doc] for tfidf_doc in tfidf_corpus]
    print (lsi_corpus)
    # Persist the model. FIX: use a context manager so the file handle is
    # closed even if pkl.dump raises (original relied on a bare close()).
    savepath = r'../lsi_model.pkl'
    with open(savepath, 'wb') as lsi_file:
        pkl.dump(lsi_model, lsi_file)
if __name__ == '__main__':
    # Demo: train and persist the LSA model on the toy corpus.
    docs, labels = loadDataSet()
    gensim_lsa(docs)
类似的还有LDA模型,RP模型,HDP模型,和上面的用法差不多
参考教程:
代码来源: https://www.imooc.com/video/19764
LSA教程: https://www.jianshu.com/p/9fe0a7004560