# -*- coding: utf-8 -*-
import jieba, os
import codecs
from gensim import corpora, models, similarities
from pprint import pprint
from collections import defaultdict
import sys
# Python 2 only (`reload` is a builtin there): reload the sys module and set
# the interpreter-wide default string encoding to UTF-8.
reload(sys)
sys.setdefaultencoding('utf-8')
def load_data(data_dir='D:/dev_data/sogou'):
    """Read every file below *data_dir* (recursively) into memory.

    Args:
        data_dir: Root directory of the corpus. Defaults to the location the
            script was originally hard-wired to.

    Returns:
        A list with one decoded string per file, in ``os.walk`` order. Bytes
        that are not valid UTF-8 are dropped instead of raising. A missing
        or empty directory yields an empty list.
    """
    documents = []
    for root, _dirs, files in os.walk(data_dir):
        for name in files:
            path = os.path.join(root, name)
            # The context manager closes each handle as soon as it is read;
            # previously handles stayed open until garbage collection.
            with codecs.open(path, 'r', 'utf-8', 'ignore') as handle:
                documents.append(handle.read())
    return documents
def preprocess(documents):
    """Tokenise *documents* and build the gensim dictionary and BoW corpus.

    Each document is segmented with jieba in full mode, stop words are
    removed, and tokens that occur at most twice across the whole collection
    are dropped. A ``corpora.Dictionary`` and a bag-of-words corpus are then
    built from the remaining tokens.

    Side effects: reads ``tmp/stopword.txt``, writes ``tmp/sogou.dict`` and
    ``tmp/sogou.mm``, and prints the dictionary.

    Args:
        documents: Iterable of raw document strings.

    Returns:
        A ``(corpus, dictionary)`` tuple, where ``corpus`` is a list holding
        one ``dictionary.doc2bow`` result per document.
    """
    with codecs.open('tmp/stopword.txt', 'r', encoding='utf8') as stop_file:
        stoplist = set(line.strip() for line in stop_file)

    # Segment and remove stop words. Tabs and newlines are stripped first;
    # the previous code replaced the plain letters 't' and 'n' (the
    # backslashes had been lost), which mangled every Latin-script word.
    texts = [
        [word
         for word in jieba.cut(document.replace('\t', '').replace('\n', ''),
                               cut_all=True)
         if word not in stoplist]
        for document in documents
    ]

    # Drop low-frequency tokens (corpus-wide count of two or fewer).
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1
    texts = [[token for token in text if frequency[token] > 2]
             for text in texts]

    dictionary = corpora.Dictionary(texts)
    dictionary.save('tmp/sogou.dict')
    print(dictionary)

    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize('tmp/sogou.mm', corpus)
    return corpus, dictionary
def train_lda(corpus, dictionary):
    """Fit a nine-topic LDA model on the TF-IDF view of *corpus* and save it.

    Args:
        corpus: Bag-of-words corpus used both to fit the TF-IDF model and,
            after weighting, to train LDA.
        dictionary: Mapping passed to LDA as ``id2word``.

    The trained model is written to ``tmp/sogou_lda.model``; nothing is
    returned.
    """
    # Re-weight the raw counts with TF-IDF before training.
    weighted_corpus = models.TfidfModel(corpus)[corpus]
    # Train the model.
    topic_model = models.LdaModel(
        weighted_corpus,
        id2word=dictionary,
        num_topics=9,
    )
    # Persist the model so it can be loaded later.
    topic_model.save('tmp/sogou_lda.model')
def load_lda():
    """Load the saved LDA model and print its first four topics."""
    topic_model = models.ldamodel.LdaModel.load('tmp/sogou_lda.model')
    for topic_id in (0, 1, 2, 3):
        print(topic_model.print_topic(topic_id))
def test_lda():
    """Infer and print the topic mixture of a hard-coded unseen document.

    Loads the saved model and dictionary, tokenises the sample text with
    jieba (full mode, stop words removed), prints topics 0-8, then the
    document's topic distribution, then each assigned topic ordered by
    descending score together with its top three words.
    """
    lda_model = models.ldamodel.LdaModel.load('tmp/sogou_lda.model')
    dictionary = corpora.Dictionary.load('tmp/sogou.dict')
    # Strip the line ending from every stop word. The raw readlines() entries
    # kept their trailing newline, so they never matched a token and nothing
    # was filtered.
    with codecs.open('tmp/stopword.txt', 'r', encoding='utf8') as stop_file:
        stoplist = set(line.strip() for line in stop_file)
    unseen_document = """
在本赛季的这三场比赛中,骑士三战皆胜。值得一提的是,全场比赛骑士三分线外46投25中,打破NBA常规赛单场比赛单支球队三分球命中数纪录。
"""
    # Collapse all whitespace before segmentation.
    d = "".join(unseen_document.split())
    print("The unseen document is composed by the following text: %s" % unseen_document)
    print("")
    text = [word for word in jieba.cut(d, cut_all=True) if word not in stoplist]
    bow_vector = dictionary.doc2bow(text)
    for i in range(0, 9):
        print(lda_model.print_topic(i))
    # NOTE(review): train_lda() fits the model on TF-IDF weighted vectors,
    # while this query uses raw counts - confirm that is intended.
    # Run inference once so the printed and the ranked distribution agree.
    doc_topics = lda_model[bow_vector]
    print(doc_topics)
    for index, score in sorted(doc_topics, key=lambda tup: -1 * tup[1]):
        print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 3)))
def print_lda():
    """Print the top ten words of topics 0-8, then topic 0 once more."""
    topic_model = models.ldamodel.LdaModel.load('tmp/sogou_lda.model')

    def describe(topic_id):
        # One output line: the topic number, a space, then its top ten words.
        return "%s %s" % (topic_id, topic_model.print_topic(topic_id, 10))

    for topic_id in range(9):
        print(describe(topic_id))
    print(describe(0))
def train():
    """Run the full pipeline: load the corpus, preprocess it, train LDA."""
    bow_corpus, vocabulary = preprocess(load_data())
    train_lda(bow_corpus, vocabulary)
def test():
    """Load the saved dictionary and print entry 10, its size, and itself."""
    vocabulary = corpora.Dictionary.load('tmp/sogou.dict')
    for item in (vocabulary[10], len(vocabulary), vocabulary):
        print(item)
def test1():
    """Print the topic distribution of a hard-coded sample news snippet.

    Loads the saved model and dictionary, segments the sample with jieba,
    converts it to a bag-of-words vector and prints the inferred
    distribution, followed by one line per assigned topic with its weight.
    """
    lda = models.ldamodel.LdaModel.load('tmp/sogou_lda.model')
    # `dictionary` was previously an undefined name here (NameError); load
    # the dictionary file that is saved alongside the model.
    dictionary = corpora.Dictionary.load('tmp/sogou.dict')
    test_doc = """
中华网总经理陈晓薇表示,该公司将在今年首季推出生活频道及重建英语频道,并着手发展与其他国家及知名企业合作的资讯网页,此外在5月份,中华网推出针对内地专业人士的娱乐内容,作为将来3G手机内容的供应来源。(英宁)
"""
    tokens = list(jieba.cut(test_doc))  # segment the new document
    doc_bow = dictionary.doc2bow(tokens)  # convert it to bag-of-words
    doc_lda = lda[doc_bow]  # topic distribution of the new document
    # Print the topic distribution of the new document.
    print(doc_lda)
    for topic_id, weight in doc_lda:
        # Tab between topic and weight, blank line after each entry (the
        # backslashes of "\t" and "\n" had been lost).
        print("%s\t%f\n" % (lda.print_topic(topic_id), weight))
# Script entry point: training is currently disabled (commented out); only
# the topics of the previously saved model are printed.
#train()
print_lda()
# 服务器托管,北京服务器托管,服务器租用 http://www.fwqtg.net
# 机房租用,北京机房租用,IDC机房托管, http://www.fwqtg.net
# 数学建模中,编程手们常常会被要求对大量的数据进行批量的预测操作,并将结果写入某个文件中。Excel 的批量导出数据,用循环就可以简单实现,例如:import pandas as pd; for i in list(df['har_feacture']): result = …