博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
Django 中使用 xlwt 导出生成的 Excel 关键词推荐结果
阅读量:4230 次
发布时间:2019-05-26

本文共 3810 字,大约阅读时间需要 12 分钟。

好多博客都是直接上代码,既不说明使用的是 Python 2 还是 Python 3,也不给出代码中所用模块的 import 方式。我虽受其利,也深受其害,在此引以为戒。本文代码在 Python 3.6 环境下运行通过(其实只要是 Python 3 应该都可以)。
import gensimimport timeimport datetimeimport numpy as npimport xlwtfrom io import BytesIO  # Python2中请使用from StringIO import StringIOfrom django.http import HttpResponse
# Compute the character n-grams of a word
def compute_ngrams(word, min_n, max_n):
    """Return the distinct substrings of ``word`` with length in [min_n, max_n].

    The upper length bound is capped at ``len(word)``.  The result is a list
    built from a set, so it holds no duplicates and its order is unspecified.
    """
    longest = min(len(word), max_n)
    unique_grams = {
        word[start:start + size]
        for size in range(min_n, longest + 1)
        for start in range(len(word) - size + 1)
    }
    return list(unique_grams)
# Look up (or approximate) the embedding vector of a word
def word_vector(word, wv_from_text, min_n=1, max_n=3):
    """Return the vector for ``word``, or the int ``0`` when none can be built.

    If ``word`` is a key of ``wv_from_text`` its stored vector is returned
    as-is.  Otherwise the vectors of the word's n-grams (lengths ``min_n`` to
    ``max_n``) that are present in ``wv_from_text`` are averaged: n-grams
    longer than one character are tried first, and single characters are used
    only when none of the longer ones is found.

    Args:
        word: The word to look up.
        wv_from_text: Vector store supporting ``in``, ``[]`` and a
            ``vector_size`` attribute (presumably a gensim ``KeyedVectors``,
            as loaded by the caller in this file).
        min_n: Shortest n-gram length used for the fallback.
        max_n: Longest n-gram length used for the fallback.

    Returns:
        The stored vector, the averaged n-gram vector, or the int ``0`` when
        no n-gram contributes a non-zero vector.  No exception is raised in
        that case so that callers can skip the word and carry on.
    """
    # Test membership on the vector store itself; ``index2word`` is a plain
    # list, so ``word in index2word`` scans the whole vocabulary every time.
    if word in wv_from_text:
        return wv_from_text[word]

    # Out-of-vocabulary: approximate the word from its n-grams.
    ngrams = compute_ngrams(word, min_n=min_n, max_n=max_n)
    word_vec = np.zeros(wv_from_text.vector_size, dtype=np.float32)
    ngrams_found = 0

    # First pass: only n-grams of two or more characters.
    for ngram in ngrams:
        if len(ngram) > 1 and ngram in wv_from_text:
            word_vec += wv_from_text[ngram]
            ngrams_found += 1

    # Second pass: fall back to single characters; the approximation is
    # much weaker at this point.
    if ngrams_found == 0:
        for ngram in ngrams:
            if len(ngram) == 1 and ngram in wv_from_text:
                word_vec += wv_from_text[ngram]
                ngrams_found += 1

    if word_vec.any():
        # At least one component is non-zero: return the arithmetic mean.
        return word_vec / max(1, ngrams_found)

    # Deliberately report and return a sentinel instead of raising.
    print('all ngrams for word %s absent from model' % word)
    return 0
# Build keyword recommendations and export them as an Excel download
def get_keyword_recommend_result(result_list):
    """Write similar-word recommendations to an in-memory .xls and return it.

    For every keyword in ``result_list`` a vector is obtained through
    ``word_vector``; keywords for which it returns the int ``0`` are skipped.
    Each remaining keyword fills one sheet row: column 0 holds the keyword
    (blue when it is a key of the loaded vectors, red otherwise) and the
    following columns hold up to 15 similar words from ``most_similar``.

    Args:
        result_list: Iterable of keyword strings.

    Returns:
        A Django ``HttpResponse`` with content type
        ``application/vnd.ms-excel`` and an attachment disposition; returning
        it from a view lets the browser download the generated workbook.
    """
    workbook = xlwt.Workbook(encoding='utf-8')
    sheet = workbook.add_sheet('sheet1')
    blue_style = xlwt.easyxf("font:colour_index blue;")  # in-vocabulary keyword
    red_style = xlwt.easyxf("font:colour_index red;")  # out-of-vocabulary keyword

    print("开始载入文件...", flush=True)
    print("Now:", datetime.datetime.now(), flush=True)
    t1 = time.time()
    # 'ChineseEmbedding.bin' is the saved (binary) form of the Tencent AILab
    # Chinese embedding text file; loading it this way is faster.  It can be
    # produced once with KeyedVectors.load_word2vec_format(<txt>, binary=False)
    # followed by init_sims(replace=True) and save("ChineseEmbedding.bin").
    # NOTE(review): the model is reloaded on every call.
    wv_from_text = gensim.models.KeyedVectors.load('ChineseEmbedding.bin', mmap='r')
    print("文件载入完毕", flush=True)

    row = 0
    for keyword in result_list:
        vec = word_vector(keyword, wv_from_text, min_n=1, max_n=3)
        # word_vector signals "no vector available" with the int 0.  Test the
        # type rather than ``vec is 0``: identity against a literal is
        # unreliable and ``==`` on an array is element-wise.
        if isinstance(vec, int):
            continue

        similar_word = wv_from_text.most_similar(positive=[vec], topn=15)

        # Column 0: the original keyword, coloured by vocabulary membership.
        # Membership is tested on the vector store, not the ``index2word`` list.
        style = blue_style if keyword in wv_from_text else red_style
        sheet.write(row, 0, keyword, style)

        # Columns 1..: the recommended words; similarity scores are not used.
        for col, entry in enumerate(similar_word, start=1):
            sheet.write(row, col, entry[0])
        row += 1

    # Serialise the workbook to memory and hand it back as a file download.
    output = BytesIO()
    workbook.save(output)
    response = HttpResponse(content_type='application/vnd.ms-excel')
    response['Content-Disposition'] = 'attachment;filename=keyword_recommend_result.xls'
    response.write(output.getvalue())
    print("推荐结果耗费时间:", (time.time() - t1) / 60, "minutes", flush=True)
    return response

转载地址:http://jsjqi.baihongyu.com/

你可能感兴趣的文章
Beginning Visual Web Developer 2005 Express: From Novice to Professional
查看>>
Beginning Programming
查看>>
Windows .NET Server 2003 Domains & Active Directory
查看>>
Information Systems : Achieving Success by Avoiding Failure
查看>>
Web Systems Design and Online Consumer Behavior
查看>>
VoIP For Dummies
查看>>
Administrator's Guide to SQL Server 2005
查看>>
Ajax Design Patterns
查看>>
DNS and BIND (5th Edition)
查看>>
Firewall Fundamentals
查看>>
Learning PHP and MySQL
查看>>
Agile Software Construction
查看>>
Computer Security Basics
查看>>
Sams Teach Yourself MySQL in 10 Minutes
查看>>
Information Systems : The State of the Field
查看>>
IPv6 Essentials
查看>>
Microsoft Visual C++ 2005 Express Edition Programming for the Absolute Beginner
查看>>
Microsoft Visual Basic 2005 Express Edition Programming for the Absolute Beginner
查看>>
Pro .NET 2.0 Windows Forms and Custom Controls in C#
查看>>
Beginning Regular Expressions
查看>>