相关词推荐

作者: admin 时间: 2020-03-19 分类: 默认分类

### 需求

根据用户输入的搜索词进行搜索词推荐,增加用户的选择。

### 实现方法

这里我们使用word2vec进行词向量模型训练,找到与搜索词最近的几个词作为推荐项。

### 语料库

每个图片的所有关键词作为关联数据,格式如下

```
人物,人士,人氏,人选
人类,生人,全人类
人手,人员,人口,人丁,食指
劳力,劳动力,工作者
```

### 实现

```python
import os
import re

from gensim.models import Word2Vec


class MySentences(object):
    # Streams one token list per corpus line, so the whole corpus never has
    # to be held in memory at once.

    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        for fname in os.listdir(self.dirname):
            # `with` closes each corpus file as soon as it has been consumed.
            with open(os.path.join(self.dirname, fname), errors='replace') as f:
                for line in f:
                    # Keywords are comma-separated (see the corpus format
                    # above); a plain line.split() would keep every line as
                    # one single token. Whitespace is still accepted as a
                    # separator, and empty tokens are dropped.
                    yield [w for w in re.split(r'[,，\s]+', line) if w]


# Directory that holds the corpus files
sentences = MySentences('./yuliaoku')
model = Word2Vec(sentences, size=300, window=5, min_count=1, workers=4)
model.save("model/keyword")
model.wv.save_word2vec_format('production_model/model.bin.gz', binary=True)
```

### 增量训练

```python
import os
import re
import shutil

from gensim.models import Word2Vec


class MySentences(object):
    # Streams one token list per corpus line, so the whole corpus never has
    # to be held in memory at once.

    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        for fname in os.listdir(self.dirname):
            # `with` closes each corpus file as soon as it has been consumed.
            with open(os.path.join(self.dirname, fname), errors='ignore') as f:
                for line in f:
                    # Same tokenization as the initial training script:
                    # split on commas and whitespace, drop empty tokens.
                    yield [w for w in re.split(r'[,，\s]+', line) if w]


# Directory that holds the incremental corpus files
sentences = MySentences('./update')
model = Word2Vec.load("model/keyword")
# Register words that only occur in the new corpus; without this step
# train() silently skips them. It also refreshes model.corpus_count so that
# total_examples below describes the incremental corpus, not the old one.
model.build_vocab(sentences, update=True)
model.train(sentences, total_examples=model.corpus_count, epochs=1)
model.save("model/keyword")
model.wv.save_word2vec_format('production_model/model.bin.gz', binary=True)
# Clear the incremental corpus once it has been consumed
shutil.rmtree('./update')
os.mkdir('./update')
```

### 使用

安装fastapi和uvicorn

```python
#vec.py
from gensim.models import KeyedVectors
import hanlp

# Load the word vectors
model = KeyedVectors.load_word2vec_format('production_model/model.bin.gz', binary=True)
# Load the word-segmentation model
tokenizer = hanlp.load('PKU_NAME_MERGED_SIX_MONTHS_CONVSEG')


def get_most_similar(keyword):
    # Tokenizes the keyword, keeps the tokens present in the vocabulary and
    # returns the words most similar to them. Returns [] when no token is
    # known or when the lookup fails for any reason (best effort).
    try:
        search_words = [word for word in tokenizer(keyword) if word in model.vocab]
        if not search_words:
            # Nothing to query with; most_similar() would raise on an
            # empty list.
            return []
        return [i[0] for i in model.most_similar(search_words)]
    except Exception:
        return []
```

```python
#main.py
import json
from extension.vec import get_most_similar
from fastapi import FastAPI

app = FastAPI()


@app.get('/recommend')
async def recommend(keyword: str):
    # Wraps the recommended words for `keyword` in the response envelope.
    result = get_most_similar(keyword)
    response = {
        'result': True,
        'data': result
    }
    return response
```

使用`uvicorn main:app --workers 4`运行服务

标签: word2vec, fastapi, 词向量, 相关词, 推荐算法