from gensim.models import KeyedVectors

# Persist only the words and their trained embeddings (drops the rest of the model).
word_vectors = model.wv
word_vectors.save("word2vec.wordvectors")

# Reload with memory-mapping: read-only, so the vectors can be shared across processes.
wv = KeyedVectors.load("word2vec.wordvectors", mmap='r')

# Look up the numpy vector for a single word.
vector = wv['computer']
Gensim 还可以加载“word2vec C 格式”的词向量,作为 KeyedVectors 实例
from gensim.test.utils import datapath

# Load word2vec vectors stored in the C *text* format as a KeyedVectors instance.
wv_from_text = KeyedVectors.load_word2vec_format(
    datapath('word2vec_pre_kv_c'), binary=False)

# Load word2vec vectors stored in the C *binary* format.
wv_from_bin = KeyedVectors.load_word2vec_format(
    datapath("euclidean_vectors.bin"), binary=True)
如果完成了模型的训练(即不再更新,仅查询),您可以切换到 KeyedVectors 实例:
# Training is finished: keep only the lightweight KeyedVectors for querying,
# then release the full model to reclaim its memory.
word_vectors = model.wv
del model
将训练好的参数导入到 torch 的 Embedding 中
# Import the pre-trained word2vec weights into a torch nn.Embedding layer.
# NOTE(review): the names `vacab`/`embeding` are misspellings of
# `vocab`/`embedding`; they are kept as-is for backward compatibility.

# word -> row index, following gensim's storage order (index_to_key).
vacab = {key: i for i, key in enumerate(pretrained_vec.wv.index_to_key)}

# wv.vectors is already the full (vocab_size, dim) numpy matrix in the same
# index order, so convert it in one shot instead of appending per-word Python
# lists (the original loop was needlessly slow and allocation-heavy).
vects = torch.Tensor(pretrained_vec.wv.vectors)
del pretrained_vec  # free the gensim model once the weights are copied

embeding = nn.Embedding(len(vacab), vects.shape[1])
# Wrap in nn.Parameter so the embeddings stay trainable (requires_grad=True),
# matching the original assignment semantics.
embeding.weight = nn.Parameter(vects)
from gensim.models import Phrases

# Train a bigram (multi-word expression) detector on the corpus.
bigram_transformer = Phrases(common_texts)

# Pass the corpus through the trained detector, then train
# a Word2Vec model on the phrase-merged sentences.
model = Word2Vec(bigram_transformer[common_texts], min_count=1)
预训练模型
在 Gensim 数据存储库中附带了几个已经预训练的模型:
import gensim.downloader

# List every pre-trained model shipped in the gensim-data repository.
print(list(gensim.downloader.info()['models'].keys()))

# Download the 25-dimensional GloVe Twitter embeddings.
glove_vectors = gensim.downloader.load('glove-twitter-25')

# Query the downloaded vectors like any other KeyedVectors instance.
glove_vectors.most_similar('twitter')