梁越

word2vec中文词向量结合PCA算法在二维空间下可视化分析-代码

0 人看过

记录下

%matplotlib inline


from jupyterthemes import jtplot
jtplot.style(theme='grade3') # Select a plotting theme for the notebook's figures

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
import adjustText

from gensim.models.keyedvectors import KeyedVectors

# Load the pre-trained embeddings. The file is read as word2vec *text* format
# (binary=False) even though its name ends in .bin; `limit` caps how many
# vectors are read from the start of the file.
word_vectors = KeyedVectors.load_word2vec_format(
    'C:/Users/yue/Desktop/1.bin',
    binary=False,
    limit=1000000,
)

def plot_2d_representation_of_words(
    word_list,
    word_vectors,
    flip_x_axis=False,
    flip_y_axis=False,
    label_x_axis="x",
    label_y_axis="y",
    label_label="fruit",
):
    """Scatter-plot words after projecting their vectors to 2-D with PCA.

    Each word in ``word_list`` is looked up in ``word_vectors``, the
    resulting vectors are reduced to two principal components, and the
    points are drawn on a new 5x3 matplotlib figure with one text label per
    word. Label positions are then adjusted with ``adjustText.adjust_text``.

    Args:
        word_list: Iterable of words to plot. Every word is looked up with
            ``word_vectors[word]``; a failed lookup is not handled here and
            propagates to the caller.
        word_vectors: Mapping-like object returning a 1-D numeric vector for
            a word (e.g. a gensim ``KeyedVectors``). All vectors are used in
            full, whatever their dimensionality.
        flip_x_axis: If true, negate the first principal component.
        flip_y_axis: If true, negate the second principal component.
        label_x_axis: Column name / axis label for the first component.
        label_y_axis: Column name / axis label for the second component.
        label_label: Column name under which the words are stored.

    Raises:
        ValueError: If fewer than two words are supplied, since a
            2-component PCA cannot be fitted on fewer than two samples.
    """
    words = list(word_list)
    if len(words) < 2:
        raise ValueError(
            "need at least two words to compute a 2-component PCA projection")

    # One row per word, one column per embedding dimension. Building the
    # numeric matrix directly avoids slicing with a hard-coded width, which
    # previously dropped the last dimension of 300-dimensional vectors.
    vectors = pd.DataFrame([word_vectors[word] for word in words])

    pca = PCA(n_components=2)
    coordinates_2d = pd.DataFrame(
        pca.fit_transform(vectors), columns=[label_x_axis, label_y_axis])
    coordinates_2d[label_label] = words

    # The sign of a principal component is arbitrary, so callers may mirror
    # either axis to get the orientation they prefer.
    if flip_x_axis:
        coordinates_2d[label_x_axis] = coordinates_2d[label_x_axis] * (-1)
    if flip_y_axis:
        coordinates_2d[label_y_axis] = coordinates_2d[label_y_axis] * (-1)

    plt.figure(figsize=(5, 3))
    sns.scatterplot(data=coordinates_2d, x=label_x_axis, y=label_y_axis)

    texts = [
        plt.text(x, y, label)
        for x, y, label in zip(
            coordinates_2d[label_x_axis],
            coordinates_2d[label_y_axis],
            coordinates_2d[label_label])
    ]
    # Nudge the labels so they do not overlap each other.
    adjustText.adjust_text(texts)

from pylab import mpl

# Set the default sans-serif font so that Chinese text can be displayed in
# plots, and disable the Unicode minus so that the '-' sign is not rendered
# as a square box in saved figures.
mpl.rcParams.update({
    'font.sans-serif': ['STZhongsong'],
    'axes.unicode_minus': False,
})

# Alternative English word list, for use with English embeddings:
# fruits = ['apple','orange','banana','lemon','car','tram','boat','bicycle','cherry','mango','grape','durian','watermelon','train','motorbike','ship',  'peach','pear','pomegranate','strawberry','bike','bus','truck','subway','airplane']

# Chinese words to visualise: apple, bicycle, banana, car, person.
fruits = ['苹果', '自行车', '香蕉', '汽车', '人']

# Mirror the y axis only; every other option keeps its default.
plot_2d_representation_of_words(fruits, word_vectors, flip_y_axis=True)

以上代码在 Jupyter Notebook 中运行,使用的是腾讯 AI Lab 的中文词向量:下载压缩包并解压后,把最里面的 txt 文件的扩展名改成 bin 即可。文件内容仍然是文本格式,所以加载时要设置 binary = False。

从图中可以看到词语之间的语义关系。