Source code for embedding
import datetime
import os
import random
import gensim
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
from medembed import DIR_PROCESSED
class Embedding:
    """
    Methods to generate and evaluate word embedding vectors.

    ``trained_model`` holds the model produced by the last call to
    :meth:`generate` and ``name_model`` holds the base name (without
    extension) used for the files written for that model.
    """

    # Model types accepted by generate().
    _MODEL_TYPES = ('word2vec', 'fasttext')

    def __init__(self, verbose):
        """
        :param verbose: when truthy, progress messages are printed to stdout
        """
        self.verbose = verbose
        self.trained_model = None
        self.name_model = None

    def _ensure_directory(self, path, message):
        """
        Create ``path`` (including parents) when it does not exist yet.

        :param path: directory to create
        :param message: format string with one ``{}`` placeholder for the
            path; printed only in verbose mode and only when the directory
            is actually created
        :return: None
        """
        if os.path.exists(path):
            return
        if self.verbose:
            print(message.format(path))
        os.makedirs(path)

    def generate(self, model_type, dim, workers):
        """
        Models word embedding vector and saves it to file

        The model is trained on the files found in ``DIR_PROCESSED`` and is
        written to the ``we_models`` directory located one level above the
        directory containing this module. On success the model and its base
        name are stored in ``self.trained_model`` and ``self.name_model``.

        :param model_type: 'word2vec' or 'fasttext'
        :param dim: dimensions of word embedding vector
        :param workers: number of workers to parallelise training of word embedding model
        :return: None
        :raises ValueError: if ``model_type`` is not one of the supported types
        """
        # Fail early with a clear message; otherwise no model would be built
        # and the failure would only surface later as an AttributeError on None.
        if model_type not in self._MODEL_TYPES:
            raise ValueError(
                "model_type must be one of {}, got {!r}".format(self._MODEL_TYPES, model_type)
            )

        if self.verbose:
            print('Generating word embedding vector with {} model_type'.format(model_type))

        # Both model types are trained on the same corpus iterator.
        sentences = gensim.models.word2vec.PathLineSentences(DIR_PROCESSED)
        if model_type == 'word2vec':
            # sg=1 selects the skip-gram training algorithm (see gensim docs).
            model = gensim.models.word2vec.Word2Vec(
                sentences, size=dim, window=5, sg=1, workers=workers
            )
        else:
            model = gensim.models.FastText(sentences, size=dim, window=5, workers=workers)

        # The file name carries a month-day_hour:minute timestamp.
        # NOTE(review): ':' is not a valid file-name character on Windows.
        now = datetime.datetime.now()
        name_model = model_type + '_' + now.strftime('%m-%d_%H:%M')
        fname_model = name_model + '.bin'
        save_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'we_models')
        full_fname = os.path.join(save_path, fname_model)

        self._ensure_directory(save_path, 'Creating a directory for processed files at {}')
        model.save(full_fname)
        if self.verbose:
            print('Model saved as {} in Medembed/we_models'.format(fname_model))

        self.trained_model = model
        self.name_model = name_model

    def tSNE(self, model_file=None):
        """
        Creates TSNE model, plots it and saves it

        The plot is written as ``<name_model>.png`` to the ``plots``
        directory located one level above the directory containing this
        module. Roughly 5% of the words, chosen at random, are labelled.

        :param model_file: optional path to a saved model; when omitted the
            model produced by the last call to :meth:`generate` is used
        :return: None
        :raises ValueError: if no ``model_file`` is given and no model has
            been trained on this instance
        """
        if model_file is None and self.trained_model is None:
            raise ValueError(
                'No model available: pass model_file or call generate() first'
            )

        if self.verbose:
            print('Generating tSNE plot')

        if model_file is not None:
            model = gensim.models.KeyedVectors.load(model_file)
            # Name the plot after the loaded file instead of a fixed name, so
            # plots of different models do not overwrite each other.
            self.name_model = os.path.splitext(os.path.basename(model_file))[0]
        else:
            model = self.trained_model

        words = list(model.wv.vocab)
        tokens = [model.wv[word] for word in words]
        # Label only a random ~5% of the points to keep the plot readable;
        # the rest get a blank label.
        labels = [word if random.random() < 0.05 else ' ' for word in words]

        tsne_model = TSNE(perplexity=30, n_components=2, init='pca', random_state=23)
        new_values = tsne_model.fit_transform(tokens)

        plt.figure(figsize=(16, 16))
        sns.set()
        for (x, y), label in zip(new_values, labels):
            # One scatter call per point, so each point is drawn separately.
            plt.scatter(x, y)
            plt.annotate(label,
                         xy=(x, y),
                         xytext=(5, 2),
                         textcoords='offset points',
                         ha='right',
                         va='bottom')

        save_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'plots')
        self._ensure_directory(save_path, 'Creating a directory for plot files at {}')

        fname = self.name_model + '.png'
        plt.savefig(os.path.join(save_path, fname), bbox_inches='tight')
        if self.verbose:
            print('Plot saved as {} in Medembed/plots'.format(fname))