# Source code for transformer
import fileinput
import os
from nltk.tokenize import word_tokenize
from medembed import DIR_PROCESSED
class Transformer:
    """
    Methods that process the dataset before generating word embedding model.

    The instance accumulates raw-token frequencies in ``word_freqs`` while
    cleaning text, and can rewrite the processed ``.txt`` files in
    ``DIR_PROCESSED`` through the CLEVER terminology mapping.
    """

    # Tokens that end a sentence in the 'xml' cleaning mode; each one is
    # emitted as a newline in the cleaned text.
    _SENTENCE_ENDS = frozenset(('?', '!', '.'))

    def __init__(self, categories, apikey=None):
        """
        :param categories: iterable of category strings; every
            whitespace-separated word of every category is collected into
            ``category_tokens``. ``None`` is treated as no categories.
        :param apikey: stored as-is on the instance
        """
        # Category filter applied by make_clean_sample to non-xml input;
        # None means "keep every line". It is not set anywhere else in this
        # class, so callers assign it directly.
        self.category = None
        # Flat set of words. The previous ``' '.split(i)`` had receiver and
        # separator swapped and produced unhashable lists inside set().
        self.category_tokens = {
            token
            for category in (categories or ())
            for token in category.split()
        }
        # Raw (un-lemmatized, case-preserved) token -> number of occurrences.
        self.word_freqs = dict()
        self.apikey = apikey

    def _clean_tokens(self, tokens, stops, stemmer, excluded=(), sentence_ends=()):
        """
        Filter, count and normalize one sequence of tokens.

        :param tokens: iterable of token strings
        :param stops: collection of stopwords to drop
        :param stemmer: object exposing ``lemmatize(token)``
        :param excluded: extra tokens to drop
        :param sentence_ends: tokens that are replaced by a newline
        :return: kept tokens, lemmatized and lower-cased, each followed by a
            space; sentence-end tokens appear as ``'\\n'``
        """
        pieces = []
        for token in tokens:
            if token in sentence_ends:
                pieces.append('\n')
            # NOTE(review): the stopword test runs on the raw token, before
            # lower-casing, so it is case-sensitive -- confirm that `stops`
            # is expected to match the original casing.
            elif token not in stops and token not in excluded and token.isalpha():
                # Count every occurrence, including the first one.
                self.word_freqs[token] = self.word_freqs.get(token, 0) + 1
                pieces.append(stemmer.lemmatize(token).lower() + ' ')
        return ''.join(pieces)

    def make_clean_sample(self, f, stops, stemmer, ftype='xml'):
        """
        raw text -> clean text and generates word_frequency dictionary

        For ``ftype == 'xml'`` the whole of ``f`` is tokenized at once and
        '?', '!' and '.' tokens become newlines. For any other ``ftype``,
        ``f`` is iterated line by line; when ``self.category`` is set, only
        lines containing at least one of its entries are kept, and tokens in
        ``self.category_tokens`` are dropped from them. Cleaned lines are
        concatenated without a separator.

        :param f: raw text (a string for 'xml', otherwise an iterable of lines)
        :param stops: set of stopwords
        :param stemmer: nltk stemmer
        :param ftype: type of files to process
        :return: processed text
        """
        if ftype == 'xml':
            return self._clean_tokens(
                word_tokenize(f), stops, stemmer,
                sentence_ends=self._SENTENCE_ENDS)

        cleaned_lines = []
        for line in f:
            if self.category is None:
                cleaned_lines.append(
                    self._clean_tokens(word_tokenize(line), stops, stemmer))
            elif any(s in line for s in self.category):
                cleaned_lines.append(
                    self._clean_tokens(word_tokenize(line), stops, stemmer,
                                       excluded=self.category_tokens))
        return ''.join(cleaned_lines)

    def transform(self):
        """
        Build the CLEVER term map and apply it to the processed files.
        """
        clever_map = self._make_clever_map()
        # umls_map = self._make_umls_map()
        self._do_mapping(clever_map)

    @staticmethod
    def _make_clever_map():
        """
        Load ``clever_term/clever_base_terminology.txt`` from the parent of
        this module's directory.

        :return: dict mapping the second '|'-separated field of each line to
            its third field
        """
        clever_map = dict()
        fname = os.path.join(os.path.dirname(os.path.dirname(__file__)),
                             'clever_term', 'clever_base_terminology.txt')
        with open(fname, 'r') as f:
            for line in f:
                fields = line.split('|')
                # Blank or short lines carry no mapping; skip them instead
                # of failing with IndexError.
                if len(fields) < 3:
                    continue
                clever_map[fields[1]] = fields[2].strip('\n')
        return clever_map

    def _make_umls_map(self):
        """
        Placeholder for a UMLS mapping; currently always returns None.
        """
        return None

    def _do_mapping(self, clever_map, umls_map=None):
        """
        Rewrite every ``.txt`` file in ``DIR_PROCESSED`` in place, replacing
        each token that is a key of ``clever_map`` with its mapped value.

        :param clever_map: dict of token -> replacement
        :param umls_map: accepted for interface compatibility; not used
        """
        for fname in os.listdir(DIR_PROCESSED):
            if not fname.endswith('.txt'):
                continue
            full_fname = os.path.join(DIR_PROCESSED, fname)
            with open(full_fname, 'r') as f:
                mapped_lines = [
                    ' '.join(str(clever_map.get(token, token))
                             for token in word_tokenize(line)) + '\n'
                    for line in f
                ]
            # The file is fully read before it is reopened for writing, and
            # the write handle is closed deterministically. print() appends
            # one newline after the mapped text, exactly as before.
            with open(full_fname, 'w') as out:
                print(''.join(mapped_lines), file=out)

    @staticmethod
    def _find_frequent(threshold_bigram):
        """
        Reads preprocessed files and counts bigrams

        A bigram is two adjacent whitespace-separated words of one line,
        joined by an underscore.

        :param threshold_bigram: minimum frequency of bigram
        :return: all bigrams that occur more than threshold_bigram times
        """
        bigram_counts = dict()
        for fname in os.listdir(DIR_PROCESSED):
            # listdir yields bare names; resolve them against the directory
            # so the lookup does not depend on the working directory.
            full_fname = os.path.join(DIR_PROCESSED, fname)
            if not os.path.isfile(full_fname):
                continue
            with open(full_fname, 'r') as f:
                for line in f:
                    words = line.split()
                    for first, second in zip(words, words[1:]):
                        bigram = first + '_' + second
                        bigram_counts[bigram] = bigram_counts.get(bigram, 0) + 1
        return {bigram for bigram, count in bigram_counts.items()
                if count > threshold_bigram}