# Source code for dataset
import os
from transformer import Transformer
try:
# noinspection PyPep8Naming
import xml.etree.cElementTree as ET
except ImportError:
# noinspection PyPep8Naming
import xml.etree.ElementTree as ET
import gensim
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from medembed import DIR_PROCESSED
class DataSet:
    """
    Holds the dataset and the methods associated with it.

    This is a base class: ``_read_extract`` raises ``NotImplementedError``
    here and has to be provided by a subclass, which is also expected to
    set ``self.type``.
    """

    def __init__(self, directory, verbose, categories):
        """
        :param directory: directory whose files are to be processed
        :param verbose: if truthy, progress messages are printed to stdout
        :param categories: path to a text file listing categories (one per
            line), or None
        """
        self.dir = directory
        self.verbose = verbose
        # Filled in by preprocess() with a gensim Dictionary.
        self.dictionary = None
        self.categories = categories
        # Overwritten by subclasses; forwarded by them to the transformer.
        self.type = None
        self.stemmer = WordNetLemmatizer()
        self.stops = self._make_stops()

    def _read_extract(self, transformer):
        """
        Read the raw files and write the cleaned ones; subclass hook.

        :param transformer: the Transformer created in preprocess()
        :raises NotImplementedError: always, in this base class
        """
        raise NotImplementedError

    @staticmethod
    def _make_stops():
        """
        Build the stop word set.

        :return: NLTK English stop words with the negations 'no', 'nor'
            and 'not' removed, so negations are not treated as stop words
        """
        stops = set(stopwords.words('english'))
        stops.difference_update({'no', 'nor', 'not'})
        return stops

    def preprocess(self):
        """
        Calls pre-processing methods and prints progress (if verbose)

        :return: None
        """
        if self.verbose:
            print('Processing files in directory {}'.format(self.dir))
        categories = self._make_categories(self.categories)
        transformer = Transformer(categories)
        if not os.path.exists(DIR_PROCESSED):
            if self.verbose:
                print('Creating a directory for processed files at {}'.format(DIR_PROCESSED))
            # exist_ok guards against the directory appearing between the
            # existence check above and this call.
            os.makedirs(DIR_PROCESSED, exist_ok=True)
        self._read_extract(transformer)
        if self.verbose:
            print('Finding word and bigram frequencies')
            print('{} unique words after processing'.format(len(transformer.word_freqs)))
        self.dictionary = gensim.corpora.Dictionary(DataSet.iter_documents())
        if self.verbose:
            print('Performing UMLS and CLEVER mapping')
        transformer.transform()

    @staticmethod
    def iter_documents():
        """
        Generator: iterate over all '.txt' files below DIR_PROCESSED

        :return: yields, per file, the token iterable produced by
            gensim.utils.tokenize for that file's content
        """
        for root, _dirs, files in os.walk(DIR_PROCESSED):
            for fname in files:
                if not fname.endswith('.txt'):
                    continue
                # Read inside a context manager so the handle is closed
                # before the next file is opened.
                with open(os.path.join(root, fname)) as handle:
                    document = handle.read()
                yield gensim.utils.tokenize(document, errors='ignore')

    @staticmethod
    def _make_categories(categories):
        """
        Makes a list of categories to extract from a raw document

        :param categories: path to the categories file, or None
        :return: the lines of the file, or an empty list if categories is None
        """
        if categories is None:
            return []
        with open(categories, 'r') as f:
            # readlines() keeps each line's trailing newline.
            # NOTE(review): confirm Transformer expects the newline to be kept.
            return f.readlines()
class XMLDataset(DataSet):
    """
    Dataset whose raw documents are the '.xml' files in the data directory
    """

    def __init__(self, directory, verbose, categories):
        """
        :param directory: directory containing the '.xml' files
        :param verbose: if truthy, progress messages are printed to stdout
        :param categories: forwarded unchanged to DataSet.__init__
        """
        super().__init__(directory, verbose, categories)
        self.type = 'xml'

    def _read_extract(self, transformer):
        """
        Reads xml files in data directory, cleans files and writes each file to preprocessed directory

        :param transformer: object whose make_clean_sample() produces the
            cleaned text for one document
        :return: None
        """
        directory_files = os.listdir(self.dir)
        if self.verbose:
            # Counts every directory entry, not only the '.xml' ones.
            print('{} files found'.format(len(directory_files)))
        suffix = '.xml'
        file_count = 0
        for fname in directory_files:
            if not fname.endswith(suffix):
                continue
            tree = ET.parse(os.path.join(self.dir, fname))
            # Only the first child of the root element is serialized and cleaned.
            raw = ET.tostring(tree.getroot()[0]).decode()
            clean_sample = transformer.make_clean_sample(raw, self.stops, self.stemmer, self.type)
            # Strip only the trailing suffix; splitting on '.xml' would cut
            # the name at the first occurrence and could make two different
            # inputs collide on the same output file.
            new_fname = os.path.join(DIR_PROCESSED, fname[:-len(suffix)] + '.txt')
            # Context manager guarantees the output is flushed and closed.
            with open(new_fname, 'w') as out:
                print(clean_sample, file=out)
            file_count += 1
            if self.verbose and file_count % 50 == 0:
                print('Processed {} files'.format(file_count))
class TxtDataset(DataSet):
    """
    Dataset whose raw documents are the '.txt' files in the data directory
    """

    def __init__(self, directory, verbose, categories):
        """
        :param directory: directory containing the '.txt' files
        :param verbose: if truthy, progress messages are printed to stdout
        :param categories: forwarded unchanged to DataSet.__init__
        """
        super().__init__(directory, verbose, categories)
        self.type = 'txt'

    def _read_extract(self, transformer):
        """
        Reads txt files in data directory, cleans files and writes each file to preprocessed directory

        :param transformer: object whose make_clean_sample() receives the
            open input file and produces the cleaned text
        :return: None
        """
        directory_files = os.listdir(self.dir)
        if self.verbose:
            # Counts every directory entry, not only the '.txt' ones.
            print('{} files found'.format(len(directory_files)))
        file_count = 0
        for fname in directory_files:
            if not fname.endswith('.txt'):
                continue
            with open(os.path.join(self.dir, fname), 'r') as f:
                clean_sample = transformer.make_clean_sample(f, self.stops, self.stemmer, self.type)
            # The processed file keeps the name of the input file.
            new_fname = os.path.join(DIR_PROCESSED, fname)
            # Context manager guarantees the output is flushed and closed.
            with open(new_fname, 'w') as out:
                print(clean_sample, file=out)
            file_count += 1
            if self.verbose and file_count % 50 == 0:
                print('Processed {} files'.format(file_count))