WIP : Code Documentation & README Documentation

isaac
Pavan Mandava 5 years ago
parent 12c9610f0b
commit 281205b0df

@ -69,6 +69,6 @@ ALL_LEXICONS = {
'corpus', 'uml', 'system', 'security', 'protocol', 'classification', 'data transform',
'memory', 'java', 'python', 'cluster', 'epoch', 'training', 'deadlock', 'technique'],
'CITATION': ['et al'], # TODO (for Isaac) :: Write a complex regex for finding Citations in the text
'CITATION': ['et al']
}

@ -13,12 +13,31 @@ from utils.data import Citation
@DatasetReader.register("citation_dataset_reader") # type for config files
class CitationDataSetReader(DatasetReader):
"""
We implement this CitationDataSetReader class by subclassing DatasetReader class,
we also need to override some super class methods
This CitationDataSetReader class reads the datasets(train|dev|test) and converts them to a collection of Instances.
We used the default SpacyTokenizer for this project.
We also need to register this dataset reader, for the Config files to be able to use this class.
"""
def __init__(self):
    """
    Set up the reader: run the DatasetReader initialisation first and then
    attach the tokenizer that is used to split citation text into tokens.
    """
    super().__init__()

    # SpacyTokenizer with its default settings is the project's tokenizer
    self.tokenizer = SpacyTokenizer()
@overrides
def _read(self, file_path: str) -> Iterable[Instance]:
    """
    Lazily walk through a JSON Lines dataset file and turn every data point
    into an Instance by delegating to ``text_to_instance``.

    :param file_path: path of the JSON Lines file to read
    :return: a generator yielding one Instance per data point in the file
    """
    json_lines_reader = DataReaderJsonLines(file_path)
    data_points = json_lines_reader.read()

    for data_point in data_points:
        # only the citation text and its intent label are needed for an Instance
        instance = self.text_to_instance(citation_text=data_point.text,
                                         intent=data_point.intent)
        yield instance
@ -26,7 +45,15 @@ class CitationDataSetReader(DatasetReader):
@overrides
def text_to_instance(self, citation_text: str,
intent: str) -> Instance:
"""
:param citation_text: text from the data point
:param intent: true label of the data instance
:return: returns Instance class object with tokens & label fields.
"""
citation_tokens = self.tokenizer.tokenize(citation_text)
# Use ELMO Token Characters Indexer
token_indexers = {"elmo": ELMoTokenCharactersIndexer(),
"tokens": SingleIdTokenIndexer()}
@ -37,15 +64,27 @@ class CitationDataSetReader(DatasetReader):
class DataReaderJsonLines:
    """
    Helper class for reading jsonl (JSON Lines) files.

    Every line of the file is passed to ``read_json_line`` and the results
    are handed back to the caller one at a time.
    """

    def __init__(self, file_path):
        """
        :param file_path: path of the JSON Lines file that ``read`` will open
        """
        self.file_path = file_path

    def read(self):
        """
        Open the file and lazily yield one converted object per line.

        The reader is used as a context manager so that the underlying file
        handle is closed as soon as iteration finishes (or the generator is
        closed early), instead of staying open until garbage collection.

        :return: generator of the objects returned by ``read_json_line``
        """
        with jsonlines.open(self.file_path) as reader:
            for line in reader:
                yield read_json_line(line)
def read_json_line(line):
"""
:param line: takes the json line dictionary as a parameter
:return: returns a Citation Object
"""
citation = Citation(
text=line['string'],
citing_paper_id=line['citingPaperId'],

Loading…
Cancel
Save