diff --git a/feature_extraction/lexicons.py b/feature_extraction/lexicons.py
index 5326eda..24cfb30 100644
--- a/feature_extraction/lexicons.py
+++ b/feature_extraction/lexicons.py
@@ -69,6 +69,6 @@ ALL_LEXICONS = {
                   'corpus', 'uml', 'system', 'security', 'protocol', 'classification', 'data transform', 'memory',
                   'java', 'python', 'cluster', 'epoch', 'training', 'deadlock', 'technique'],
-    'CITATION': ['et al'],  # TODO (for Isaac) :: Write a complex regex for finding Citations in the text
+    'CITATION': ['et al']
 }
diff --git a/utils/reader.py b/utils/reader.py
index 38199b6..ce606da 100644
--- a/utils/reader.py
+++ b/utils/reader.py
@@ -11,14 +11,33 @@
 from overrides import overrides
 from utils.data import Citation
 
-@DatasetReader.register("citation_dataset_reader") # type for config files
+@DatasetReader.register("citation_dataset_reader")  # type for config files
 class CitationDataSetReader(DatasetReader):
+    """
+    CitationDataSetReader subclasses AllenNLP's DatasetReader and
+    overrides the required superclass methods.
+
+    It reads a dataset split (train|dev|test) and converts it into a collection of Instances.
+    We use the default SpacyTokenizer for this project.
+
+    The reader is registered so that config files can refer to it by name.
+    """
+
     def __init__(self):
         super().__init__()
+        # default SpacyTokenizer
         self.tokenizer = SpacyTokenizer()
 
     @overrides
     def _read(self, file_path: str) -> Iterable[Instance]:
+        """
+        Read the JSON Lines file, tokenize the text of each data point,
+        and yield one Instance per citation, each carrying tokens and a
+        label.
+
+        :param file_path: path to the JSON Lines dataset file
+        :return: a generator of Instances
+        """
         ds_reader = DataReaderJsonLines(file_path)
         for citation in ds_reader.read():
             yield self.text_to_instance(citation_text=citation.text, intent=citation.intent)
@@ -26,7 +45,15 @@ class CitationDataSetReader(DatasetReader):
 
     @overrides
     def text_to_instance(self, citation_text: str, intent: str) -> Instance:
+
+        """
+        :param citation_text: citation text from the data point
+        :param intent: gold label of the data instance
+        :return: an Instance with tokens and label fields
+        """
+
         citation_tokens = self.tokenizer.tokenize(citation_text)
+        # Use the ELMo token-characters indexer alongside single-id token ids
         token_indexers = {"elmo": ELMoTokenCharactersIndexer(),
                           "tokens": SingleIdTokenIndexer()}
@@ -37,15 +64,27 @@
 
 class DataReaderJsonLines:
+    """
+    Helper class for reading jsonl (JSON Lines) files.
+    """
     def __init__(self, file_path):
         self.file_path = file_path
 
     def read(self):
+        """
+        Open the file and read it line by line.
+        :return: a generator of Citation objects, one per line
+        """
         for line in jsonlines.open(self.file_path):
             yield read_json_line(line)
 
 
 def read_json_line(line):
+
+    """
+    :param line: a parsed JSON-line dictionary
+    :return: a Citation object populated from that dictionary
+    """
     citation = Citation(
         text=line['string'],
         citing_paper_id=line['citingPaperId'],
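
Below is a minimal usage sketch of the reader added in this diff. It is illustrative only: the sample text, the intent value "background", the printed field names, and the file path are assumptions, not taken from the patch. Only `CitationDataSetReader` and `text_to_instance` come from the code above; `read()` is the public entry point inherited from AllenNLP's `DatasetReader`, which calls `_read` internally.

```python
# Illustrative sketch, not part of the patch. The sample text, the
# "background" intent, and the file path are hypothetical.
from utils.reader import CitationDataSetReader

reader = CitationDataSetReader()

# Build a single Instance from raw text and a label.
instance = reader.text_to_instance(
    citation_text="We follow the setup of Peters et al. (2018).",
    intent="background",  # assumed label value
)
print(instance.fields.keys())  # per the docstring: tokens and label fields

# Read a whole JSON Lines split (hypothetical path); each line must at
# least provide the 'string' and 'citingPaperId' keys consumed by
# read_json_line above.
for inst in reader.read("data/dev.jsonl"):
    print(inst)
```

Registering the reader as "citation_dataset_reader" means AllenNLP config files can instantiate it by that name instead of importing the class directly.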