@@ -11,14 +11,33 @@ from overrides import overrides
from utils.data import Citation


@DatasetReader.register("citation_dataset_reader")  # type for config files
class CitationDataSetReader(DatasetReader):
    """
    We implement CitationDataSetReader by subclassing AllenNLP's DatasetReader
    class and overriding some of its methods (_read and text_to_instance).

    This class reads the datasets (train|dev|test) and converts them into a
    collection of Instances. We use the default SpacyTokenizer for this project.

    We also register this dataset reader so that the config files can use this
    class by its registered name.
    """
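
    # A minimal sketch of how a config file could reference the registered name
    # above (the surrounding keys and the path are illustrative assumptions, not
    # taken from this project's configs):
    #
    #   "dataset_reader": {"type": "citation_dataset_reader"},
    #   "train_data_path": "data/train.jsonl",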

    def __init__(self):
        super().__init__()
        # default Spacy tokenizer
        self.tokenizer = SpacyTokenizer()
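
    # Rough illustration of what the Spacy tokenizer yields on a citation
    # sentence (the output shown is approximate and only for illustration):
    #
    #   self.tokenizer.tokenize("This method improves accuracy (Smith, 2019).")
    #   -> [This, method, improves, accuracy, (, Smith, ,, 2019, ), .]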

    @overrides
    def _read(self, file_path: str) -> Iterable[Instance]:
        """
        Reads the JSON Lines file, tokenizes the text of each data point, and
        yields a collection of Instances, each with tokens and a label.

        :param file_path: path of the dataset file to read
        :return: an iterable of Instances
        """
        ds_reader = DataReaderJsonLines(file_path)
        for citation in ds_reader.read():
            yield self.text_to_instance(citation_text=citation.text, intent=citation.intent)
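
# Usage sketch only: iterating over the reader's output directly. The file path
# is an assumption; the field names follow the docstrings in this module.
#
#   reader = CitationDataSetReader()
#   for instance in reader._read("dev.jsonl"):
#       print(instance["tokens"], instance["label"])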

@@ -26,7 +45,15 @@ class CitationDataSetReader(DatasetReader):
    @overrides
    def text_to_instance(self, citation_text: str,
                         intent: str) -> Instance:
        """
        :param citation_text: text of the data point
        :param intent: true label of the data point
        :return: an Instance with tokens and label fields
        """
        citation_tokens = self.tokenizer.tokenize(citation_text)
        # use the ELMo token-characters indexer alongside a single-id indexer
        token_indexers = {"elmo": ELMoTokenCharactersIndexer(),
                          "tokens": SingleIdTokenIndexer()}
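
        # The rest of this method falls outside this hunk; a generic sketch of
        # the usual AllenNLP pattern (not the project's actual code) would be:
        #
        #   fields = {"tokens": TextField(citation_tokens, token_indexers),
        #             "label": LabelField(intent)}
        #   return Instance(fields)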

@@ -37,15 +64,27 @@ class CitationDataSetReader(DatasetReader):

class DataReaderJsonLines:
    """
    Helper class for reading jsonl (JSON Lines) files.
    """

    def __init__(self, file_path):
        self.file_path = file_path

    def read(self):
        """
        Opens the file, reads every line, and yields a Citation object per line.

        :return: a generator of Citation objects with the required data
        """
        for line in jsonlines.open(self.file_path):
            yield read_json_line(line)
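
# For reference, each line of the jsonl file is a single JSON object; a made-up
# example using only the keys read below:
#
#   {"string": "...citation sentence...", "citingPaperId": "paper-123", ...}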

def read_json_line(line):
    """
    :param line: the json line as a dictionary
    :return: a Citation object
    """
    citation = Citation(
        text=line['string'],
        citing_paper_id=line['citingPaperId'],