citation-analysis/utils/reader.py

from typing import Iterable

import jsonlines
from allennlp.data import Instance
from allennlp.data.dataset_readers import DatasetReader
from allennlp.data.fields import TextField, LabelField
from allennlp.data.token_indexers import SingleIdTokenIndexer, ELMoTokenCharactersIndexer
from allennlp.data.tokenizers import SpacyTokenizer
from overrides import overrides

from utils.data import Citation


@DatasetReader.register("citation_dataset_reader")  # type for config files
class CitationDataSetReader(DatasetReader):
    """
    Dataset reader for the citation datasets (train|dev|test).

    Subclasses AllenNLP's DatasetReader and overrides ``_read`` and
    ``text_to_instance`` so that every record of a JSON Lines file is converted
    into an Instance holding the tokenized citation text and, when available,
    its intent label.

    The class is registered under the name "citation_dataset_reader" so that
    config files can refer to it. Tokenization uses a SpacyTokenizer created
    with its default arguments.
    """

    def __init__(self):
        super().__init__()
        # default Spacy Tokenizer
        self.tokenizer = SpacyTokenizer()
        # The indexers do not depend on the text being processed, so they are
        # built once here and shared by every TextField instead of being
        # re-created for each instance.
        self._token_indexers = {"elmo": ELMoTokenCharactersIndexer(),
                                "tokens": SingleIdTokenIndexer()}

    @overrides
    def _read(self, file_path: str) -> Iterable[Instance]:
        """
        Read a JSON Lines file and lazily yield one Instance per record.

        :param file_path: path of the JSON Lines file to read
        :return: an iterable of Instances, one for each citation in the file
        """
        ds_reader = DataReaderJsonLines(file_path)
        for citation in ds_reader.read():
            yield self.text_to_instance(citation_text=citation.text, intent=citation.intent)

    @overrides
    def text_to_instance(self, citation_text: str,
                         intent: str = None) -> Instance:
        """
        Tokenize the citation text and wrap it (plus the label) in an Instance.

        :param citation_text: text from the data point
        :param intent: true label of the data instance; may be omitted (None)
            for unlabeled data, in which case no 'label' field is added
        :return: Instance with a 'tokens' field and, if ``intent`` is given,
            a 'label' field
        """
        citation_tokens = self.tokenizer.tokenize(citation_text)

        fields = {'tokens': TextField(citation_tokens, self._token_indexers)}
        # Only attach a label when one is known, so the same method can build
        # instances for unlabeled text.
        if intent is not None:
            fields['label'] = LabelField(intent)

        return Instance(fields)


class DataReaderJsonLines:
    """
    Helper class for reading jsonl (JSON Lines) files
    """
    def __init__(self, file_path: str):
        """
        :param file_path: path of the JSON Lines file to read
        """
        self.file_path = file_path

    def read(self) -> Iterable["Citation"]:
        """
        Lazily read the file and yield one Citation per line.

        The file is opened inside a ``with`` block so the underlying handle is
        closed once iteration finishes (or the generator is closed early),
        rather than being left open.

        :return: generator of Citation objects, one for each line of the file
        """
        with jsonlines.open(self.file_path) as reader:
            for line in reader:
                yield read_json_line(line)


def read_json_line(line):
    """
    Build a Citation object from one parsed JSON Lines record.

    :param line: the json line dictionary; the keys 'string', 'citingPaperId',
        'citedPaperId', 'sectionName', 'label' and 'id' are read from it
    :return: returns a Citation Object populated from those keys
    """
    # (Citation keyword argument, key in the json line) pairs; the keys are
    # looked up in exactly this order.
    field_to_key = (
        ('text', 'string'),
        ('citing_paper_id', 'citingPaperId'),
        ('cited_paper_id', 'citedPaperId'),
        ('section_title', 'sectionName'),
        ('intent', 'label'),
        ('citation_id', 'id'),
    )
    citation_kwargs = {field: line[key] for field, key in field_to_key}
    return Citation(**citation_kwargs)