You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

97 lines
3.2 KiB

from typing import Iterable
import jsonlines
from allennlp.data import Instance
from allennlp.data.dataset_readers import DatasetReader
from allennlp.data.fields import TextField, LabelField
from allennlp.data.token_indexers import SingleIdTokenIndexer, ELMoTokenCharactersIndexer
from allennlp.data.tokenizers import SpacyTokenizer
from overrides import overrides
from utils.data import Citation
@DatasetReader.register("citation_dataset_reader")  # type for config files
class CitationDataSetReader(DatasetReader):
    """
    Dataset reader that converts citation JSON Lines files into Instances.

    The class subclasses DatasetReader and overrides ``_read`` and
    ``text_to_instance``. It reads a dataset file (train|dev|test), tokenizes
    the citation text with the default SpacyTokenizer and produces one
    Instance per data point.

    It is registered under the name "citation_dataset_reader" so that config
    files can refer to it.
    """

    def __init__(self):
        super().__init__()
        # default Spacy Tokenizer
        self.tokenizer = SpacyTokenizer()
        # The indexers do not depend on the individual data point, so they are
        # built once here and shared by every TextField instead of being
        # re-created for each instance.
        self._token_indexers = {"elmo": ELMoTokenCharactersIndexer(),
                                "tokens": SingleIdTokenIndexer()}

    @overrides
    def _read(self, file_path: str) -> Iterable[Instance]:
        """
        Read a JSON Lines file and lazily yield one Instance per data point.

        :param file_path: path of the JSON Lines file to read
        :return: an iterable of Instances, each with tokens and a label
        """
        ds_reader = DataReaderJsonLines(file_path)
        for citation in ds_reader.read():
            yield self.text_to_instance(citation_text=citation.text, intent=citation.intent)

    @overrides
    def text_to_instance(self, citation_text: str,
                         intent: str) -> Instance:
        """
        Build an Instance from a citation text and its label.

        :param citation_text: text from the data point
        :param intent: true label of the data instance
        :return: Instance with a 'tokens' TextField and a 'label' LabelField
        """
        citation_tokens = self.tokenizer.tokenize(citation_text)
        # 'tokens' is indexed with both the ELMo character indexer and the
        # single-id indexer configured in __init__.
        fields = {'tokens': TextField(citation_tokens, self._token_indexers),
                  'label': LabelField(intent)}
        return Instance(fields)
class DataReaderJsonLines:
    """
    Helper class for reading jsonl (JSON Lines) files.
    """

    def __init__(self, file_path):
        """
        :param file_path: path of the JSON Lines file to read
        """
        self.file_path = file_path

    def read(self):
        """
        Open the file and lazily yield one Citation object per line.

        The reader is used as a context manager so the underlying file is
        closed once iteration finishes, or when the generator is closed early,
        instead of being left open.

        :return: generator of Citation objects built by ``read_json_line``
        """
        with jsonlines.open(self.file_path) as reader:
            for line in reader:
                yield read_json_line(line)
def read_json_line(line):
    """
    Convert one parsed JSON line into a Citation object.

    :param line: the json line dictionary for a single data point
    :return: a Citation built from the fields of ``line``
    """
    # Citation keyword -> key in the JSON line. Insertion order matches the
    # order in which the keys are looked up.
    keyword_to_json_key = {
        'text': 'string',
        'citing_paper_id': 'citingPaperId',
        'cited_paper_id': 'citedPaperId',
        'section_title': 'sectionName',
        'intent': 'label',
        'citation_id': 'id',
    }
    citation_kwargs = {
        keyword: line[json_key]
        for keyword, json_key in keyword_to_json_key.items()
    }
    return Citation(**citation_kwargs)