data readers added

isaac
Pavan Mandava 6 years ago
parent 05ccf02bb2
commit 9f9a271bc0

@ -0,0 +1,14 @@
from allennlp.modules.elmo import Elmo, batch_to_ids
# Demo script: embed one tokenized sentence with a pre-trained ELMo model
# and print the character-id batch and the resulting representations.

# Remote locations of the pre-trained ELMo weights and matching options file.
weights_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'
options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json'

# Third positional argument is the number of output representations; dropout is disabled.
elmo = Elmo(options_file, weights_file, 1, dropout=0)

# A single, already tokenized sentence.
text = ['Backgammon', 'is', 'one', 'of', 'the', 'oldest', 'known', 'board', 'games']

# batch_to_ids takes a *batch* of tokenized sentences (a list of token lists).
# Passing the flat token list would make every word its own "sentence" and
# every character its own "token", so the sentence is wrapped in a list.
batch = batch_to_ids([text])
print(batch)

# Call the module itself (not .forward) so torch's __call__ machinery runs,
# and avoid naming the result `dict`, which would shadow the builtin.
elmo_output = elmo(batch)
print(elmo_output['elmo_representations'])

@ -32,3 +32,5 @@ REGEX_CONSTANTS = {
'ENDS_WITH_ETHYL': re.compile(r"ethyl\b") 'ENDS_WITH_ETHYL': re.compile(r"ethyl\b")
} }
# Maps each class name to its integer label id.
CLASS_LABELS = {
    "background": 0,
    "method": 1,
    "result": 2,
}

@ -0,0 +1,17 @@
class Citation(object):
    """Class representing a citation object.

    A plain record: every constructor argument is stored unchanged on the
    attribute of the same name.

    Args:
        text: the citation text.
        citing_paper_id: identifier of the citing paper.
        cited_paper_id: identifier of the cited paper.
        section_title: optional section title; defaults to ``None``.
        intent: optional intent label; defaults to ``None``.
        citation_id: optional identifier of the citation; defaults to ``None``.
    """

    def __init__(self,
                 text,
                 citing_paper_id,
                 cited_paper_id,
                 section_title=None,
                 intent=None,
                 citation_id=None
                 ):
        self.text = text
        self.citing_paper_id = citing_paper_id
        self.cited_paper_id = cited_paper_id
        self.section_title = section_title
        self.intent = intent
        self.citation_id = citation_id

    def __repr__(self):
        """Return a constructor-style string listing every stored field."""
        return (
            f"{type(self).__name__}("
            f"text={self.text!r}, "
            f"citing_paper_id={self.citing_paper_id!r}, "
            f"cited_paper_id={self.cited_paper_id!r}, "
            f"section_title={self.section_title!r}, "
            f"intent={self.intent!r}, "
            f"citation_id={self.citation_id!r})"
        )

@ -0,0 +1,57 @@
from typing import Iterable, List
from overrides import overrides
import jsonlines
from utils.data import Citation
from allennlp.data.fields import TextField, LabelField, MultiLabelField
from allennlp.data import Instance, Tokenizer
from allennlp.data.dataset_readers import DatasetReader
from allennlp.data.token_indexers import SingleIdTokenIndexer, ELMoTokenCharactersIndexer
import utils.constants as const
class CitationDataSetReader(DatasetReader):
    """Dataset reader that converts JSON-lines citation records into instances.

    Records are loaded through ``DataReaderJsonLines``; each one becomes an
    ``Instance`` with a ``tokens`` text field and, when an intent is given,
    a ``label`` field built from ``const.CLASS_LABELS``.
    """

    def __init__(self, tokenizer: Tokenizer = None):
        """Set up the tokenizer and token indexers.

        Args:
            tokenizer: tokenizer used to split the citation text. When
                omitted, the default implementation registered on
                ``Tokenizer`` is instantiated.
        """
        super().__init__()
        if tokenizer is None:
            # The base ``Tokenizer`` does not implement ``tokenize`` (it raises
            # NotImplementedError), so a concrete implementation is required.
            tokenizer = Tokenizer.by_name(Tokenizer.default_implementation)()
        self.tokenizer = tokenizer
        # Built once and shared by every TextField instead of per instance.
        self.token_indexers = {"elmo": ELMoTokenCharactersIndexer(),
                               "tokens": SingleIdTokenIndexer()}

    @overrides
    def _read(self, file_path: str) -> Iterable[Instance]:
        """Yield one ``Instance`` per citation read from ``file_path``."""
        ds_reader = DataReaderJsonLines(file_path)
        for citation in ds_reader.read():
            yield self.text_to_instance(citation_text=citation.text, intent=citation.intent)

    @overrides
    def text_to_instance(self, citation_text: str,
                         intent: List[str] = None) -> Instance:
        """Build an ``Instance`` from a citation text and its intent label(s).

        Args:
            citation_text: raw citation text; it is tokenized here.
            intent: class name(s) that must be keys of ``const.CLASS_LABELS``.
                A single string is treated as a one-element list. When
                ``None``, no ``label`` field is added.

        Returns:
            An ``Instance`` with a ``tokens`` field and, if ``intent`` was
            given, a ``label`` field.

        Raises:
            KeyError: if an intent name is not in ``const.CLASS_LABELS``.
        """
        citation_tokens = self.tokenizer.tokenize(citation_text)
        fields = {'tokens': TextField(citation_tokens, self.token_indexers)}
        if intent is not None:
            # A bare string would otherwise be iterated character by character
            # and fail the CLASS_LABELS lookup.
            intent_names = [intent] if isinstance(intent, str) else intent
            fields['label'] = MultiLabelField(
                [const.CLASS_LABELS[name] for name in intent_names],
                skip_indexing=True,
                num_labels=len(const.CLASS_LABELS))
        return Instance(fields)
class DataReaderJsonLines:
    """Lazily reads citation records from a JSON-lines file."""

    def __init__(self, file_path):
        """Remember the path; the file is only opened when ``read`` is iterated."""
        self.file_path = file_path

    def read(self):
        """Yield the result of ``read_json_line`` for every record in the file."""
        # The context manager closes the file once the generator is exhausted
        # or closed; previously the handle was never released.
        with jsonlines.open(self.file_path) as reader:
            for line in reader:
                yield read_json_line(line)
def read_json_line(line):
    """Build a ``Citation`` from one decoded JSON-lines record.

    Every key listed below is looked up with ``line[key]``, in this order,
    and passed to ``Citation`` under the paired keyword argument.
    """
    # (Citation keyword argument, key in the JSON record)
    field_to_key = (
        ('text', 'string'),
        ('citing_paper_id', 'citingPaperId'),
        ('cited_paper_id', 'citedPaperId'),
        ('section_title', 'sectionName'),
        ('intent', 'label'),
        ('citation_id', 'id'),
    )
    kwargs = {}
    for field, key in field_to_key:
        kwargs[field] = line[key]
    return Citation(**kwargs)
Loading…
Cancel
Save