parent
05ccf02bb2
commit
9f9a271bc0
@ -0,0 +1,14 @@
|
|||||||
|
"""Smoke test: embed one tokenized sentence with a pretrained ELMo model and print the result."""
from allennlp.modules.elmo import Elmo, batch_to_ids

weights_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'
options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json'

# Third positional argument is the number of output representations; dropout is disabled.
elmo = Elmo(options_file, weights_file, 1, dropout=0)

text = ['Backgammon', 'is', 'one', 'of', 'the', 'oldest', 'known', 'board', 'games']

# batch_to_ids expects a *batch* of tokenized sentences (a list of token lists).
# Passing the flat token list would treat every word as its own sentence and
# every character as a token, so wrap the single sentence in a list.
batch = batch_to_ids([text])
print(batch)

# Call the module rather than .forward() so registered hooks run, and avoid
# naming the result `dict`, which would shadow the builtin.
output = elmo(batch)
print(output['elmo_representations'])
|
||||||
@ -0,0 +1,17 @@
|
|||||||
|
class Citation:
    """Plain value holder for one citation.

    Every constructor argument is stored unchanged on the instance under an
    attribute of the same name; no validation or conversion is performed.
    """

    def __init__(self, text, citing_paper_id, cited_paper_id,
                 section_title=None, intent=None, citation_id=None):
        # Required: the citation text and the two paper identifiers.
        self.text, self.citing_paper_id, self.cited_paper_id = (
            text, citing_paper_id, cited_paper_id)
        # Optional metadata; each defaults to None when not supplied.
        self.section_title, self.intent, self.citation_id = (
            section_title, intent, citation_id)
|
||||||
@ -0,0 +1,57 @@
|
|||||||
|
from typing import Iterable, List
|
||||||
|
from overrides import overrides
|
||||||
|
import jsonlines
|
||||||
|
from utils.data import Citation
|
||||||
|
|
||||||
|
from allennlp.data.fields import TextField, LabelField, MultiLabelField
|
||||||
|
from allennlp.data import Instance, Tokenizer
|
||||||
|
from allennlp.data.dataset_readers import DatasetReader
|
||||||
|
from allennlp.data.token_indexers import SingleIdTokenIndexer, ELMoTokenCharactersIndexer
|
||||||
|
|
||||||
|
import utils.constants as const
|
||||||
|
|
||||||
|
|
||||||
|
class CitationDataSetReader(DatasetReader):
    """Dataset reader that turns JSON-lines citation records into AllenNLP instances.

    Each instance has a ``tokens`` text field (indexed both for ELMo characters
    and for single token ids) and, when an intent is given, a multi-label
    ``label`` field built from ``const.CLASS_LABELS``.
    """

    def __init__(self, tokenizer: Tokenizer = None):
        """Create the reader.

        Args:
            tokenizer: Concrete tokenizer used to split citation text. When
                omitted, the library's registered default implementation is used.
        """
        super().__init__()
        # ``Tokenizer`` itself is only the abstract base class (its ``tokenize``
        # raises NotImplementedError), so a concrete implementation is required.
        if tokenizer is None:
            tokenizer = Tokenizer.by_name(Tokenizer.default_implementation)()
        self.tokenizer = tokenizer
        # Built once and shared by every instance instead of per call.
        self._token_indexers = {"elmo": ELMoTokenCharactersIndexer(),
                                "tokens": SingleIdTokenIndexer()}

    @overrides
    def _read(self, file_path: str) -> Iterable[Instance]:
        """Yield one instance per citation record found in ``file_path``."""
        for citation in DataReaderJsonLines(file_path).read():
            yield self.text_to_instance(citation_text=citation.text, intent=citation.intent)

    @overrides
    def text_to_instance(self, citation_text: str,
                         intent: List[str] = None) -> Instance:
        """Build an instance from raw citation text and its intent label(s).

        Args:
            citation_text: Text of the citation; tokenized with ``self.tokenizer``.
            intent: Keys into ``const.CLASS_LABELS``. A single string is treated
                as one label. When ``None`` (unlabeled input) no ``label`` field
                is added.

        Returns:
            An ``Instance`` with a ``tokens`` field and, if ``intent`` was given,
            a ``label`` field.
        """
        citation_tokens = self.tokenizer.tokenize(citation_text)
        fields = {'tokens': TextField(citation_tokens, self._token_indexers)}

        if intent is not None:
            # A bare string would otherwise be iterated character by character.
            # NOTE(review): assumes const.CLASS_LABELS is keyed by whole label
            # names - confirm against utils.constants.
            if isinstance(intent, str):
                intent = [intent]
            fields['label'] = MultiLabelField([const.CLASS_LABELS[i] for i in intent],
                                              skip_indexing=True,
                                              num_labels=len(const.CLASS_LABELS))

        return Instance(fields)
|
||||||
|
|
||||||
|
|
||||||
|
class DataReaderJsonLines:
    """Lazily reads citation records from a JSON-lines file."""

    def __init__(self, file_path):
        """Remember the path of the JSON-lines file; nothing is opened yet."""
        self.file_path = file_path

    def read(self):
        """Yield the result of ``read_json_line`` for every record in the file.

        The file is opened when iteration starts and is closed once the
        generator is exhausted or closed.
        """
        # Use the reader as a context manager so the underlying file handle is
        # released instead of being left open.
        with jsonlines.open(self.file_path) as reader:
            for line in reader:
                yield read_json_line(line)
|
||||||
|
|
||||||
|
|
||||||
|
def read_json_line(line):
    """Convert one decoded JSON record into a ``Citation``.

    ``line`` must support lookup of the keys ``string``, ``citingPaperId``,
    ``citedPaperId``, ``sectionName``, ``label`` and ``id``; a missing key
    raises ``KeyError``.
    """
    # (Citation keyword, record key) pairs, looked up in this order.
    field_map = (
        ('text', 'string'),
        ('citing_paper_id', 'citingPaperId'),
        ('cited_paper_id', 'citedPaperId'),
        ('section_title', 'sectionName'),
        ('intent', 'label'),
        ('citation_id', 'id'),
    )
    kwargs = {attr: line[key] for attr, key in field_map}
    return Citation(**kwargs)
|
||||||
Loading…
Reference in new issue