parent 05ccf02bb2
commit 9f9a271bc0
@@ -0,0 +1,14 @@
from allennlp.modules.elmo import Elmo, batch_to_ids

weights_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'
options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json'

# Request a single output representation, with no dropout.
elmo = Elmo(options_file, weights_file, 1, dropout=0)

text = ['Backgammon', 'is', 'one', 'of', 'the', 'oldest', 'known', 'board', 'games']

# batch_to_ids expects a batch of tokenized sentences (a list of lists of tokens),
# so the single sentence is wrapped in a list.
batch = batch_to_ids([text])
print(batch)

# Calling the module directly invokes forward(); avoid shadowing the built-in name `dict`.
output = elmo(batch)
print(output['elmo_representations'])
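For reference, a minimal sketch of inspecting that output; it assumes the snippet above has run and relies on Elmo returning a dict with 'elmo_representations' (one tensor per requested representation) and 'mask':

# Each representation has shape (batch_size, timesteps, 1024) for this model;
# the mask marks which positions hold real tokens.
embeddings = output['elmo_representations'][0]
mask = output['mask']
print(embeddings.shape, mask.shape)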
@@ -0,0 +1,17 @@
class Citation(object):
    """ Class representing a citation object """

    def __init__(self,
                 text,
                 citing_paper_id,
                 cited_paper_id,
                 section_title=None,
                 intent=None,
                 citation_id=None
                 ):
        self.text = text
        self.citing_paper_id = citing_paper_id
        self.cited_paper_id = cited_paper_id
        self.section_title = section_title
        self.intent = intent
        self.citation_id = citation_id
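A construction example for illustration only (the field values below are hypothetical, not part of the patch):

citation = Citation(text='... as shown by Smith et al. (2010) ...',
                    citing_paper_id='paper-1',
                    cited_paper_id='paper-2',
                    section_title='Related Work',
                    intent=['background'],
                    citation_id='c-42')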
@@ -0,0 +1,57 @@
from typing import Iterable, List

from overrides import overrides
import jsonlines

from utils.data import Citation

from allennlp.data.fields import TextField, LabelField, MultiLabelField
from allennlp.data import Instance
from allennlp.data.tokenizers import WordTokenizer
from allennlp.data.dataset_readers import DatasetReader
from allennlp.data.token_indexers import SingleIdTokenIndexer, ELMoTokenCharactersIndexer

import utils.constants as const


class CitationDataSetReader(DatasetReader):
    def __init__(self):
        super().__init__()
        # Tokenizer is an abstract base class and cannot be instantiated directly;
        # use the concrete WordTokenizer instead.
        self.tokenizer = WordTokenizer()

    @overrides
    def _read(self, file_path: str) -> Iterable[Instance]:
        ds_reader = DataReaderJsonLines(file_path)
        for citation in ds_reader.read():
            yield self.text_to_instance(citation_text=citation.text, intent=citation.intent)

    @overrides
    def text_to_instance(self, citation_text: str,
                         intent: List[str]) -> Instance:
        citation_tokens = self.tokenizer.tokenize(citation_text)
        token_indexers = {"elmo": ELMoTokenCharactersIndexer(),
                          "tokens": SingleIdTokenIndexer()}

        # const.CLASS_LABELS is expected to map each intent label to an integer index,
        # which is what skip_indexing=True requires.
        fields = {'tokens': TextField(citation_tokens, token_indexers),
                  'label': MultiLabelField([const.CLASS_LABELS[i] for i in intent], skip_indexing=True,
                                           num_labels=len(const.CLASS_LABELS))}

        return Instance(fields)


class DataReaderJsonLines:
    def __init__(self, file_path):
        self.file_path = file_path

    def read(self):
        for line in jsonlines.open(self.file_path):
            yield read_json_line(line)


def read_json_line(line):
    citation = Citation(
        text=line['string'],
        citing_paper_id=line['citingPaperId'],
        cited_paper_id=line['citedPaperId'],
        section_title=line['sectionName'],
        intent=line['label'],
        citation_id=line['id'])

    return citation
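A brief usage sketch of the reader. The path is hypothetical, and the input is assumed to be a JSON-lines file whose records carry the keys read in read_json_line above ('string', 'citingPaperId', 'citedPaperId', 'sectionName', 'label', 'id'):

reader = CitationDataSetReader()
for instance in reader.read('data/scicite/train.jsonl'):  # hypothetical path
    print(instance.fields['tokens'], instance.fields['label'])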