diff --git a/feature_extraction/elmo.py b/feature_extraction/elmo.py
new file mode 100644
index 0000000..d2a137f
--- /dev/null
+++ b/feature_extraction/elmo.py
@@ -0,0 +1,21 @@
+"""Smoke test: print ELMo representations for a single tokenized sentence."""
+
+from allennlp.modules.elmo import Elmo, batch_to_ids
+
+weights_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'
+options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json'
+
+# One output representation; dropout=0 turns dropout off.
+elmo = Elmo(options_file, weights_file, 1, dropout=0)
+text = ['Backgammon', 'is', 'one', 'of', 'the', 'oldest', 'known', 'board', 'games']
+
+# batch_to_ids expects a batch (a list of tokenized sentences), so the single
+# sentence is wrapped in a list; a bare token list would be treated as nine
+# separate sentences.
+batch = batch_to_ids([text])
+print(batch)
+
+# Call the module itself rather than .forward() so torch.nn.Module hooks run.
+output = elmo(batch)
+
+print(output['elmo_representations'])
diff --git a/utils/constants.py b/utils/constants.py
index 7476fb6..5389eae 100644
--- a/utils/constants.py
+++ b/utils/constants.py
@@ -32,3 +32,6 @@ REGEX_CONSTANTS = {
     'ENDS_WITH_ETHYL': re.compile(r"ethyl\b")
 }
 
+
+# Integer class id for each citation intent label.
+CLASS_LABELS = {"background": 0, "method": 1, "result": 2}
diff --git a/utils/data.py b/utils/data.py
new file mode 100644
index 0000000..893a3fd
--- /dev/null
+++ b/utils/data.py
@@ -0,0 +1,30 @@
+class Citation(object):
+    """A single citation record.
+
+    Attributes:
+        text: The citation text.
+        citing_paper_id: Identifier of the paper that makes the citation.
+        cited_paper_id: Identifier of the paper being cited.
+        section_title: Title of the section the citation appears in, if known.
+        intent: Intent label of the citation, if known.
+        citation_id: Identifier of this citation, if known.
+    """
+
+    def __init__(self,
+                 text,
+                 citing_paper_id,
+                 cited_paper_id,
+                 section_title=None,
+                 intent=None,
+                 citation_id=None
+                 ):
+        self.text = text
+        self.citing_paper_id = citing_paper_id
+        self.cited_paper_id = cited_paper_id
+        self.section_title = section_title
+        self.intent = intent
+        self.citation_id = citation_id
+
+    def __repr__(self):
+        return "Citation(citation_id=%r, intent=%r, text=%r)" % (
+            self.citation_id, self.intent, self.text)
diff --git a/utils/reader.py b/utils/reader.py
new file mode 100644
index 0000000..b19d6e5
--- /dev/null
+++ b/utils/reader.py
@@ -0,0 +1,92 @@
+from typing import Iterable, List, Optional, Union
+
+import jsonlines
+from allennlp.data import Instance, Tokenizer
+from allennlp.data.dataset_readers import DatasetReader
+from allennlp.data.fields import TextField, LabelField, MultiLabelField
+from allennlp.data.token_indexers import SingleIdTokenIndexer, ELMoTokenCharactersIndexer
+from allennlp.data.tokenizers import WordTokenizer
+from overrides import overrides
+
+import utils.constants as const
+from utils.data import Citation
+
+
+class CitationDataSetReader(DatasetReader):
+    """Dataset reader that turns JSON-lines citation records into instances.
+
+    Each instance has a ``tokens`` text field (indexed for both ELMo
+    characters and single token ids) and, when an intent is available, a
+    multi-label ``label`` field over ``const.CLASS_LABELS``.
+    """
+
+    def __init__(self, tokenizer: Optional[Tokenizer] = None):
+        """Set up the tokenizer and token indexers.
+
+        Args:
+            tokenizer: Tokenizer applied to the citation text. Defaults to
+                ``WordTokenizer`` because the ``Tokenizer`` base class does
+                not implement ``tokenize``.
+        """
+        super().__init__()
+        self.tokenizer = tokenizer if tokenizer is not None else WordTokenizer()
+        # The same indexers are used for every instance, so build them once.
+        self.token_indexers = {"elmo": ELMoTokenCharactersIndexer(),
+                               "tokens": SingleIdTokenIndexer()}
+
+    @overrides
+    def _read(self, file_path: str) -> Iterable[Instance]:
+        """Yield one instance per citation record in ``file_path``."""
+        for citation in DataReaderJsonLines(file_path).read():
+            yield self.text_to_instance(citation_text=citation.text, intent=citation.intent)
+
+    @overrides
+    def text_to_instance(self, citation_text: str,
+                         intent: Union[str, List[str], None] = None) -> Instance:
+        """Build an instance from raw citation text and its intent label(s).
+
+        Args:
+            citation_text: Raw text of the citation.
+            intent: One label name or a list of label names, each a key of
+                ``const.CLASS_LABELS``. ``None`` yields an unlabeled instance.
+        """
+        fields = {'tokens': TextField(self.tokenizer.tokenize(citation_text),
+                                      self.token_indexers)}
+
+        if intent is not None:
+            # A bare string would otherwise be iterated character by character.
+            labels = [intent] if isinstance(intent, str) else intent
+            fields['label'] = MultiLabelField([const.CLASS_LABELS[label] for label in labels],
+                                              skip_indexing=True,
+                                              num_labels=len(const.CLASS_LABELS))
+
+        return Instance(fields)
+
+
+class DataReaderJsonLines:
+    """Reads ``Citation`` objects from a JSON-lines file."""
+
+    def __init__(self, file_path):
+        self.file_path = file_path
+
+    def read(self):
+        """Yield one ``Citation`` per line, closing the file when done."""
+        with jsonlines.open(self.file_path) as reader:
+            for line in reader:
+                yield read_json_line(line)
+
+
+def read_json_line(line):
+    """Convert one decoded JSON record into a ``Citation``.
+
+    ``string``, ``citingPaperId`` and ``citedPaperId`` are required; the
+    remaining keys are optional on ``Citation`` and default to ``None`` when
+    absent from the record.
+    """
+    return Citation(
+        text=line['string'],
+        citing_paper_id=line['citingPaperId'],
+        cited_paper_id=line['citedPaperId'],
+        section_title=line.get('sectionName'),
+        intent=line.get('label'),
+        citation_id=line.get('id'))