parent 05ccf02bb2
commit 9f9a271bc0
@@ -0,0 +1,14 @@
from allennlp.modules.elmo import Elmo, batch_to_ids

weights_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'
options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json'

# Request a single output representation, with no dropout.
elmo = Elmo(options_file, weights_file, 1, dropout=0)

text = ['Backgammon', 'is', 'one', 'of', 'the', 'oldest', 'known', 'board', 'games']

# batch_to_ids expects a batch of tokenized sentences (a list of lists of tokens),
# so the single sentence is wrapped in a list.
batch = batch_to_ids([text])
print(batch)

# Calling the module directly invokes forward(); avoid shadowing the built-in name `dict`.
output = elmo(batch)
print(output['elmo_representations'])
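For reference, a minimal sketch of inspecting that output; it assumes the snippet above has run and relies on Elmo returning a dict with 'elmo_representations' (one tensor per requested representation) and 'mask':

# Each representation has shape (batch_size, timesteps, 1024) for this model;
# the mask marks which positions hold real tokens.
embeddings = output['elmo_representations'][0]
mask = output['mask']
print(embeddings.shape, mask.shape)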
@@ -0,0 +1,17 @@
class Citation(object):
    """ Class representing a citation object """

    def __init__(self,
                 text,
                 citing_paper_id,
                 cited_paper_id,
                 section_title=None,
                 intent=None,
                 citation_id=None
                 ):
        self.text = text
        self.citing_paper_id = citing_paper_id
        self.cited_paper_id = cited_paper_id
        self.section_title = section_title
        self.intent = intent
        self.citation_id = citation_id
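A construction example for illustration only (the field values below are hypothetical, not part of the patch):

citation = Citation(text='... as shown by Smith et al. (2010) ...',
                    citing_paper_id='paper-1',
                    cited_paper_id='paper-2',
                    section_title='Related Work',
                    intent=['background'],
                    citation_id='c-42')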
@@ -0,0 +1,57 @@
from typing import Iterable, List

from overrides import overrides
import jsonlines

from utils.data import Citation

from allennlp.data.fields import TextField, LabelField, MultiLabelField
from allennlp.data import Instance
from allennlp.data.tokenizers import WordTokenizer
from allennlp.data.dataset_readers import DatasetReader
from allennlp.data.token_indexers import SingleIdTokenIndexer, ELMoTokenCharactersIndexer

import utils.constants as const


class CitationDataSetReader(DatasetReader):
    def __init__(self):
        super().__init__()
        # Tokenizer is an abstract base class and cannot be instantiated directly;
        # use the concrete WordTokenizer instead.
        self.tokenizer = WordTokenizer()

    @overrides
    def _read(self, file_path: str) -> Iterable[Instance]:
        ds_reader = DataReaderJsonLines(file_path)
        for citation in ds_reader.read():
            yield self.text_to_instance(citation_text=citation.text, intent=citation.intent)

    @overrides
    def text_to_instance(self, citation_text: str,
                         intent: List[str]) -> Instance:
        citation_tokens = self.tokenizer.tokenize(citation_text)
        token_indexers = {"elmo": ELMoTokenCharactersIndexer(),
                          "tokens": SingleIdTokenIndexer()}

        # const.CLASS_LABELS is expected to map each intent label to an integer index,
        # which is what skip_indexing=True requires.
        fields = {'tokens': TextField(citation_tokens, token_indexers),
                  'label': MultiLabelField([const.CLASS_LABELS[i] for i in intent], skip_indexing=True,
                                           num_labels=len(const.CLASS_LABELS))}

        return Instance(fields)


class DataReaderJsonLines:
    def __init__(self, file_path):
        self.file_path = file_path

    def read(self):
        for line in jsonlines.open(self.file_path):
            yield read_json_line(line)


def read_json_line(line):
    citation = Citation(
        text=line['string'],
        citing_paper_id=line['citingPaperId'],
        cited_paper_id=line['citedPaperId'],
        section_title=line['sectionName'],
        intent=line['label'],
        citation_id=line['id'])

    return citation
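A brief usage sketch of the reader. The path is hypothetical, and the input is assumed to be a JSON-lines file whose records carry the keys read in read_json_line above ('string', 'citingPaperId', 'citedPaperId', 'sectionName', 'label', 'id'):

reader = CitationDataSetReader()
for instance in reader.read('data/scicite/train.jsonl'):  # hypothetical path
    print(instance.fields['tokens'], instance.fields['label'])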