moved Isaac's commit to the current package

isaac
Pavan Mandava 6 years ago
parent e9b1f31c49
commit 5daea1a2a8

@@ -0,0 +1 @@
classifier

@@ -1,2 +0,0 @@
from .nn import *
from utils.reader import *

@@ -1,157 +0,0 @@
from typing import Dict
import numpy as np
import torch
from allennlp.common.checks import ConfigurationError
from allennlp.data import Vocabulary
from allennlp.models import Model
from allennlp.modules import TextFieldEmbedder, Seq2SeqEncoder, FeedForward, Elmo
from allennlp.nn import util
from allennlp.training.metrics import CategoricalAccuracy, F1Measure
from overrides import overrides
from torch.nn import Parameter
@Model.register("basic_bilstm_classifier")
class BiLstmClassifier(Model):
def __init__(self, vocab: Vocabulary,
text_field_embedder: TextFieldEmbedder,
encoder: Seq2SeqEncoder,
classifier_feedforward: FeedForward,
elmo: Elmo = None,
use_input_elmo: bool = False):
super().__init__(vocab)
self.elmo = elmo
self.use_elmo = use_input_elmo
self.text_field_embedder = text_field_embedder
self.num_classes = self.vocab.get_vocab_size("label")
self.encoder = encoder
self.classifier_feed_forward = classifier_feedforward
self.label_accuracy = CategoricalAccuracy()
self.label_f1_metrics = {}
for i in range(self.num_classes):
self.label_f1_metrics[vocab.get_token_from_index(index=i, namespace="label")] = \
F1Measure(positive_label=i)
self.loss = torch.nn.CrossEntropyLoss()
self.attention = Attention(encoder.get_output_dim())
@overrides
def forward(self, tokens: Dict[str, torch.LongTensor],
label: torch.LongTensor) -> Dict[str, torch.LongTensor]:
global input_elmo
elmo_tokens = tokens.pop("elmo", None)
embedded_text = self.text_field_embedder(tokens)
text_mask = util.get_text_field_mask(tokens)
if elmo_tokens is not None:
tokens["elmo"] = elmo_tokens
# Create ELMo embeddings if applicable
if self.elmo:
if elmo_tokens is not None:
elmo_representations = self.elmo(elmo_tokens["elmo_tokens"])["elmo_representations"]
if self.use_elmo:
input_elmo = elmo_representations.pop()
assert not elmo_representations
else:
raise ConfigurationError("Model was built to use Elmo, but input text is not tokenized for Elmo.")
if self.use_elmo:
if embedded_text is not None:
embedded_text = torch.cat([embedded_text, input_elmo], dim=-1)
else:
embedded_text = input_elmo
encoded_text = self.encoder(embedded_text, text_mask)
# Attention
attn_dist, encoded_text = self.attention(encoded_text, return_attn_distribution=True)
if label is not None:
logits = self.classifier_feed_forward(encoded_text)
class_probabilities = torch.nn.functional.softmax(logits, dim=1)
output_dict = {"logits": logits}
loss = self.loss(logits, label)
output_dict["loss"] = loss
# compute F1 per label
for i in range(self.num_classes):
metric = self.label_f1_metrics[self.vocab.get_token_from_index(index=i, namespace="label")]
metric(class_probabilities, label)
output_dict['label'] = label
output_dict['tokens'] = tokens['tokens']
return output_dict
@overrides
def make_output_human_readable(self, output_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
class_probabilities = torch.nn.functional.softmax(output_dict['logits'], dim=-1)
predictions = class_probabilities.cpu().data.numpy()
argmax_indices = np.argmax(predictions, axis=-1)
label = [self.vocab.get_token_from_index(x, namespace="label")
for x in argmax_indices]
output_dict['probabilities'] = class_probabilities
output_dict['positive_label'] = label
output_dict['prediction'] = label
citation_text = []
for batch_text in output_dict['tokens']:
citation_text.append([self.vocab.get_token_from_index(token_id.item()) for token_id in batch_text])
output_dict['tokens'] = citation_text
return output_dict
@overrides
def get_metrics(self, reset: bool = False) -> Dict[str, float]:
metric_dict = {}
sum_f1 = 0.0
for name, metric in self.label_f1_metrics.items():
metric_val = metric.get_metric(reset)
metric_dict[name + '_P'] = metric_val[0]
metric_dict[name + '_R'] = metric_val[1]
metric_dict[name + '_F1'] = metric_val[2]
if name != 'none': # do not consider `none` label in averaging F1
sum_f1 += metric_val[2]
names = list(self.label_f1_metrics.keys())
total_len = len(names) if 'none' not in names else len(names) - 1
average_f1 = sum_f1 / total_len
metric_dict['average_F1'] = average_f1
return metric_dict
def new_parameter(*size):
out = Parameter(torch.FloatTensor(*size))
torch.nn.init.xavier_normal_(out)
return out
class Attention(torch.nn.Module):
""" Simple multiplicative attention"""
def __init__(self, attention_size):
super(Attention, self).__init__()
self.attention = new_parameter(attention_size, 1)
def forward(self, x_in, reduction_dim=-2, return_attn_distribution=False):
# calculate attn weights
attn_score = torch.matmul(x_in, self.attention).squeeze()
# add one dimension at the end and get a distribution out of scores
attn_distrib = torch.nn.functional.softmax(attn_score.squeeze(), dim=-1).unsqueeze(-1)
scored_x = x_in * attn_distrib
weighted_sum = torch.sum(scored_x, dim=reduction_dim)
if return_attn_distribution:
return attn_distrib.reshape(x_in.shape[0], -1), weighted_sum
else:
return weighted_sum
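For reference, the Attention module above is just a learned vector dotted against each timestep and softmax-normalized over the sequence. A minimal sketch of how it behaves on a dummy encoder output, not part of the commit; the shapes are illustrative and assume the Attention class above is importable:

import torch

# illustrative shapes: batch of 2 sequences, 7 timesteps, 200-dim BiLSTM output
dummy_encoded = torch.randn(2, 7, 200)
attn = Attention(attention_size=200)

# returns the attention distribution plus the weighted sum over the time dimension
dist, pooled = attn(dummy_encoded, return_attn_distribution=True)
print(dist.shape)    # (2, 7)   one weight per timestep
print(pooled.shape)  # (2, 200) sentence vector fed to the feedforward classifier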

@@ -1,56 +0,0 @@
{
  "dataset_reader": {
    "type": "citation_dataset_reader"
  },
  "train_data_path": "data/jsonl/train.jsonl",
  "validation_data_path": "data/jsonl/test.jsonl",
  "test_data_path": "data/jsonl/test.jsonl",
  "model": {
    "type": "basic_bilstm_classifier",
    "text_field_embedder": {
      "token_embedders": {
        "tokens": {
          "pretrained_file": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/glove/glove.6B.100d.txt.gz",
          "type": "embedding",
          "embedding_dim": 100,
          "trainable": false
        }
      }
    },
    "encoder": {
      "type": "lstm",
      "input_size": 1124,
      "hidden_size": 100,
      "num_layers": 1,
      "bidirectional": true
    },
    "elmo": {
      "options_file": "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json",
      "weight_file": "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5",
      "do_layer_norm": true,
      "dropout": 0.5,
      "num_output_representations": 1
    },
    "use_input_elmo": true,
    "classifier_feedforward": {
      "input_dim": 200,
      "num_layers": 2,
      "hidden_dims": [20, 3],
      "activations": ["linear", "linear"]
    }
  },
  "data_loader": {
    "batch_sampler": {
      "type": "bucket",
      "batch_size": 16
    }
  },
  "trainer": {
    "optimizer": {
      "type": "adam",
      "lr": 0.001
    },
    "num_epochs": 2,
    "cuda_device": -1
  }
}
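Training from this config goes through the standard AllenNLP entry points. A minimal sketch, assuming AllenNLP 1.x+, that the project's package is named `classifier` (suggested by the one-line file added above), and that the config is saved as `basic_bilstm_classifier.json`; the package and file names are assumptions, not confirmed by the commit:

# equivalent to: allennlp train basic_bilstm_classifier.json -s output/ --include-package classifier
from allennlp.common.util import import_module_and_submodules
from allennlp.commands.train import train_model_from_file

# register the custom model and dataset reader before the config is parsed
import_module_and_submodules("classifier")  # assumed package name

train_model_from_file(
    parameter_filename="basic_bilstm_classifier.json",  # assumed config file name
    serialization_dir="output/",
)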

@@ -1,57 +0,0 @@
from typing import Iterable
import jsonlines
from allennlp.data import Instance
from allennlp.data.dataset_readers import DatasetReader
from allennlp.data.fields import TextField, LabelField
from allennlp.data.token_indexers import SingleIdTokenIndexer, ELMoTokenCharactersIndexer
from allennlp.data.tokenizers import SpacyTokenizer
from overrides import overrides
from utils.data import Citation
@DatasetReader.register("citation_dataset_reader") # type for config files
class CitationDataSetReader(DatasetReader):
def __init__(self):
super().__init__()
self.tokenizer = SpacyTokenizer()
@overrides
def _read(self, file_path: str) -> Iterable[Instance]:
ds_reader = DataReaderJsonLines(file_path)
for citation in ds_reader.read():
yield self.text_to_instance(citation_text=citation.text, intent=citation.intent)
@overrides
def text_to_instance(self, citation_text: str,
intent: str) -> Instance:
citation_tokens = self.tokenizer.tokenize(citation_text)
token_indexers = {"elmo": ELMoTokenCharactersIndexer(),
"tokens": SingleIdTokenIndexer()}
fields = {'tokens': TextField(citation_tokens, token_indexers),
'label': LabelField(intent)}
return Instance(fields)
class DataReaderJsonLines:
def __init__(self, file_path):
self.file_path = file_path
def read(self):
for line in jsonlines.open(self.file_path):
yield read_json_line(line)
def read_json_line(line):
citation = Citation(
text=line['string'],
citing_paper_id=line['citingPaperId'],
cited_paper_id=line['citedPaperId'],
section_title=line['sectionName'],
intent=line['label'],
citation_id=line['id'])
return citation
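The reader consumes JSONL records carrying exactly the keys that read_json_line() above accesses. A minimal sketch of turning one record into an Instance, assuming the project's utils.data.Citation and a spaCy English model are available; the field values and the label are invented for illustration:

# one record shaped like what read_json_line() expects (values are made up)
example_line = {
    "string": "We follow the evaluation protocol described by Smith et al. (2015).",
    "citingPaperId": "paper-1",
    "citedPaperId": "paper-2",
    "sectionName": "Experiments",
    "label": "method",  # illustrative label value
    "id": "paper-1>paper-2",
}

citation = read_json_line(example_line)

reader = CitationDataSetReader()
instance = reader.text_to_instance(citation_text=citation.text, intent=citation.intent)
print(instance)  # Instance with a 'tokens' TextField and a 'label' LabelField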

@@ -0,0 +1,2 @@
from .nn import *
from utils.reader import *

@@ -7,24 +7,50 @@
"test_data_path": "data/jsonl/test.jsonl",
"model": {
"type": "basic_bilstm_classifier",
"elmo_text_field_embedder": {
"tokens": {
"type": "embedding",
"pretrained_file": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/glove/glove.6B.100d.txt.gz",
"embedding_dim": 100,
"trainable": false
},
"elmo": {
"type": "elmo_token_embedder",
"text_field_embedder": {
"token_embedders": {
"tokens": {
"pretrained_file": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/glove/glove.6B.100d.txt.gz",
"type": "embedding",
"embedding_dim": 100,
"trainable": false
}
}
},
"encoder": {
"type": "lstm",
"input_size": 1124,
"hidden_size": 100,
"num_layers": 1,
"bidirectional": true
},
"elmo": {
"options_file": "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json",
"weight_file": "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5",
"do_layer_norm": true,
"dropout": 0.5
}
"dropout": 0.5,
"num_output_representations": 1
},
"use_input_elmo": true,
"classifier_feedforward": {
"input_dim": 200,
"num_layers": 2,
"hidden_dims": [20, 3],
"activations": ["linear", "linear"]
}
},
"data_loader": {
"batch_sampler": {
"type": "bucket",
"batch_size" : 16
}
},
"trainer": {
"optimizer": "adam",
"num_epochs": 10
"optimizer": {
"type": "adam",
"lr": 0.001
},
"num_epochs": 2,
"cuda_device": -1
}
}

@@ -1,21 +1,21 @@
from typing import Iterable, List
from overrides import overrides
import jsonlines
from utils.data import Citation
from typing import Iterable
from allennlp.data.fields import TextField, LabelField, MultiLabelField
from allennlp.data import Instance, Tokenizer
import jsonlines
from allennlp.data import Instance
from allennlp.data.dataset_readers import DatasetReader
from allennlp.data.fields import TextField, LabelField
from allennlp.data.token_indexers import SingleIdTokenIndexer, ELMoTokenCharactersIndexer
from allennlp.data.tokenizers import SpacyTokenizer
from overrides import overrides
import utils.constants as const
from utils.data import Citation
@DatasetReader.register("citation_dataset_reader") # type for config files
class CitationDataSetReader(DatasetReader):
def __init__(self):
super().__init__()
self.tokenizer = Tokenizer()
self.tokenizer = SpacyTokenizer()
@overrides
def _read(self, file_path: str) -> Iterable[Instance]:
@@ -25,14 +25,13 @@ class CitationDataSetReader(DatasetReader):
@overrides
def text_to_instance(self, citation_text: str,
intent: List[str]) -> Instance:
intent: str) -> Instance:
citation_tokens = self.tokenizer.tokenize(citation_text)
token_indexers = {"elmo": ELMoTokenCharactersIndexer(),
"tokens": SingleIdTokenIndexer()}
fields = {'tokens': TextField(citation_tokens, token_indexers),
'label': MultiLabelField([const.CLASS_LABELS[i] for i in intent], skip_indexing=True,
num_labels=len(const.CLASS_LABELS))}
'label': LabelField(intent)}
return Instance(fields)
