From 18b7847bcfbc2a9d0c254385401eaccadffe9c0a Mon Sep 17 00:00:00 2001
From: Isaac Riley
Date: Mon, 6 Jul 2020 00:50:44 +0200
Subject: [PATCH] ffnn and folder for sota model

---
 allennlp_project/classifier/__init__.py   |   2 +
 allennlp_project/classifier/nn.py         | 157 ++++++++++++++++++++++
 allennlp_project/configs/basic_model.json |  56 ++++++++
 allennlp_project/utils/reader.py          |  57 ++++++++
 classifier/nn_ff.py                       |  88 ++++++++++++
 utils/nn_reader.py                        |  43 ++++++
 6 files changed, 403 insertions(+)
 create mode 100644 allennlp_project/classifier/__init__.py
 create mode 100644 allennlp_project/classifier/nn.py
 create mode 100644 allennlp_project/configs/basic_model.json
 create mode 100644 allennlp_project/utils/reader.py
 create mode 100644 classifier/nn_ff.py
 create mode 100644 utils/nn_reader.py

diff --git a/allennlp_project/classifier/__init__.py b/allennlp_project/classifier/__init__.py
new file mode 100644
index 0000000..81d9d69
--- /dev/null
+++ b/allennlp_project/classifier/__init__.py
@@ -0,0 +1,2 @@
+from .nn import *
+from ..utils.reader import *
diff --git a/allennlp_project/classifier/nn.py b/allennlp_project/classifier/nn.py
new file mode 100644
index 0000000..5f1addf
--- /dev/null
+++ b/allennlp_project/classifier/nn.py
@@ -0,0 +1,157 @@
+from typing import Dict
+
+import numpy as np
+import torch
+from allennlp.common.checks import ConfigurationError
+from allennlp.data import Vocabulary
+from allennlp.models import Model
+from allennlp.modules import TextFieldEmbedder, Seq2SeqEncoder, FeedForward, Elmo
+from allennlp.nn import util
+from allennlp.training.metrics import CategoricalAccuracy, F1Measure
+from overrides import overrides
+from torch.nn import Parameter
+
+
+@Model.register("basic_bilstm_classifier")
+class BiLstmClassifier(Model):
+
+    def __init__(self, vocab: Vocabulary,
+                 text_field_embedder: TextFieldEmbedder,
+                 encoder: Seq2SeqEncoder,
+                 classifier_feedforward: FeedForward,
+                 elmo: Elmo = None,
+                 use_input_elmo: bool = False):
+        super().__init__(vocab)
+        self.elmo = elmo
+        self.use_elmo = use_input_elmo
+        self.text_field_embedder = text_field_embedder
+        # LabelField puts labels in the "labels" namespace by default.
+        self.num_classes = self.vocab.get_vocab_size("labels")
+        self.encoder = encoder
+        self.classifier_feed_forward = classifier_feedforward
+        self.label_accuracy = CategoricalAccuracy()
+
+        self.label_f1_metrics = {}
+        for i in range(self.num_classes):
+            self.label_f1_metrics[vocab.get_token_from_index(index=i, namespace="labels")] = \
+                F1Measure(positive_label=i)
+
+        self.loss = torch.nn.CrossEntropyLoss()
+
+        self.attention = Attention(encoder.get_output_dim())
+
+    @overrides
+    def forward(self, tokens: Dict[str, torch.LongTensor],
+                label: torch.LongTensor = None) -> Dict[str, torch.Tensor]:
+
+        input_elmo = None
+        # Embed the single-id tokens without the ELMo entry, then restore it.
+        elmo_tokens = tokens.pop("elmo", None)
+        embedded_text = self.text_field_embedder(tokens)
+        text_mask = util.get_text_field_mask(tokens)
+        if elmo_tokens is not None:
+            tokens["elmo"] = elmo_tokens
+
+        # Create ELMo embeddings if applicable
+        if self.elmo:
+            if elmo_tokens is not None:
+                elmo_representations = self.elmo(elmo_tokens["elmo_tokens"])["elmo_representations"]
+                if self.use_elmo:
+                    input_elmo = elmo_representations.pop()
+                    assert not elmo_representations
+            else:
+                raise ConfigurationError("Model was built to use ELMo, but input text is not tokenized for ELMo.")
+
+        if self.use_elmo:
+            if embedded_text is not None:
+                embedded_text = torch.cat([embedded_text, input_elmo], dim=-1)
+            else:
+                embedded_text = input_elmo
+
+        encoded_text = self.encoder(embedded_text, text_mask)
+
+        # Attention: collapse the sequence dimension to a weighted sum.
+        attn_dist, encoded_text = self.attention(encoded_text, return_attn_distribution=True)
+
+        # Logits are needed for prediction whether or not a label is given.
+        logits = self.classifier_feed_forward(encoded_text)
+        class_probabilities = torch.nn.functional.softmax(logits, dim=-1)
+        output_dict = {"logits": logits}
+
+        if label is not None:
+            output_dict["loss"] = self.loss(logits, label)
+            self.label_accuracy(logits, label)
+            # compute F1 per label
+            for i in range(self.num_classes):
+                metric = self.label_f1_metrics[self.vocab.get_token_from_index(index=i, namespace="labels")]
+                metric(class_probabilities, label)
+            output_dict["label"] = label
+
+        # With a SingleIdTokenIndexer the token ids sit under tokens["tokens"]["tokens"].
+        output_dict["tokens"] = tokens["tokens"]["tokens"]
+
+        return output_dict
+
+    @overrides
+    def make_output_human_readable(self, output_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
+        class_probabilities = torch.nn.functional.softmax(output_dict["logits"], dim=-1)
+        predictions = class_probabilities.cpu().data.numpy()
+        argmax_indices = np.argmax(predictions, axis=-1)
+        labels = [self.vocab.get_token_from_index(x, namespace="labels")
+                  for x in argmax_indices]
+        output_dict["probabilities"] = class_probabilities
+        output_dict["prediction"] = labels
+        citation_text = []
+        for batch_text in output_dict["tokens"]:
+            citation_text.append([self.vocab.get_token_from_index(token_id.item()) for token_id in batch_text])
+        output_dict["tokens"] = citation_text
+
+        return output_dict
+
+    @overrides
+    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
+        metric_dict = {}
+
+        sum_f1 = 0.0
+        for name, metric in self.label_f1_metrics.items():
+            metric_val = metric.get_metric(reset)
+            metric_dict[name + '_P'] = metric_val[0]
+            metric_dict[name + '_R'] = metric_val[1]
+            metric_dict[name + '_F1'] = metric_val[2]
+            if name != 'none':  # do not consider the `none` label when averaging F1
+                sum_f1 += metric_val[2]
+
+        names = list(self.label_f1_metrics.keys())
+        total_len = len(names) if 'none' not in names else len(names) - 1
+        metric_dict['average_F1'] = sum_f1 / total_len
+        metric_dict['accuracy'] = self.label_accuracy.get_metric(reset)
+
+        return metric_dict
+
+
+def new_parameter(*size):
+    out = Parameter(torch.FloatTensor(*size))
+    torch.nn.init.xavier_normal_(out)
+    return out
+
+
+class Attention(torch.nn.Module):
+    """Simple multiplicative attention."""
+
+    def __init__(self, attention_size):
+        super(Attention, self).__init__()
+        self.attention = new_parameter(attention_size, 1)
+
+    def forward(self, x_in, reduction_dim=-2, return_attn_distribution=False):
+        # attention scores: (batch, seq_len, dim) @ (dim, 1) -> (batch, seq_len)
+        attn_score = torch.matmul(x_in, self.attention).squeeze(-1)
+        # softmax over the sequence, then add a trailing dim for broadcasting
+        attn_distrib = torch.nn.functional.softmax(attn_score, dim=-1).unsqueeze(-1)
+        scored_x = x_in * attn_distrib
+        weighted_sum = torch.sum(scored_x, dim=reduction_dim)
+        if return_attn_distribution:
+            return attn_distrib.reshape(x_in.shape[0], -1), weighted_sum
+        return weighted_sum
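
A quick shape check for the Attention module defined above -- a minimal sketch, not part of the patch; batch size, sequence length, and the 200-dim encoder output are illustrative (200 matches the bi-LSTM in the config below):

    import torch
    attn = Attention(attention_size=200)
    x = torch.randn(4, 17, 200)            # (batch, seq_len, encoder_dim)
    dist, pooled = attn(x, return_attn_distribution=True)
    assert dist.shape == (4, 17)           # one weight per time step
    assert pooled.shape == (4, 200)        # sequence collapsed to a single vector
    assert torch.allclose(dist.sum(-1), torch.ones(4), atol=1e-5)
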
diff --git a/allennlp_project/configs/basic_model.json b/allennlp_project/configs/basic_model.json
new file mode 100644
index 0000000..4887b66
--- /dev/null
+++ b/allennlp_project/configs/basic_model.json
@@ -0,0 +1,56 @@
+{
+  "dataset_reader": {
+    "type": "citation_dataset_reader"
+  },
+  "train_data_path": "data/jsonl/train.jsonl",
+  "validation_data_path": "data/jsonl/test.jsonl",
+  "test_data_path": "data/jsonl/test.jsonl",
+  "model": {
+    "type": "basic_bilstm_classifier",
+    "text_field_embedder": {
+      "token_embedders": {
+        "tokens": {
+          "pretrained_file": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/glove/glove.6B.100d.txt.gz",
+          "type": "embedding",
+          "embedding_dim": 100,
+          "trainable": false
+        }
+      }
+    },
+    "encoder": {
+      "type": "lstm",
+      "input_size": 1124,
+      "hidden_size": 100,
+      "num_layers": 1,
+      "bidirectional": true
+    },
+    "elmo": {
+      "options_file": "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json",
+      "weight_file": "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5",
+      "do_layer_norm": true,
+      "dropout": 0.5,
+      "num_output_representations": 1
+    },
+    "use_input_elmo": true,
+    "classifier_feedforward": {
+      "input_dim": 200,
+      "num_layers": 2,
+      "hidden_dims": [20, 3],
+      "activations": ["relu", "linear"]
+    }
+  },
+  "data_loader": {
+    "batch_sampler": {
+      "type": "bucket",
+      "batch_size": 16
+    }
+  },
+  "trainer": {
+    "optimizer": {
+      "type": "adam",
+      "lr": 0.001
+    },
+    "num_epochs": 2,
+    "cuda_device": -1
+  }
+}
\ No newline at end of file
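
With the model and reader registered under the names used in this config, training can be launched programmatically; a minimal sketch, assuming the repository root is on PYTHONPATH and with a placeholder serialization directory (equivalent to `allennlp train ... --include-package allennlp_project` on the CLI):

    import allennlp_project.classifier  # noqa: F401 -- importing triggers the @register decorators
    from allennlp.commands.train import train_model_from_file

    train_model_from_file("allennlp_project/configs/basic_model.json",
                          "output/basic_model")  # serialization dir is a placeholder
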
"embedding", + "embedding_dim": 100, + "trainable": false + } + } + }, + "encoder": { + "type": "lstm", + "input_size": 1124, + "hidden_size": 100, + "num_layers": 1, + "bidirectional": true + }, + "elmo": { + "options_file": "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json", + "weight_file": "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5", + "do_layer_norm": true, + "dropout": 0.5, + "num_output_representations": 1 + }, + "use_input_elmo": true, + "classifier_feedforward": { + "input_dim": 200, + "num_layers": 2, + "hidden_dims": [20, 3], + "activations": ["linear", "linear"] + } + }, + "data_loader": { + "batch_sampler": { + "type": "bucket", + "batch_size" : 16 + } + }, + "trainer": { + "optimizer": { + "type": "adam", + "lr": 0.001 + }, + "num_epochs": 2, + "cuda_device": -1 + } +} \ No newline at end of file diff --git a/allennlp_project/utils/reader.py b/allennlp_project/utils/reader.py new file mode 100644 index 0000000..38199b6 --- /dev/null +++ b/allennlp_project/utils/reader.py @@ -0,0 +1,57 @@ +from typing import Iterable + +import jsonlines +from allennlp.data import Instance +from allennlp.data.dataset_readers import DatasetReader +from allennlp.data.fields import TextField, LabelField +from allennlp.data.token_indexers import SingleIdTokenIndexer, ELMoTokenCharactersIndexer +from allennlp.data.tokenizers import SpacyTokenizer +from overrides import overrides + +from utils.data import Citation + + +@DatasetReader.register("citation_dataset_reader") # type for config files +class CitationDataSetReader(DatasetReader): + def __init__(self): + super().__init__() + self.tokenizer = SpacyTokenizer() + + @overrides + def _read(self, file_path: str) -> Iterable[Instance]: + ds_reader = DataReaderJsonLines(file_path) + for citation in ds_reader.read(): + yield self.text_to_instance(citation_text=citation.text, intent=citation.intent) + + @overrides + def text_to_instance(self, citation_text: str, + intent: str) -> Instance: + citation_tokens = self.tokenizer.tokenize(citation_text) + token_indexers = {"elmo": ELMoTokenCharactersIndexer(), + "tokens": SingleIdTokenIndexer()} + + fields = {'tokens': TextField(citation_tokens, token_indexers), + 'label': LabelField(intent)} + + return Instance(fields) + + +class DataReaderJsonLines: + def __init__(self, file_path): + self.file_path = file_path + + def read(self): + for line in jsonlines.open(self.file_path): + yield read_json_line(line) + + +def read_json_line(line): + citation = Citation( + text=line['string'], + citing_paper_id=line['citingPaperId'], + cited_paper_id=line['citedPaperId'], + section_title=line['sectionName'], + intent=line['label'], + citation_id=line['id']) + + return citation diff --git a/classifier/nn_ff.py b/classifier/nn_ff.py new file mode 100644 index 0000000..b8b8fc9 --- /dev/null +++ b/classifier/nn_ff.py @@ -0,0 +1,88 @@ +from utils.nn_reader import read_csv + +import torch + + + + + +class Feedforward(torch.nn.Module): + def __init__(self, input_size, hidden_size, output_size): + super(Feedforward, self).__init__() + self.input_size = input_size + self.hidden_size = hidden_size + self.output_size = output_size + self.fc1 = torch.nn.Linear(self.input_size, self.hidden_size) + self.relu = torch.nn.ReLU() + self.fc2 = torch.nn.Linear(self.hidden_size, self.output_size) + self.sigmoid = torch.nn.Sigmoid() + self.softmax = 
diff --git a/classifier/nn_ff.py b/classifier/nn_ff.py
new file mode 100644
index 0000000..b8b8fc9
--- /dev/null
+++ b/classifier/nn_ff.py
@@ -0,0 +1,88 @@
+import torch
+
+from utils.nn_reader import X_test, X_train, y_test, y_train
+
+
+class Feedforward(torch.nn.Module):
+    def __init__(self, input_size, hidden_size, output_size):
+        super(Feedforward, self).__init__()
+        self.input_size = input_size
+        self.hidden_size = hidden_size
+        self.output_size = output_size
+        self.fc1 = torch.nn.Linear(self.input_size, self.hidden_size)
+        self.relu = torch.nn.ReLU()
+        self.fc2 = torch.nn.Linear(self.hidden_size, self.output_size)
+
+    def forward(self, x):
+        hidden = self.fc1(x)
+        relu = self.relu(hidden)
+        # Return raw logits: CrossEntropyLoss applies log-softmax internally,
+        # so adding a Softmax layer here would distort the loss.
+        output = self.fc2(relu)
+        return output
+
+
+X_train = torch.as_tensor(X_train, dtype=torch.float32)
+X_test = torch.as_tensor(X_test, dtype=torch.float32)
+y_train = torch.as_tensor(y_train, dtype=torch.long)
+y_test = torch.as_tensor(y_test, dtype=torch.long)
+
+# Infer input width and class count from the data instead of hard-coding them.
+model = Feedforward(X_train.shape[1], 9, int(y_train.max().item()) + 1)
+criterion = torch.nn.CrossEntropyLoss()
+optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
+
+model.eval()
+with torch.no_grad():
+    before_train = criterion(model(X_train), y_train)
+print('Train loss before training:', before_train.item())
+
+model.train()
+num_epochs = 20
+for epoch in range(num_epochs):
+    optimizer.zero_grad()
+    # Forward pass
+    y_pred = model(X_train)
+    # Compute loss on the logits
+    loss = criterion(y_pred, y_train)
+    print('Epoch {}: train loss: {}'.format(epoch, loss.item()))
+    # Backward pass
+    loss.backward()
+    optimizer.step()
+
+model.eval()
+with torch.no_grad():
+    after_train = criterion(model(X_test), y_test)
+print('Test loss after training:', after_train.item())
diff --git a/utils/nn_reader.py b/utils/nn_reader.py
new file mode 100644
index 0000000..b68bc8e
--- /dev/null
+++ b/utils/nn_reader.py
@@ -0,0 +1,43 @@
+import numpy as np
+from itertools import chain
+
+from utils.csv import read_csv_file
+
+train_file_path = 'data/tsv/train.tsv'
+test_file_path = 'data/tsv/test.tsv'
+train_raw = read_csv_file(train_file_path, '\t')
+
+features = [x.features for x in train_raw]
+# Sort so the column order is deterministic across runs.
+features_unique = sorted(set(chain.from_iterable(features)))
+nobs = len(features)
+nfeats = len(features_unique)
+
+# Binary bag-of-features matrix: one column per feature seen in training.
+X_train = np.zeros((nobs, nfeats))
+for j in range(nfeats):
+    f = features_unique[j]
+    for i in range(nobs):
+        if f in features[i]:
+            X_train[i, j] = 1
+
+# Encode labels as integer class indices, which CrossEntropyLoss expects.
+y_unique = sorted(set(x.true_label for x in train_raw))
+y_train = np.array([y_unique.index(x.true_label) for x in train_raw])
+
+# The test matrix reuses the training feature vocabulary; features that
+# occur only in the test set are dropped.
+test_raw = read_csv_file(test_file_path, '\t')
+features = [x.features for x in test_raw]
+nobs = len(features)
+
+X_test = np.zeros((nobs, nfeats))
+for j in range(nfeats):
+    f = features_unique[j]
+    for i in range(nobs):
+        if f in features[i]:
+            X_test[i, j] = 1
+
+y_test = np.array([y_unique.index(x.true_label) for x in test_raw])
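
The bag-of-features encoding in utils/nn_reader.py reduces to the following toy example; the feature names are invented for illustration:

    import numpy as np
    # columns are fixed by the features seen in the training set, in sorted order
    features_unique = sorted({'cue:reported', 'pos:VBZ', 'sec:introduction'})
    features = [['cue:reported', 'sec:introduction'], ['pos:VBZ']]  # two observations
    X = np.zeros((len(features), len(features_unique)))
    for j, f in enumerate(features_unique):
        for i in range(len(features)):
            if f in features[i]:
                X[i, j] = 1
    # X == [[1., 0., 1.],
    #       [0., 1., 0.]]
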