From 18b7847bcfbc2a9d0c254385401eaccadffe9c0a Mon Sep 17 00:00:00 2001
From: Isaac Riley
Date: Mon, 6 Jul 2020 00:50:44 +0200
Subject: [PATCH] ffnn and folder for sota model

---
 allennlp_project/classifier/__init__.py   |   2 +
 allennlp_project/classifier/nn.py         | 157 ++++++++++++++++++++++
 allennlp_project/configs/basic_model.json |  56 ++++++++
 allennlp_project/utils/reader.py          |  57 ++++++++
 classifier/nn_ff.py                       |  88 ++++++++++++
 utils/nn_reader.py                        |  43 ++++++
 6 files changed, 403 insertions(+)
 create mode 100644 allennlp_project/classifier/__init__.py
 create mode 100644 allennlp_project/classifier/nn.py
 create mode 100644 allennlp_project/configs/basic_model.json
 create mode 100644 allennlp_project/utils/reader.py
 create mode 100644 classifier/nn_ff.py
 create mode 100644 utils/nn_reader.py

diff --git a/allennlp_project/classifier/__init__.py b/allennlp_project/classifier/__init__.py
new file mode 100644
index 0000000..81d9d69
--- /dev/null
+++ b/allennlp_project/classifier/__init__.py
@@ -0,0 +1,2 @@
+from .nn import *
+from ..utils.reader import *
diff --git a/allennlp_project/classifier/nn.py b/allennlp_project/classifier/nn.py
new file mode 100644
index 0000000..5f1addf
--- /dev/null
+++ b/allennlp_project/classifier/nn.py
@@ -0,0 +1,157 @@
+from typing import Dict
+
+import numpy as np
+import torch
+from allennlp.common.checks import ConfigurationError
+from allennlp.data import Vocabulary
+from allennlp.models import Model
+from allennlp.modules import TextFieldEmbedder, Seq2SeqEncoder, FeedForward, Elmo
+from allennlp.nn import util
+from allennlp.training.metrics import CategoricalAccuracy, F1Measure
+from overrides import overrides
+from torch.nn import Parameter
+
+
+@Model.register("basic_bilstm_classifier")
+class BiLstmClassifier(Model):
+
+    def __init__(self, vocab: Vocabulary,
+                 text_field_embedder: TextFieldEmbedder,
+                 encoder: Seq2SeqEncoder,
+                 classifier_feedforward: FeedForward,
+                 elmo: Elmo = None,
+                 use_input_elmo: bool = False):
+        super().__init__(vocab)
+        self.elmo = elmo
+        self.use_elmo = use_input_elmo
+        self.text_field_embedder = text_field_embedder
+        # LabelField puts labels in the "labels" namespace by default.
+        self.num_classes = self.vocab.get_vocab_size("labels")
+        self.encoder = encoder
+        self.classifier_feed_forward = classifier_feedforward
+        self.label_accuracy = CategoricalAccuracy()
+
+        self.label_f1_metrics = {}
+        for i in range(self.num_classes):
+            self.label_f1_metrics[vocab.get_token_from_index(index=i, namespace="labels")] = \
+                F1Measure(positive_label=i)
+
+        self.loss = torch.nn.CrossEntropyLoss()
+
+        self.attention = Attention(encoder.get_output_dim())
+
+    @overrides
+    def forward(self, tokens: Dict[str, torch.LongTensor],
+                label: torch.LongTensor = None) -> Dict[str, torch.Tensor]:
+
+        input_elmo = None
+        # Embed the single-id tokens without the ELMo entry, then restore it.
+        elmo_tokens = tokens.pop("elmo", None)
+        embedded_text = self.text_field_embedder(tokens)
+        text_mask = util.get_text_field_mask(tokens)
+        if elmo_tokens is not None:
+            tokens["elmo"] = elmo_tokens
+
+        # Create ELMo embeddings if applicable
+        if self.elmo:
+            if elmo_tokens is not None:
+                elmo_representations = self.elmo(elmo_tokens["elmo_tokens"])["elmo_representations"]
+                if self.use_elmo:
+                    input_elmo = elmo_representations.pop()
+                    assert not elmo_representations
+            else:
+                raise ConfigurationError("Model was built to use ELMo, but input text is not tokenized for ELMo.")
+
+        if self.use_elmo:
+            if embedded_text is not None:
+                embedded_text = torch.cat([embedded_text, input_elmo], dim=-1)
+            else:
+                embedded_text = input_elmo
+
+        encoded_text = self.encoder(embedded_text, text_mask)
+
+        # Attention: collapse the sequence dimension to a weighted sum.
+        attn_dist, encoded_text = self.attention(encoded_text, return_attn_distribution=True)
+
+        # Logits are needed for prediction whether or not a label is given.
+        logits = self.classifier_feed_forward(encoded_text)
+        class_probabilities = torch.nn.functional.softmax(logits, dim=-1)
+        output_dict = {"logits": logits}
+
+        if label is not None:
+            output_dict["loss"] = self.loss(logits, label)
+            self.label_accuracy(logits, label)
+            # compute F1 per label
+            for i in range(self.num_classes):
+                metric = self.label_f1_metrics[self.vocab.get_token_from_index(index=i, namespace="labels")]
+                metric(class_probabilities, label)
+            output_dict["label"] = label
+
+        # With a SingleIdTokenIndexer the token ids sit under tokens["tokens"]["tokens"].
+        output_dict["tokens"] = tokens["tokens"]["tokens"]
+
+        return output_dict
+
+    @overrides
+    def make_output_human_readable(self, output_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
+        class_probabilities = torch.nn.functional.softmax(output_dict["logits"], dim=-1)
+        predictions = class_probabilities.cpu().data.numpy()
+        argmax_indices = np.argmax(predictions, axis=-1)
+        labels = [self.vocab.get_token_from_index(x, namespace="labels")
+                  for x in argmax_indices]
+        output_dict["probabilities"] = class_probabilities
+        output_dict["prediction"] = labels
+        citation_text = []
+        for batch_text in output_dict["tokens"]:
+            citation_text.append([self.vocab.get_token_from_index(token_id.item()) for token_id in batch_text])
+        output_dict["tokens"] = citation_text
+
+        return output_dict
+
+    @overrides
+    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
+        metric_dict = {}
+
+        sum_f1 = 0.0
+        for name, metric in self.label_f1_metrics.items():
+            metric_val = metric.get_metric(reset)
+            metric_dict[name + '_P'] = metric_val[0]
+            metric_dict[name + '_R'] = metric_val[1]
+            metric_dict[name + '_F1'] = metric_val[2]
+            if name != 'none':  # do not consider the `none` label when averaging F1
+                sum_f1 += metric_val[2]
+
+        names = list(self.label_f1_metrics.keys())
+        total_len = len(names) if 'none' not in names else len(names) - 1
+        metric_dict['average_F1'] = sum_f1 / total_len
+        metric_dict['accuracy'] = self.label_accuracy.get_metric(reset)
+
+        return metric_dict
+
+
+def new_parameter(*size):
+    out = Parameter(torch.FloatTensor(*size))
+    torch.nn.init.xavier_normal_(out)
+    return out
+
+
+class Attention(torch.nn.Module):
+    """Simple multiplicative attention."""
+
+    def __init__(self, attention_size):
+        super(Attention, self).__init__()
+        self.attention = new_parameter(attention_size, 1)
+
+    def forward(self, x_in, reduction_dim=-2, return_attn_distribution=False):
+        # attention scores: (batch, seq_len, dim) @ (dim, 1) -> (batch, seq_len)
+        attn_score = torch.matmul(x_in, self.attention).squeeze(-1)
+        # softmax over the sequence, then add a trailing dim for broadcasting
+        attn_distrib = torch.nn.functional.softmax(attn_score, dim=-1).unsqueeze(-1)
+        scored_x = x_in * attn_distrib
+        weighted_sum = torch.sum(scored_x, dim=reduction_dim)
+        if return_attn_distribution:
+            return attn_distrib.reshape(x_in.shape[0], -1), weighted_sum
+        return weighted_sum
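
A quick shape check for the Attention module defined above -- a minimal sketch, not part of the patch; batch size, sequence length, and the 200-dim encoder output are illustrative (200 matches the bi-LSTM in the config below):

    import torch
    attn = Attention(attention_size=200)
    x = torch.randn(4, 17, 200)            # (batch, seq_len, encoder_dim)
    dist, pooled = attn(x, return_attn_distribution=True)
    assert dist.shape == (4, 17)           # one weight per time step
    assert pooled.shape == (4, 200)        # sequence collapsed to a single vector
    assert torch.allclose(dist.sum(-1), torch.ones(4), atol=1e-5)
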
diff --git a/allennlp_project/configs/basic_model.json b/allennlp_project/configs/basic_model.json
new file mode 100644
index 0000000..4887b66
--- /dev/null
+++ b/allennlp_project/configs/basic_model.json
@@ -0,0 +1,56 @@
+{
+  "dataset_reader": {
+    "type": "citation_dataset_reader"
+  },
+  "train_data_path": "data/jsonl/train.jsonl",
+  "validation_data_path": "data/jsonl/test.jsonl",
+  "test_data_path": "data/jsonl/test.jsonl",
+  "model": {
+    "type": "basic_bilstm_classifier",
+    "text_field_embedder": {
+      "token_embedders": {
+        "tokens": {
+          "pretrained_file": "https://s3-us-west-2.amazonaws.com/allennlp/datasets/glove/glove.6B.100d.txt.gz",
+          "type": "embedding",
+          "embedding_dim": 100,
+          "trainable": false
+        }
+      }
+    },
+    "encoder": {
+      "type": "lstm",
+      "input_size": 1124,
+      "hidden_size": 100,
+      "num_layers": 1,
+      "bidirectional": true
+    },
+    "elmo": {
+      "options_file": "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json",
+      "weight_file": "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5",
+      "do_layer_norm": true,
+      "dropout": 0.5,
+      "num_output_representations": 1
+    },
+    "use_input_elmo": true,
+    "classifier_feedforward": {
+      "input_dim": 200,
+      "num_layers": 2,
+      "hidden_dims": [20, 3],
+      "activations": ["relu", "linear"]
+    }
+  },
+  "data_loader": {
+    "batch_sampler": {
+      "type": "bucket",
+      "batch_size": 16
+    }
+  },
+  "trainer": {
+    "optimizer": {
+      "type": "adam",
+      "lr": 0.001
+    },
+    "num_epochs": 2,
+    "cuda_device": -1
+  }
+}
\ No newline at end of file
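
With the model and reader registered under the names used in this config, training can be launched programmatically; a minimal sketch, assuming the repository root is on PYTHONPATH and with a placeholder serialization directory (equivalent to `allennlp train ... --include-package allennlp_project` on the CLI):

    import allennlp_project.classifier  # noqa: F401 -- importing triggers the @register decorators
    from allennlp.commands.train import train_model_from_file

    train_model_from_file("allennlp_project/configs/basic_model.json",
                          "output/basic_model")  # serialization dir is a placeholder
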
"embedding", + "embedding_dim": 100, + "trainable": false + } + } + }, + "encoder": { + "type": "lstm", + "input_size": 1124, + "hidden_size": 100, + "num_layers": 1, + "bidirectional": true + }, + "elmo": { + "options_file": "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json", + "weight_file": "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5", + "do_layer_norm": true, + "dropout": 0.5, + "num_output_representations": 1 + }, + "use_input_elmo": true, + "classifier_feedforward": { + "input_dim": 200, + "num_layers": 2, + "hidden_dims": [20, 3], + "activations": ["linear", "linear"] + } + }, + "data_loader": { + "batch_sampler": { + "type": "bucket", + "batch_size" : 16 + } + }, + "trainer": { + "optimizer": { + "type": "adam", + "lr": 0.001 + }, + "num_epochs": 2, + "cuda_device": -1 + } +} \ No newline at end of file diff --git a/allennlp_project/utils/reader.py b/allennlp_project/utils/reader.py new file mode 100644 index 0000000..38199b6 --- /dev/null +++ b/allennlp_project/utils/reader.py @@ -0,0 +1,57 @@ +from typing import Iterable + +import jsonlines +from allennlp.data import Instance +from allennlp.data.dataset_readers import DatasetReader +from allennlp.data.fields import TextField, LabelField +from allennlp.data.token_indexers import SingleIdTokenIndexer, ELMoTokenCharactersIndexer +from allennlp.data.tokenizers import SpacyTokenizer +from overrides import overrides + +from utils.data import Citation + + +@DatasetReader.register("citation_dataset_reader") # type for config files +class CitationDataSetReader(DatasetReader): + def __init__(self): + super().__init__() + self.tokenizer = SpacyTokenizer() + + @overrides + def _read(self, file_path: str) -> Iterable[Instance]: + ds_reader = DataReaderJsonLines(file_path) + for citation in ds_reader.read(): + yield self.text_to_instance(citation_text=citation.text, intent=citation.intent) + + @overrides + def text_to_instance(self, citation_text: str, + intent: str) -> Instance: + citation_tokens = self.tokenizer.tokenize(citation_text) + token_indexers = {"elmo": ELMoTokenCharactersIndexer(), + "tokens": SingleIdTokenIndexer()} + + fields = {'tokens': TextField(citation_tokens, token_indexers), + 'label': LabelField(intent)} + + return Instance(fields) + + +class DataReaderJsonLines: + def __init__(self, file_path): + self.file_path = file_path + + def read(self): + for line in jsonlines.open(self.file_path): + yield read_json_line(line) + + +def read_json_line(line): + citation = Citation( + text=line['string'], + citing_paper_id=line['citingPaperId'], + cited_paper_id=line['citedPaperId'], + section_title=line['sectionName'], + intent=line['label'], + citation_id=line['id']) + + return citation diff --git a/classifier/nn_ff.py b/classifier/nn_ff.py new file mode 100644 index 0000000..b8b8fc9 --- /dev/null +++ b/classifier/nn_ff.py @@ -0,0 +1,88 @@ +from utils.nn_reader import read_csv + +import torch + + + + + +class Feedforward(torch.nn.Module): + def __init__(self, input_size, hidden_size, output_size): + super(Feedforward, self).__init__() + self.input_size = input_size + self.hidden_size = hidden_size + self.output_size = output_size + self.fc1 = torch.nn.Linear(self.input_size, self.hidden_size) + self.relu = torch.nn.ReLU() + self.fc2 = torch.nn.Linear(self.hidden_size, self.output_size) + self.sigmoid = torch.nn.Sigmoid() + self.softmax = 
diff --git a/classifier/nn_ff.py b/classifier/nn_ff.py
new file mode 100644
index 0000000..b8b8fc9
--- /dev/null
+++ b/classifier/nn_ff.py
@@ -0,0 +1,88 @@
+import torch
+
+from utils.nn_reader import X_test, X_train, y_test, y_train
+
+
+class Feedforward(torch.nn.Module):
+    def __init__(self, input_size, hidden_size, output_size):
+        super(Feedforward, self).__init__()
+        self.input_size = input_size
+        self.hidden_size = hidden_size
+        self.output_size = output_size
+        self.fc1 = torch.nn.Linear(self.input_size, self.hidden_size)
+        self.relu = torch.nn.ReLU()
+        self.fc2 = torch.nn.Linear(self.hidden_size, self.output_size)
+
+    def forward(self, x):
+        hidden = self.fc1(x)
+        relu = self.relu(hidden)
+        # Return raw logits: CrossEntropyLoss applies log-softmax internally,
+        # so adding a Softmax layer here would distort the loss.
+        output = self.fc2(relu)
+        return output
+
+
+X_train = torch.as_tensor(X_train, dtype=torch.float32)
+X_test = torch.as_tensor(X_test, dtype=torch.float32)
+y_train = torch.as_tensor(y_train, dtype=torch.long)
+y_test = torch.as_tensor(y_test, dtype=torch.long)
+
+# Infer input width and class count from the data instead of hard-coding them.
+model = Feedforward(X_train.shape[1], 9, int(y_train.max().item()) + 1)
+criterion = torch.nn.CrossEntropyLoss()
+optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
+
+model.eval()
+with torch.no_grad():
+    before_train = criterion(model(X_train), y_train)
+print('Train loss before training:', before_train.item())
+
+model.train()
+num_epochs = 20
+for epoch in range(num_epochs):
+    optimizer.zero_grad()
+    # Forward pass
+    y_pred = model(X_train)
+    # Compute loss on the logits
+    loss = criterion(y_pred, y_train)
+    print('Epoch {}: train loss: {}'.format(epoch, loss.item()))
+    # Backward pass
+    loss.backward()
+    optimizer.step()
+
+model.eval()
+with torch.no_grad():
+    after_train = criterion(model(X_test), y_test)
+print('Test loss after training:', after_train.item())
diff --git a/utils/nn_reader.py b/utils/nn_reader.py
new file mode 100644
index 0000000..b68bc8e
--- /dev/null
+++ b/utils/nn_reader.py
@@ -0,0 +1,43 @@
+import numpy as np
+from itertools import chain
+
+from utils.csv import read_csv_file
+
+train_file_path = 'data/tsv/train.tsv'
+test_file_path = 'data/tsv/test.tsv'
+train_raw = read_csv_file(train_file_path, '\t')
+
+features = [x.features for x in train_raw]
+# Sort so the column order is deterministic across runs.
+features_unique = sorted(set(chain.from_iterable(features)))
+nobs = len(features)
+nfeats = len(features_unique)
+
+# Binary bag-of-features matrix: one column per feature seen in training.
+X_train = np.zeros((nobs, nfeats))
+for j in range(nfeats):
+    f = features_unique[j]
+    for i in range(nobs):
+        if f in features[i]:
+            X_train[i, j] = 1
+
+# Encode labels as integer class indices, which CrossEntropyLoss expects.
+y_unique = sorted(set(x.true_label for x in train_raw))
+y_train = np.array([y_unique.index(x.true_label) for x in train_raw])
+
+# The test matrix reuses the training feature vocabulary; features that
+# occur only in the test set are dropped.
+test_raw = read_csv_file(test_file_path, '\t')
+features = [x.features for x in test_raw]
+nobs = len(features)
+
+X_test = np.zeros((nobs, nfeats))
+for j in range(nfeats):
+    f = features_unique[j]
+    for i in range(nobs):
+        if f in features[i]:
+            X_test[i, j] = 1
+
+y_test = np.array([y_unique.index(x.true_label) for x in test_raw])
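
The bag-of-features encoding in utils/nn_reader.py reduces to the following toy example; the feature names are invented for illustration:

    import numpy as np
    # columns are fixed by the features seen in the training set, in sorted order
    features_unique = sorted({'cue:reported', 'pos:VBZ', 'sec:introduction'})
    features = [['cue:reported', 'sec:introduction'], ['pos:VBZ']]  # two observations
    X = np.zeros((len(features), len(features_unique)))
    for j, f in enumerate(features_unique):
        for i in range(len(features)):
            if f in features[i]:
                X[i, j] = 1
    # X == [[1., 0., 1.],
    #       [0., 1., 0.]]
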