Compare commits
No commits in common. 'master' and 'pavan' have entirely different histories.
@ -1,2 +0,0 @@
|
|||||||
classifier
|
|
||||||
utils
|
|
||||||
Binary file not shown.
@ -1,3 +0,0 @@
|
|||||||
from .nn import *
|
|
||||||
from utils.reader import *
|
|
||||||
from.intent_predictor import *
|
|
||||||
@ -1,105 +0,0 @@
|
|||||||
from typing import Dict, List, Tuple
|
|
||||||
|
|
||||||
from allennlp.common import JsonDict
|
|
||||||
from allennlp.data import Instance
|
|
||||||
from allennlp.predictors import Predictor
|
|
||||||
from overrides import overrides
|
|
||||||
from allennlp.models import Model
|
|
||||||
from allennlp.data.dataset_readers import DatasetReader
|
|
||||||
from allennlp.models.archival import load_archive
|
|
||||||
from utils.reader import DataReaderJsonLines, CitationDataSetReader
|
|
||||||
|
|
||||||
import os
|
|
||||||
|
|
||||||
|
|
||||||
@Predictor.register('citation_intent_predictor')
class IntentClassificationPredictor(Predictor):
    """
    Predictor for the citation intent classifier.

    A thin wrapper around an AllenNLP ``Predictor``, registered under the name
    ``citation_intent_predictor``. It converts a (text, intent) pair into the
    JSON payload / ``Instance`` shape that the wrapped model and dataset reader
    expect.
    """

    def predict(self, text: str, intent: str):
        """
        Run the predictor on a single data point.

        :param text: citation text of the data point
        :param intent: target (gold) intent of the data point
        :return: whatever ``predict_json`` returns for the payload
                 ``{"string": text, "label": intent}``
        """
        payload = {"string": text, "label": intent}
        return self.predict_json(payload)

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        Turn the JSON payload built by :meth:`predict` into an ``Instance``.

        The ``"string"`` and ``"label"`` entries are forwarded, in that order,
        to the dataset reader's ``text_to_instance``.

        :param json_dict: dictionary holding the citation text under ``"string"``
                          and the intent label under ``"label"``
        :return: the ``Instance`` produced by the dataset reader
        """
        citation_text = json_dict["string"]
        gold_intent = json_dict["label"]
        return self._dataset_reader.text_to_instance(citation_text, gold_intent)
|
|
||||||
|
|
||||||
|
|
||||||
def make_predictions(model: Model, dataset_reader: DatasetReader, dataset_file_path: str) -> Tuple[list, list]:
    """
    Predict the intent of every citation stored in a JSON Lines file.

    An ``IntentClassificationPredictor`` is built from the given model and
    dataset reader; the file is read with ``DataReaderJsonLines`` and each
    citation is passed through the predictor.

    :param model: a trained/saved AllenNLP Model
    :param dataset_reader: dataset reader handed to the predictor
    :param dataset_file_path: path of the JSON Lines file to predict on

    :return: tuple ``(predictions, gold_labels)``; both lists follow the order
             in which the reader yields the citations
    """
    predictor = IntentClassificationPredictor(model, dataset_reader)

    predicted_labels = []
    gold_labels = []

    # One prediction per citation; the gold intent is recorded alongside it.
    for citation in DataReaderJsonLines(dataset_file_path).read():
        gold_labels.append(citation.intent)
        result = predictor.predict(citation.text, citation.intent)
        predicted_labels.append(result['prediction'])

    return predicted_labels, gold_labels
|
|
||||||
|
|
||||||
|
|
||||||
def load_model_and_predict_test_data(saved_model_dir: str) -> Tuple[list, list]:
    """
    Load a saved model archive and predict on the project's test split.

    The archive ``model.tar.gz`` is read from ``saved_model_dir``. The test file
    is ``data/jsonl/test.jsonl`` under the project root, where the project root
    is taken to be the parent of the directory containing this file.

    :param saved_model_dir: directory that contains ``model.tar.gz``

    :return: tuple ``(y_pred, y_true)`` as returned by ``make_predictions``
    """
    project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    # Build the path with os.path.join instead of string concatenation.
    test_file_path = os.path.join(project_root, 'data', 'jsonl', 'test.jsonl')

    # load the archived/saved model
    model_archive = load_archive(os.path.join(saved_model_dir, 'model.tar.gz'))

    # the reader is only used by the predictor to build Instances
    citation_dataset_reader = CitationDataSetReader()

    return make_predictions(model_archive.model, citation_dataset_reader, test_file_path)
|
|
||||||
@ -1,240 +0,0 @@
|
|||||||
from utils.csv import DataInstance
|
|
||||||
from feature_extraction.features import FEATURE_LIST, THETA_BIAS_FEATURE
|
|
||||||
from collections import OrderedDict
|
|
||||||
import random
|
|
||||||
|
|
||||||
|
|
||||||
class Perceptron:
    """
    Single linear classifier for one label.

    The perceptron keeps one weight per feature name. An input is described by
    the list of feature names that are active for it, and its score is the sum
    of the weights of those features.
    """

    def __init__(self, label: str, weights: dict, theta_bias: float):
        """
        :param label: label this perceptron stands for (used by the multi-class wrapper)
        :param weights: mapping of feature name -> weight; updated in place during training
        :param theta_bias: bias/threshold value; it is only stored here and is
                           not read by :meth:`score` or :meth:`update_weights`
        """
        self.label = label
        self.weights = weights
        self.theta_bias = theta_bias

    def score(self, features: list):
        """
        Add up the weights of the given features.

        :param features: list of feature names of one data instance
        :return: sum of the matching weights (``0`` for an empty list)
        :raises KeyError: if a feature has no entry in ``weights``
        """
        total = 0
        for name in features:
            total = total + self.weights[name]
        return total

    def update_weights(self, features: list, learning_rate: float = 1.0, penalize: bool = False, reward: bool = False):
        """
        Move the weight of every listed feature by ``learning_rate``.

        :param features: list of feature names of one data instance
        :param learning_rate: step size, default 1.0
        :param penalize: if True, subtract the step from each weight
        :param reward: if True, add the step to each weight

        - With both flags False nothing is changed.
        - With both flags True the reward update wins: each weight ends up
          increased by the step.
        - New weights are rounded to 5 decimal places.
        """
        step = learning_rate * 1
        for name in features:
            current = self.weights[name]
            if reward:
                updated = round(current + step, 5)
            elif penalize:
                updated = round(current - step, 5)
            else:
                continue
            self.weights[name] = updated
|
|
||||||
|
|
||||||
|
|
||||||
class MultiClassPerceptron:
    """
    One-perceptron-per-label classifier.

    A single Perceptron only yields a score for its own label. This model keeps
    one Perceptron per label, scores an instance with each of them and predicts
    the label whose perceptron gives the highest score (the first one wins on
    ties).

    During training, whenever the predicted label differs from the true label
    of an instance:
     - the weights of the predicted label's perceptron are decreased (penalize)
     - the weights of the true label's perceptron are increased (reward)

    The training list is shuffled in place after every epoch.
    """

    def __init__(self, epochs: int = 5, learning_rate: float = 1.0, random_state: int = 42):
        """
        :param epochs: number of training iterations
        :param learning_rate: learning rate for updating weights, default 1.0
        :param random_state: seed used for the initial weights and for shuffling
                             the training data; default 42
        """
        self.random_state = random_state
        self.perceptron_dict = OrderedDict()  # label -> Perceptron for that label
        self.epochs = epochs
        self.learning_rate = learning_rate

    def _score_all(self, features: list) -> list:
        """Return one score per perceptron, in ``perceptron_dict`` order."""
        return [perceptron.score(features) for perceptron in self.perceptron_dict.values()]

    def _best_label(self, scores: list, labels: list):
        """Return the label at the position of the first maximal score."""
        return labels[scores.index(max(scores))]

    def fit(self, X_train: list, labels: list):
        """
        Train one perceptron per label on the given instances.

        Any perceptrons from a previous call are discarded first. Note that
        ``X_train`` is shuffled in place after each epoch.

        :param X_train: non-empty list of DataInstance objects; each must expose
                        ``features`` and ``true_label``
        :param labels: non-empty list of class labels

        :raises ValueError: if ``labels`` or ``X_train`` is None or empty
        :raises TypeError: if ``X_train`` is not a list or its first element is
                           not a DataInstance
        :raises KeyError: if an instance's ``true_label`` is not in ``labels``
        """
        if labels is None or len(labels) <= 0:
            raise ValueError('The labels parameter must contain at least one label')

        if X_train is None or len(X_train) <= 0:
            raise ValueError('Training data can\'t be Empty')

        # Either condition alone makes the input invalid, hence `or`.
        if not isinstance(X_train, list) or not isinstance(X_train[0], DataInstance):
            raise TypeError('Training Data must be a list of type DataInstance(model)')

        # Start from a clean slate so labels from an earlier fit() cannot leak in.
        self.perceptron_dict = OrderedDict()
        for label in labels:
            sample_weights = get_sample_weights_with_features(theta_bias=-0.25, random_state=self.random_state)
            self.perceptron_dict[label] = Perceptron(label, sample_weights, theta_bias=-0.25)

        # Scores are produced in perceptron_dict order, so labels must be looked
        # up in that same order (the caller's list may contain duplicates).
        known_labels = list(self.perceptron_dict)

        for epoch in range(self.epochs):

            print('Training Epoch :: (', (epoch+1), '/', self.epochs, ')')

            for inst in X_train:
                predicted = self._best_label(self._score_all(inst.features), known_labels)

                if inst.true_label != predicted:
                    # decrease weights of the wrongly predicted label
                    self.perceptron_dict[predicted].update_weights(inst.features, self.learning_rate, penalize=True)
                    # increase weights of the true label
                    self.perceptron_dict[inst.true_label].update_weights(inst.features, self.learning_rate, reward=True)

            # Shuffle between epochs. A fresh, identically seeded generator is
            # used each time, i.e. the same permutation is re-applied to the
            # already permuted list.
            random.Random(self.random_state).shuffle(X_train)

    def predict(self, X_test: list):
        """
        Predict one label per test instance.

        :param X_test: non-empty list of instances exposing ``features``
        :return: list of predicted labels, in input order
        :raises ValueError: if ``X_test`` is None or empty
        """
        if X_test is None or len(X_test) <= 0:
            raise ValueError('Testing Data cannot be empty')

        print('Predicting..... ')

        labels = list(self.perceptron_dict)
        return [self._best_label(self._score_all(test_inst.features), labels)
                for test_inst in X_test]

    def get_class_scores(self, X_test: list):
        """
        Return the raw perceptron scores for every test instance.

        The values are un-normalised scores, not probabilities.

        :param X_test: non-empty list of instances exposing ``features``
        :return: list with one inner list per instance; each inner list holds
                 one score per label, in ``perceptron_dict`` order
        :raises ValueError: if ``X_test`` is None or empty
        """
        if X_test is None or len(X_test) <= 0:
            raise ValueError('Testing Data cannot be empty')

        print('Predicting..... ')

        return [self._score_all(test_inst.features) for test_inst in X_test]
|
|
||||||
|
|
||||||
|
|
||||||
def get_sample_weights_with_features(theta_bias: float = 0.0, random_state: int = 42):
    """
    Build the initial weight dictionary for a perceptron.

    The result maps ``THETA_BIAS_FEATURE`` to ``theta_bias`` and every name in
    ``FEATURE_LIST`` to a pseudo-random weight drawn uniformly from [-1, 1] and
    rounded to 5 decimal places.

    A private generator seeded once with ``random_state`` is used, so the
    features receive different weights, the result is reproducible for a given
    seed, and the module-level ``random`` state is left untouched.

    :param theta_bias: value stored for the theta bias feature
    :param random_state: seed for reproducing the weights

    :return: dictionary of feature name -> initial weight
    """
    # Seed once, outside the loop: reseeding per feature would hand the very
    # same number to every feature.
    rng = random.Random(random_state)

    weights = {THETA_BIAS_FEATURE: theta_bias}
    for feature in FEATURE_LIST:
        weights[feature] = round(rng.uniform(-1.0, 1.0), 5)

    return weights
|
|
||||||
@ -1,211 +0,0 @@
|
|||||||
from typing import Dict
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
import torch
|
|
||||||
from allennlp.common.checks import ConfigurationError
|
|
||||||
from allennlp.data import Vocabulary
|
|
||||||
from allennlp.models import Model
|
|
||||||
from allennlp.modules import TextFieldEmbedder, Seq2SeqEncoder, FeedForward, Elmo
|
|
||||||
from allennlp.nn import util
|
|
||||||
from allennlp.training.metrics import CategoricalAccuracy, F1Measure
|
|
||||||
from overrides import overrides
|
|
||||||
from torch.nn import Parameter
|
|
||||||
|
|
||||||
|
|
||||||
@Model.register("basic_bilstm_classifier")
class BiLstmClassifier(Model):
    """
    Citation intent classifier: embeddings -> encoder -> attention -> feed-forward.

    - Subclass of AllenNLP's ``Model``, registered as ``"basic_bilstm_classifier"``
      so that a configuration file can refer to it by that type name.
    - The constructor parameters correspond to the keys of the ``model`` section
      of the configuration file.

    ``forward`` embeds the tokens (optionally concatenating ELMo representations),
    encodes them, pools the encoder output with ``Attention`` and feeds the
    pooled vector to the feed-forward classifier. Per-label F1 measures are
    updated whenever gold labels are supplied.
    """

    def __init__(self, vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 classifier_feedforward: FeedForward,
                 elmo: Elmo = None,
                 use_input_elmo: bool = False):
        """
        :param vocab: vocabulary; its ``"labels"`` namespace defines the classes
        :param text_field_embedder: embedder applied to the non-ELMo token indices
        :param encoder: sequence encoder applied to the embedded text
        :param classifier_feedforward: maps the attention-pooled encoding to logits
        :param elmo: optional ELMo module
        :param use_input_elmo: if True, the ELMo representation is concatenated
                               to the embedded text before encoding
        """
        super().__init__(vocab)
        self.elmo = elmo
        self.use_elmo = use_input_elmo
        self.text_field_embedder = text_field_embedder
        self.num_classes = self.vocab.get_vocab_size("labels")
        self.encoder = encoder
        self.classifier_feed_forward = classifier_feedforward
        self.label_accuracy = CategoricalAccuracy()

        # one F1 measure per class, keyed by the label string
        self.label_f1_metrics = {}
        for i in range(self.num_classes):
            self.label_f1_metrics[vocab.get_token_from_index(index=i, namespace="labels")] = \
                F1Measure(positive_label=i)

        self.loss = torch.nn.CrossEntropyLoss()

        self.attention = Attention(encoder.get_output_dim())

    @overrides
    def forward(self, tokens: Dict[str, torch.LongTensor],
                label: torch.LongTensor = None) -> Dict[str, torch.LongTensor]:
        """
        Run the model on a batch.

        :param tokens: token indices of the batch; an ``"elmo"`` entry, if
                       present, is kept away from the text-field embedder and
                       fed to the ELMo module instead
        :param label: gold labels of the batch; optional. When omitted, no
                      loss is computed and the F1 measures are not updated

        :return: output dictionary with ``"logits"`` and ``"tokens"``; when
                 ``label`` is given it also contains ``"loss"`` and ``"label"``
        :raises ConfigurationError: if the model has an ELMo module but the
                                    batch carries no ``"elmo"`` entry
        """
        input_elmo = None
        # Take the "elmo" entry out while embedding/masking and put it back after.
        elmo_tokens = tokens.pop("elmo", None)

        embedded_text = self.text_field_embedder(tokens)
        text_mask = util.get_text_field_mask(tokens)

        if elmo_tokens is not None:
            tokens["elmo"] = elmo_tokens

        # Create ELMo embeddings if applicable
        if self.elmo:
            if elmo_tokens is not None:
                elmo_representations = self.elmo(elmo_tokens["elmo_tokens"])["elmo_representations"]
                if self.use_elmo:
                    input_elmo = elmo_representations.pop()
                # NOTE(review): every produced representation is expected to be
                # consumed above; the nesting of this assert was reconstructed.
                assert not elmo_representations
            else:
                raise ConfigurationError("Model was built to use Elmo, but input text is not tokenized for Elmo.")

        if self.use_elmo:
            if embedded_text is not None:
                embedded_text = torch.cat([embedded_text, input_elmo], dim=-1)
            else:
                embedded_text = input_elmo

        # pass the embedded text to the encoder
        encoded_text = self.encoder(embedded_text, text_mask)

        # pool the encoder output; the attention distribution itself is not used
        _attn_dist, encoded_text = self.attention(encoded_text, return_attn_distribution=True)

        # Logits are always produced so that make_output_human_readable also
        # works for unlabeled input.
        logits = self.classifier_feed_forward(encoded_text)
        output_dict = {"logits": logits}

        if label is not None:
            class_probabilities = torch.nn.functional.softmax(logits, dim=1)

            output_dict["loss"] = self.loss(logits, label)

            # update the F1 measure of every label
            for metric in self.label_f1_metrics.values():
                metric(class_probabilities, label)
            output_dict['label'] = label

        output_dict['tokens'] = tokens['tokens']

        return output_dict

    @overrides
    def make_output_human_readable(self, output_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
        """
        Add probabilities and predicted label strings to a ``forward`` output.

        :param output_dict: output dictionary containing ``"logits"``
        :return: the same dictionary, extended with ``"probabilities"`` (softmax
                 of the logits) and with ``"positive_label"`` and
                 ``"prediction"`` (both the list of arg-max label strings)
        """
        class_probabilities = torch.nn.functional.softmax(output_dict['logits'], dim=-1)
        predictions = class_probabilities.detach().cpu().numpy()
        argmax_indices = np.argmax(predictions, axis=-1)

        # map class indices back to label strings
        label = [self.vocab.get_token_from_index(x, namespace="labels")
                 for x in argmax_indices]
        output_dict['probabilities'] = class_probabilities
        output_dict['positive_label'] = label
        output_dict['prediction'] = label

        return output_dict

    @overrides
    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        """
        Collect the per-label F1 scores and their average.

        The label named ``none`` is reported but excluded from the average.

        :param reset: forwarded to every metric's ``get_metric``

        :return: dictionary with one ``<label>_F1`` entry per label and
                 ``AVG_F1_Score``
        """
        metric_dict = {}

        sum_f1 = 0.0
        counted = 0
        for name, metric in self.label_f1_metrics.items():
            # index 2 of the metric value is used as the F1 score
            metric_val = metric.get_metric(reset)
            metric_dict[name + '_F1'] = metric_val[2]
            if name != 'none':  # do not consider `none` label in averaging F1
                sum_f1 += metric_val[2]
                counted += 1

        # Guard against a vocabulary that holds no label besides `none`.
        metric_dict['AVG_F1_Score'] = sum_f1 / counted if counted else 0.0

        return metric_dict
|
|
||||||
|
|
||||||
|
|
||||||
def new_parameter(*size):
    """Create a float32 ``Parameter`` of the given size, Xavier-normal initialised."""
    out = Parameter(torch.empty(*size, dtype=torch.float32))
    torch.nn.init.xavier_normal_(out)
    return out


class Attention(torch.nn.Module):
    """
    Simple multiplicative attention.

    A single learned vector of size ``attention_size`` scores every position of
    the input; the scores are soft-maxed over the last axis of the score tensor
    and used to form a weighted sum of the input.
    """

    def __init__(self, attention_size):
        """
        :param attention_size: size of the last input dimension
        """
        super(Attention, self).__init__()
        self.attention = new_parameter(attention_size, 1)

    def forward(self, x_in, reduction_dim=-2, return_attn_distribution=False):
        """
        Pool ``x_in`` with attention weights.

        :param x_in: tensor whose last dimension equals ``attention_size``;
                     assumed to be (batch, seq_len, attention_size) — TODO confirm
        :param reduction_dim: dimension summed over after weighting, default -2
        :param return_attn_distribution: if True, also return the weights

        :return: the weighted sum, or the tuple ``(weights, weighted_sum)`` where
                 ``weights`` is reshaped to ``(x_in.shape[0], -1)``
        """
        # Drop only the trailing singleton produced by the matmul. A bare
        # squeeze() would also remove a batch or sequence axis of length 1 and
        # make the softmax below run over the wrong axis.
        attn_score = torch.matmul(x_in, self.attention).squeeze(-1)
        # distribution over the last axis, with a trailing axis for broadcasting
        attn_distrib = torch.nn.functional.softmax(attn_score, dim=-1).unsqueeze(-1)
        scored_x = x_in * attn_distrib
        weighted_sum = torch.sum(scored_x, dim=reduction_dim)
        if return_attn_distribution:
            return attn_distrib.reshape(x_in.shape[0], -1), weighted_sum
        return weighted_sum
|
|
||||||
@ -1,128 +0,0 @@
|
|||||||
"""
|
|
||||||
Simple feed-forward neural network in PyTorch for baseline results on Scicite data.
|
|
||||||
Created: July 5th, 2020
|
|
||||||
"""
|
|
||||||
|
|
||||||
import torch
|
|
||||||
from utils.nn_reader import read_csv_nn
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
|
|
||||||
class FeedForward(torch.nn.Module):
    """
    Basic feed-forward network: Linear -> Tanh -> Linear -> Softmax.

    The training and test data are loaded in the constructor via ``read_data``.
    ``fit`` trains on a class-wise sample of the training data and ``predict``
    stores predictions for the test data on the instance.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """ Sets up all basic elements of NN and loads the data. """
        super(FeedForward, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.fc1 = torch.nn.Linear(self.input_size, self.hidden_size)
        self.tanh = torch.nn.Tanh()
        self.fc2 = torch.nn.Linear(self.hidden_size, self.output_size)
        self.sigmoid = torch.nn.Sigmoid()
        self.softmax = torch.nn.Softmax(dim=1)
        self.read_data()

    def _logits(self, x):
        """ Computes the un-normalised class scores for input x. """
        return self.fc2(self.tanh(self.fc1(x)))

    def forward(self, x):
        """ Computes class probabilities (softmax over dim 1) for input x. """
        return self.softmax(self._logits(x))

    def read_data(self):
        """
        Reads in training and test data and converts it to proper format.

        The training rows are split by class into ``X0``/``X1``/``X2`` with
        matching label tensors ``y0``/``y1``/``y2`` and counts ``l0``/``l1``/``l2``.
        The class of a row is derived from columns 1 and 2 of its label vector
        (presumably one-hot over three classes — verify against read_csv_nn).
        """
        self.X_train_, self.y_train_, self.X_test, self.y_test_ = read_csv_nn()
        self.X_test = torch.FloatTensor(self.X_test)
        yclass = np.array([(x[1] == 1) + 2 * (x[2] == 1) for x in self.y_train_])
        is0 = yclass == 0
        is1 = yclass == 1
        is2 = yclass == 2
        self.X0 = torch.FloatTensor(self.X_train_[is0])
        self.X1 = torch.FloatTensor(self.X_train_[is1])
        self.X2 = torch.FloatTensor(self.X_train_[is2])
        # plain Python ints, usable for tensor sizes and randperm
        self.l0 = int(is0.sum())
        self.l1 = int(is1.sum())
        self.l2 = int(is2.sum())
        self.y0 = torch.zeros(self.l0, dtype=torch.long)
        self.y1 = torch.ones(self.l1, dtype=torch.long)
        self.y2 = torch.full((self.l2,), 2, dtype=torch.long)
        self.y_test = (self.y_test_[:, 1] == 1) + 2 * (self.y_test_[:, 2] == 1)

    def fit(self, epochs=100, batch_size=16, lr=0.01, samples=(1000, 1000, 1000)):
        """
        Trains model, using cross entropy loss and SGD optimizer.

        :param epochs: number of passes over the sampled training data
        :param batch_size: rows per mini-batch; a trailing remainder is merged
                           into the last batch
        :param lr: SGD learning rate
        :param samples: number of rows to draw from class 0, 1 and 2 per epoch
        :raises ValueError: if the sampled training set is empty
        """
        self.criterion = torch.nn.CrossEntropyLoss()
        self.optimizer = torch.optim.SGD(self.parameters(), lr)
        self.samples0, self.samples1, self.samples2 = samples

        self.eval()  # put into eval mode

        # initialize training data
        self.shuffle()
        # CrossEntropyLoss applies log-softmax itself, so it is fed the raw
        # logits rather than the softmax output of forward().
        with torch.no_grad():
            before_train = self.criterion(self._logits(self.X_train), self.y_train)
        print('Training loss before training', before_train.item())

        # setup for batches: use the real number of sampled rows, which may be
        # smaller than sum(samples) when a class has fewer rows than requested
        total = self.X_train.shape[0]
        if total == 0:
            raise ValueError('No training data to fit on')
        starts = list(range(0, total, batch_size))
        if len(starts) > 1:
            starts.pop()  # the last batch absorbs the remainder
        batch_indices = [(a, a + batch_size) for a in starts]
        batch_indices[-1] = (batch_indices[-1][0], total)

        # train model
        self.train()  # put into training mode
        for epoch in range(epochs):
            for a, b in batch_indices:
                self.optimizer.zero_grad()

                # forward pass
                loss = self.criterion(self._logits(self.X_train[a:b]), self.y_train[a:b])

                # backward pass
                loss.backward()
                self.optimizer.step()

            # get loss following epoch (reporting only, no gradients needed)
            with torch.no_grad():
                loss = self.criterion(self._logits(self.X_train), self.y_train)
            print('Epoch {}: train loss: {}'.format(epoch, loss.item()))

            # shuffle dataset
            self.shuffle()

        # display final loss
        self.eval()  # back to eval mode
        with torch.no_grad():
            after_train = self.criterion(self._logits(self.X_train), self.y_train)
        print('Training loss after training', after_train.item())

    def predict(self):
        """
        Generates predictions from model, using test data.

        Stores the class probabilities in ``self.probs`` and the index of the
        most probable class per row in ``self.preds`` (first index on ties).
        """
        with torch.no_grad():
            y_pred = self.forward(self.X_test)
        self.probs = y_pred.detach().numpy()
        self.preds = np.argmax(self.probs, axis=1)

    def shuffle(self):
        """
        Samples and shuffles training data.

        Draws up to ``samples0``/``samples1``/``samples2`` random rows from the
        three classes and stores them, jointly permuted, in ``X_train`` and
        ``y_train``.
        """
        # create permutations for shuffling
        p0 = torch.randperm(self.l0)
        p1 = torch.randperm(self.l1)
        p2 = torch.randperm(self.l2)

        # sample data; slicing clamps to the rows that are actually available
        X = torch.cat((self.X0[p0][:self.samples0], self.X1[p1][:self.samples1], self.X2[p2][:self.samples2]))
        y = torch.cat((self.y0[:self.samples0], self.y1[:self.samples1], self.y2[:self.samples2]))

        # permute features and labels with the same permutation
        p = torch.randperm(X.shape[0])
        self.X_train = X[p]
        self.y_train = y[p]
|
|
||||||
@ -1,57 +0,0 @@
|
|||||||
{
|
|
||||||
"dataset_reader": {
|
|
||||||
"type": "citation_dataset_reader"
|
|
||||||
},
|
|
||||||
"train_data_path": "data/jsonl/train.jsonl",
|
|
||||||
"validation_data_path": "data/jsonl/dev.jsonl",
|
|
||||||
"test_data_path": "data/jsonl/test.jsonl",
|
|
||||||
"evaluate_on_test": true,
|
|
||||||
"model": {
|
|
||||||
"type": "basic_bilstm_classifier",
|
|
||||||
"text_field_embedder": {
|
|
||||||
"token_embedders": {
|
|
||||||
"tokens": {
|
|
||||||
"pretrained_file": "/mount/arbeitsdaten/studenten1/team-lab-nlp/mandavsi_rileyic/glove.6B.100d.txt.gz",
|
|
||||||
"type": "embedding",
|
|
||||||
"embedding_dim": 100,
|
|
||||||
"trainable": false
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"encoder": {
|
|
||||||
"type": "lstm",
|
|
||||||
"input_size": 1124,
|
|
||||||
"hidden_size": 100,
|
|
||||||
"num_layers": 1,
|
|
||||||
"bidirectional": true
|
|
||||||
},
|
|
||||||
"elmo": {
|
|
||||||
"options_file": "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json",
|
|
||||||
"weight_file": "/mount/arbeitsdaten/studenten1/team-lab-nlp/mandavsi_rileyic/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5",
|
|
||||||
"do_layer_norm": true,
|
|
||||||
"dropout": 0.5,
|
|
||||||
"num_output_representations": 1
|
|
||||||
},
|
|
||||||
"use_input_elmo": true,
|
|
||||||
"classifier_feedforward": {
|
|
||||||
"input_dim": 200,
|
|
||||||
"num_layers": 2,
|
|
||||||
"hidden_dims": [20, 3],
|
|
||||||
"activations": ["linear", "linear"]
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"data_loader": {
|
|
||||||
"batch_sampler": {
|
|
||||||
"type": "bucket",
|
|
||||||
"batch_size" : 16
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"trainer": {
|
|
||||||
"optimizer": {
|
|
||||||
"type": "adagrad",
|
|
||||||
"lr": 0.005
|
|
||||||
},
|
|
||||||
"num_epochs": 10,
|
|
||||||
"cuda_device": 3
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@ -1,14 +0,0 @@
|
|||||||
from allennlp.modules.elmo import Elmo, batch_to_ids
|
|
||||||
|
|
||||||
# Smoke test: build an ELMo module from the published 5.5B weights/options
# files and print the character ids and the resulting representations for one
# tokenized sentence.
weights_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'
options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json'

# One output representation, no dropout.
elmo = Elmo(options_file, weights_file, 1, dropout=0)

# A single, already tokenized sentence.
text = ['Backgammon', 'is', 'one', 'of', 'the', 'oldest', 'known', 'board', 'games']

# batch_to_ids expects a batch of tokenized sentences (a list of token lists).
# Passing the flat token list made every word its own "sentence" whose tokens
# were the individual characters, so the sentence is wrapped in a batch here.
batch = batch_to_ids([text])
print(batch)

# Call the module rather than .forward(), and do not shadow the builtin `dict`.
elmo_output = elmo(batch)

print(elmo_output['elmo_representations'])
|
|
||||||
@ -1,48 +0,0 @@
|
|||||||
import feature_extraction.lexicons as lexicons
|
|
||||||
from utils.constants import REGEX_CONSTANTS
|
|
||||||
|
|
||||||
# Names of all features that can be extracted from an input string.
FEATURE_LIST = [
    'COMPARE', 'CONTRAST', 'RESULT', 'INCREASE', 'CHANGE', 'USE', 'PRESENT',
    'IMPORTANT', 'RESEARCH', 'APPROACH', 'PUBLIC', 'BEFORE', 'BETTER_SOLUTION',
    'PROFESSIONALS', 'MEDICINE', 'MATH', 'COMPUTER_SCIENCE', 'CITATION',
    'ACRONYM', 'CONTAINS_YEAR', 'SEQUENCE', 'REFERENCE', 'PERCENTAGE',
    'CONTAINS_URL', 'ENDS_WITH_RIDE', 'ENDS_WITH_RINE', 'ENDS_WITH_ETHYL',
]

# Name of the theta-bias feature; it has to be part of the feature list of
# every data instance.
THETA_BIAS_FEATURE = 'THETA_BIAS'
|
|
||||||
|
|
||||||
|
|
||||||
def extract_features_from_text(text: str):
    """
    Extract the names of all features that fire for the given text.

    The result always starts with ``THETA_BIAS_FEATURE``. Every name in
    ``FEATURE_LIST`` is then checked in order and appended when it fires:

    - a feature that is a key of ``REGEX_CONSTANTS`` fires when its compiled
      pattern is found anywhere in the original (case-preserved) text;
    - any other feature fires when at least one entry of its word list in
      ``lexicons.ALL_LEXICONS`` occurs as a substring of the lower-cased text.

    :param text: text to extract features from
    :return: list of feature names; holds only the bias feature when nothing fires
    """
    lexicon_dict = lexicons.ALL_LEXICONS

    # Lower-case once up front: the lexicon check is a substring test against
    # the lower-cased text, and it used to re-lower the text for every word.
    lowered_text = text.lower()

    # The bias feature must be present for every data instance.
    text_feature_list = [THETA_BIAS_FEATURE]

    for feature in FEATURE_LIST:
        # Regex features are matched against the raw text, because some
        # patterns depend on upper-case characters.
        if feature in REGEX_CONSTANTS:
            if REGEX_CONSTANTS[feature].search(text):
                text_feature_list.append(feature)
            continue

        # Lexicon features: one matching entry is enough; any() stops at the
        # first hit.
        if any(word in lowered_text for word in lexicon_dict[feature]):
            text_feature_list.append(feature)

    return text_feature_list
|
|
||||||
@ -1,74 +0,0 @@
|
|||||||
"""
|
|
||||||
Dictionary of Lexicons used for Feature Extraction
|
|
||||||
"""
|
|
||||||
# Maps a feature name to the list of lower-case words / word stems that
# trigger it. Entries are matched as substrings of the lower-cased text by the
# feature extractor, which is why many of them are stems ('compar', 'evaluat').
ALL_LEXICONS = {

    'COMPARE': ['compar', 'compet', 'evaluat', 'test', 'superior', 'inferior', 'better', 'best', 'good', 'low',
                'wors', 'great', 'larger', 'faster', 'high', 'measur', 'between', 'another', 'similar'],

    # 'different' and 'distinct' used to be written without a separating comma,
    # which Python concatenated into the useless entry 'differentdistinct'.
    'CONTRAST': ['contrast', 'different', 'distinct', 'conflict', 'disagree', 'oppose', 'distinguish', 'contrary'],

    'RESULT': ['estimate', 'evidence', 'experiment', 'find', 'progress', 'observation', 'outcome', 'result',
               'performance'],

    'INCREASE': ['increase', 'grow', 'intensify', 'build up', 'explode'],

    'CHANGE': ['adapt', 'adjust', 'augment', 'combine', 'change', 'decrease', 'elaborate', 'expand', 'expand on',
               'extend', 'derive', 'incorporate', 'increase', 'manipulate', 'modify', 'optimize', 'optimise', 'refine',
               'render', 'replace', 'revise', 'substitute', 'tailor', 'upgrade', 'grow'],

    'USE': ['use', 'using', 'apply', 'applied', 'employ', 'make use', 'utilize', 'implement'],

    'PRESENT': ['describe', 'discuss', 'give', 'introduce', 'note', 'notice', 'present', 'propose', 'recapitulate',
                'demonstrate', 'remark', 'report', 'say', 'show', 'sketch', 'state', 'suggest', 'figure', 'indicate',
                'specify', 'explain'],

    'IMPORTANT': ['important', 'main', 'key', 'basic', 'central', 'crucial', 'critical', 'essential', 'fundamental',
                  'great', 'largest', 'major', 'overall', 'primary', 'principle', 'serious', 'substantial', 'ultimate',
                  'significant', 'remarkable', 'noteworthy', 'crucial', 'emerge'],

    'RESEARCH': ['research', 'paper', 'study', 'studie', 'apply', 'analyze', 'characteri', 'formali', 'investigat',
                 'implement', 'interpret', 'examin', 'observ', 'predict', 'verify', 'work on', 'empirical', 'determin',
                 'experiment', 'exploratory', 'ongoing', 'quantitative', 'qualitative', 'preliminary', 'statistical',
                 'knowledge', 'underway', 'discuss', 'reference', 'publish', 'document', 'orientation',
                 'literature', 'experience'],

    'APPROACH': ['approach', 'account', 'algorithm', 'analys', 'approach', 'application', 'architecture', 'characteri',
                 'component', 'design', 'extension', 'formali', 'framework', 'implement', 'investigat', 'machine',
                 'method', 'methodology', 'module', 'process', 'procedure', 'program', 'prototype', 'strateg',
                 'system', 'technique', 'theory', 'tool', 'treatment'],

    'PUBLIC': ['acknowledge', 'admit', 'agree', 'assert', 'claim', 'complain', 'declare', 'deny', 'explain',
               'hint', 'insist', 'mention', 'proclaim', 'promise', 'protest', 'remark', 'reply', 'report', 'say',
               'suggest', 'swear', 'write'],

    'BEFORE': ['earlier', 'initial', 'past', 'previous', 'prior'],

    'BETTER_SOLUTION': ['boost', 'enhance', 'defeat', 'improve', 'perform better', 'outperform', 'outweigh',
                        'surpass'],

    'PROFESSIONALS': ['colleagues', 'community', 'computer scientists', 'computational linguists',
                      'discourse analysts', 'expert', 'investigators', 'linguists', 'philosophers', 'psycholinguists',
                      'psychologists', 'researchers', 'scholars', 'semanticists', 'scientists'],

    'MEDICINE': ['medicine', 'tissue', 'gene', 'inflammatory', 'mutant', 'neuro', 'digest', 'ortho', 'kinase',
                 'clinical', 'therap', 'kidney', 'receptor', 'cancer', 'synthesis', 'protein', 'syndrom', 'toxin',
                 'death', 'pharma', 'heart', 'disease', 'vitamin', 'tumor', 'blind', 'symptom', 'medical', 'vaccin',
                 'molecule', 'biotic', 'patient', 'cells', 'immune', 'blood', 'plasma', 'diagnos', 'neura',
                 'reproductive', 'plasm', 'drug', 'membrane', 'muscle', 'contagious', 'inflam', 'physician', 'dna',
                 'genome', 'bacteria', 'cavity', 'injury', 'antibodies', 'liver', 'treatment', 'pcr', 'acid',
                 'chronic', 'respirat', 'oxygen', 'stroke', 'antioxidant', 'obesity', 'metabolic', 'transmission',
                 'endogenous', 'syndrome', 'ultrasound', 'pathogen', 'inject', 'laparoscop', 'circulat', 'ventricle',
                 'tract', 'pneumonia', 'calcium', 'rna', 'organism', 'biolog', 'x-ray'],

    'MATH': ['matrix', 'gaussian', 'variance', 'radius', 'function', 'comput', 'once', 'twice', 'thrice', 'diagram',
             'mean', 'vector', 'rectangle', 'logic', 'amount', 'maxim', 'minim', 'linear', 'magnitude', 'theorem',
             'gradient', 'median', 'exponential', 'complex', 'graph', 'mean', 'equation', 'offset', 'calculat',
             'coefficient', 'discrete', 'equation', 'frequen', 'math', 'correlation', 'outcome', 'divergence',
             'differentiation', 'statistic', 'parameter', 'probabilit', 'multivariate', 'negative', 'positive',
             'regression', 'digit'],

    'COMPUTER_SCIENCE': ['database', 'software', 'evaluation', 'framework', 'computer', 'network',
                         'algorithm', 'dataset', 'data sets', 'technology', 'kernel', 'metrics', 'nlp', 'xml',
                         'corpus', 'uml', 'system', 'security', 'protocol', 'classification', 'data transform',
                         'memory', 'java', 'python', 'cluster', 'epoch', 'training', 'deadlock', 'technique'],

    'CITATION': ['et al'],

}
|
|
||||||
|
Before Width: | Height: | Size: 34 KiB |
|
Before Width: | Height: | Size: 33 KiB |
|
Before Width: | Height: | Size: 32 KiB |
|
Before Width: | Height: | Size: 32 KiB |
|
Before Width: | Height: | Size: 32 KiB |
Binary file not shown.
@ -1,10 +0,0 @@
|
|||||||
allennlp==1.0.0
|
|
||||||
jsonlines==1.2.0
|
|
||||||
matplotlib==3.3.0
|
|
||||||
numpy==1.19.0
|
|
||||||
overrides==3.0.0
|
|
||||||
scikit-learn==0.23.1
|
|
||||||
six==1.15.0
|
|
||||||
spacy==2.2.4
|
|
||||||
torch==1.5.1
|
|
||||||
torchvision==0.6.1
|
|
||||||
@ -1,2 +0,0 @@
|
|||||||
from utils.reader import *
|
|
||||||
from classifier.nn import *
|
|
||||||
@ -1,14 +0,0 @@
|
|||||||
import classifier.intent_predictor as pred
|
|
||||||
|
|
||||||
import eval.metrics as metrics
|
|
||||||
|
|
||||||
# Directory of the saved model (experiment 4) that is evaluated below.
saved_model_dir = '/mount/arbeitsdaten/studenten1/team-lab-nlp/mandavsi_rileyic/saved_models/experiment_4'

# Predicted and true labels returned by the predictor helper for that model.
y_pred, y_true = pred.load_model_and_predict_test_data(saved_model_dir)
confusion_matrix = metrics.get_confusion_matrix(y_true, y_pred)

# Header line followed by the matrix on its own line.
print("Confusion Matrix :: ", confusion_matrix, sep="\n")

# The plot is written next to the saved model.
plot_file_path = f"{saved_model_dir}/confusion_matrix_plot.png"
metrics.plot_confusion_matrix(confusion_matrix, "BiLSTM Classifier + Attention with ELMo", plot_file_path)
|
|
||||||
@ -1,27 +0,0 @@
|
|||||||
import os
|
|
||||||
from utils.csv import read_csv_file
|
|
||||||
|
|
||||||
# Manual check of the TSV reader: load the training split and print the first
# 20 instances. The unused counter / token-frequency dictionary and the
# commented-out experiment that used them were removed.
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

train_file_path = project_root+'/data/tsv/train.tsv'
test_file_path = project_root+'/data/tsv/test.tsv'

print(train_file_path)

data = read_csv_file(csv_file_path=train_file_path, delimiter='\t')

for inst in data[:20]:
    inst.print()
|
|
||||||
@ -1,34 +0,0 @@
|
|||||||
import sys
|
|
||||||
import os
|
|
||||||
sys.path.append(os.getcwd())
|
|
||||||
from classifier.nn_ff import FeedForward
|
|
||||||
from sklearn.metrics import f1_score
|
|
||||||
from eval.metrics import plot_confusion_matrix, get_confusion_matrix
|
|
||||||
|
|
||||||
# Train the feed-forward baseline, score it and plot its confusion matrix.
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

# NOTE(review): the three positional sizes are presumably input / hidden /
# output dimensions (27 extracted features + bias = 28 inputs, 3 classes);
# confirm against classifier.nn_ff.FeedForward.
clf = FeedForward(28, 9, 3)
clf.fit()
clf.predict()

# Predictions and gold labels are exposed as attributes after predict().
# The predictions used to be stored in a variable called `y_test`, which read
# like gold labels; the unused `labels` set was dropped.
y_pred = clf.preds
y_true = clf.y_test

# Model Evaluation
f1_score_micro = f1_score(y_true, y_pred, average='micro')
f1_score_macro = f1_score(y_true, y_pred, average='macro')

# Print F1 Score
print('F1 score (micro): ', f1_score_micro)
print('F1 score (macro): ', f1_score_macro)

# plot confusion matrix: map the class indices back to their label names first
classdict = {0: 'background', 1: 'method', 2: 'result'}
y_pred = [classdict[x] for x in y_pred]
y_true = [classdict[x] for x in y_true]
plot_path = project_root + '/plots/confusion_matrix_plot_ff.png'
plot_confusion_matrix(get_confusion_matrix(y_true, y_pred), 'Feed-forward NN Classifier (Baseline)', plot_path)
|
|
||||||
|
|
||||||
|
|
||||||
@ -1,45 +0,0 @@
|
|||||||
from classifier.linear_model import MultiClassPerceptron
|
|
||||||
from utils.csv import read_csv_file
|
|
||||||
from eval.metrics import f1_score, plot_confusion_matrix, get_confusion_matrix
|
|
||||||
import utils.constants as const
|
|
||||||
import os
|
|
||||||
|
|
||||||
# Train the multi-class perceptron baseline on the TSV training split,
# evaluate it on the test split and plot the confusion matrix.
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
train_file_path = f"{project_root}/data/tsv/train.tsv"
test_file_path = f"{project_root}/data/tsv/test.tsv"

# Training and test instances
X_train_inst = read_csv_file(train_file_path, '\t')
X_test_inst = read_csv_file(test_file_path, '\t')

# Distinct labels seen in the training data
labels = {inst.true_label for inst in X_train_inst}

# Number of training iterations
epochs = 50

# Build and train the classifier
clf = MultiClassPerceptron(epochs=epochs, learning_rate=0.7, random_state=101)
clf.fit(X_train=X_train_inst, labels=list(labels))

# Predicted vs. true labels of the test instances
y_test = clf.predict(X_test_inst)
y_true = [inst.true_label for inst in X_test_inst]

# F1 with micro averaging, macro averaging and without averaging
f1_score_micro = f1_score(y_true, y_test, labels, const.AVG_MICRO)
f1_score_macro = f1_score(y_true, y_test, labels, const.AVG_MACRO)
f1_score_none = f1_score(y_true, y_test, labels, None)

# Print every result, in the order micro -> macro -> unaveraged
all_results = f1_score_micro + f1_score_macro + f1_score_none
for result in all_results:
    result.print_result()

# Confusion matrix plot
plot_path = f"{project_root}/plots/confusion_matrix_plot.png"
plot_confusion_matrix(get_confusion_matrix(y_true, y_test), 'Perceptron Classifier (Baseline)', plot_path)
|
|
||||||
@ -1,66 +0,0 @@
|
|||||||
#import os
|
|
||||||
#os.chdir('/Users/iriley/code/citation-analysis')
|
|
||||||
import sys
|
|
||||||
sys.path.append('/Users/iriley/code/citation-analysis')
|
|
||||||
from classifier.linear_model import MultiClassPerceptron
|
|
||||||
from sklearn.metrics import confusion_matrix as cm
|
|
||||||
from utils.csv import read_csv_file
|
|
||||||
from eval.metrics import f1_score
|
|
||||||
import utils.constants as const
|
|
||||||
import pandas as pd
|
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Train the perceptron baseline and dump per-instance predictions and class
# scores to a CSV file for inspection.
# NOTE: despite its name, dev_file_path points at the test split.
train_file_path = '/Users/iriley/code/citation-analysis/data/tsv/train.tsv'
dev_file_path = '/Users/iriley/code/citation-analysis/data/tsv/test.tsv'

# Training instances and the distinct labels they carry
X_train_inst = read_csv_file(train_file_path, '\t')
labels = {inst.true_label for inst in X_train_inst}

# Instances to predict
X_dev_inst = read_csv_file(dev_file_path, '\t')

# Number of training iterations
epochs = 50

# Build and train the classifier
clf = MultiClassPerceptron(epochs=epochs, learning_rate=0.5, random_state=101)
clf.fit(X_train=X_train_inst, labels=list(labels))

# Predicted labels, per-class scores and true labels
y_pred = clf.predict(X_dev_inst)
y_scores = np.array(clf.get_class_scores(X_dev_inst))
y_true = [inst.true_label for inst in X_dev_inst]

# Encode label names as integers
labeldict = {'background': 0, 'method': 1, 'result': 2}
y_pred = np.array([labeldict[name] for name in y_pred])
y_true = np.array([labeldict[name] for name in y_true])

conmat = cm(y_true, y_pred)

# One row per instance: prediction, truth, correctness and the rounded score
# of each of the three classes.
columns = {
    'pred': y_pred,
    'true': y_true,
    'correct': y_pred == y_true,
}
for class_idx in range(3):
    columns[f'score{class_idx}'] = np.round(y_scores[:, class_idx], 3)
df = pd.DataFrame(columns)

df.to_csv('/Users/iriley/code/machine_learning/lab2020/y_pred_model1.csv', index=False)

## Model Evaluation
#f1_score_micro = f1_score(y_true, y_pred, labels, const.AVG_MICRO)
#f1_score_macro = f1_score(y_true, y_pred, labels, const.AVG_MACRO)
#f1_score_none = f1_score(y_true, y_pred, labels, None)

## Print F1 Score
#for result in f1_score_micro + f1_score_macro + f1_score_none:
#    result.print_result()
|
|
||||||
@ -1,37 +1,2 @@
|
|||||||
import re
|
|
||||||
|
|
||||||
|
|
||||||
AVG_MICRO = 'MICRO'
|
AVG_MICRO = 'MICRO'
|
||||||
AVG_MACRO = 'MACRO'
|
AVG_MACRO = 'MACRO'
|
||||||
|
|
||||||
# Compiled patterns for the regex-based features, keyed by feature name.
# Several patterns used to contradict their own documented examples; each fix
# is explained next to the pattern.
REGEX_CONSTANTS = {

    # Acronyms -> COVID-19 / SEKA / SMY2 / EAP1 / SCP16 / ASC1 / DENV-2 (an optional leading
    # 'm' as in mRNA is allowed).
    # The old pattern put `\b` inside character classes, where it means a backspace character,
    # and demanded whitespace/punctuation around the acronym, so acronyms at the end of the
    # text or with a numeric suffix (SMY2, COVID-19) never matched. Look-arounds now express
    # "not glued to another letter or digit".
    'ACRONYM': re.compile(r"(?<![A-Za-z0-9])m?[A-Z]{2,}(?:-?[0-9]+)?(?![A-Za-z0-9])"),

    # Years 1800-2029 -> 1995 / 2020 / 2019
    # Negative look-arounds instead of `[^0-9]` / `[^0-9$]`: the old form required a real
    # character on both sides (and `$` in a class is a literal dollar sign), so a year at the
    # very start or end of the text was missed.
    'CONTAINS_YEAR': re.compile(r"(?<![0-9])(?:1[89][0-9]{2}|20[0-2][0-9])(?![0-9])"),

    # Parenthesized number sequences -> (15) / (10, 11, 112, 113) / (1,7,8,10-14)
    # The old pattern `\([\d.*]\)` was a one-character class and only matched things like
    # "(1)", "(.)" or "(*)". It now matches comma/dash separated integers; "(.)" and "(*)"
    # deliberately no longer count as sequences.
    'SEQUENCE': re.compile(r"\(\s*\d+(?:\s*[,\-\u2013]\s*\d+)*\s*\)"),

    # Bracketed references -> [4] / [ 10-17, 19, 20] / [123, 500]
    # Leading whitespace after '[' is now accepted (second example) and the match stops at
    # the first closing bracket instead of running greedily to the last one.
    'REFERENCE': re.compile(r"\[\s*\d[^\]]*\]"),

    # Percentages -> 99% / 99.99% / 10 % / 23.98% / 10-20% / 25%-30%
    # `*` instead of `+` so single-digit values ("5%") match, plus an optional space before '%'.
    'PERCENTAGE': re.compile(r"\d[\d.\-]*\s?%"),

    # URLs -> http://www.phrap.org/, http://www. , http://carcfordjournals.
    'CONTAINS_URL': re.compile(r"https?://\S+"),

    # Word endings
    'ENDS_WITH_RIDE': re.compile(r"ride\b"),

    'ENDS_WITH_RINE': re.compile(r"rine\b"),

    'ENDS_WITH_ETHYL': re.compile(r"ethyl\b"),

}
|
|
||||||
|
|
||||||
CLASS_LABELS = {"background": 0, "method": 1, "result": 2}
|
|
||||||
CLASS_LABELS_LIST = ['background', 'method', 'result']
|
|
||||||
@ -1,17 +0,0 @@
|
|||||||
class Citation(object):
    """
    Plain value holder for a single citation record.

    ``text`` and the two paper ids are required; the section title, intent
    and citation id are optional and default to ``None``.
    """

    def __init__(self, text, citing_paper_id, cited_paper_id,
                 section_title=None, intent=None, citation_id=None):
        # Required fields
        self.text, self.citing_paper_id, self.cited_paper_id = (
            text, citing_paper_id, cited_paper_id)
        # Optional fields
        self.section_title, self.intent, self.citation_id = (
            section_title, intent, citation_id)
|
|
||||||
@ -0,0 +1,13 @@
|
|||||||
|
|
||||||
|
class DataInstance:
    """
    Model class carrying one training / testing record read from a tsv/csv file.
    """

    def __init__(self, r_id, text, true_label):
        # The record id is stored under the attribute name `did`.
        self.did, self.text, self.true_label = r_id, text, true_label

    def print(self):
        """Write the true label and the text of this instance to stdout."""
        parts = ('True Label :: ', self.true_label, ' Text :: ', self.text)
        print(*parts)
|
||||||
@ -1,57 +0,0 @@
|
|||||||
import numpy as np
|
|
||||||
from itertools import chain
|
|
||||||
from utils.csv import read_csv_file
|
|
||||||
|
|
||||||
# TODO: clean up, transform into class, allow for command-line arguments
|
|
||||||
|
|
||||||
def _one_hot_features(feature_lists, feature_names):
    """Return a (len(feature_lists), len(feature_names)) 0/1 matrix marking the features of each instance."""
    column_of = {name: col for col, name in enumerate(feature_names)}
    matrix = np.zeros((len(feature_lists), len(feature_names)))
    for row, instance_features in enumerate(feature_lists):
        for name in instance_features:
            col = column_of.get(name)
            # Features that are not part of the vocabulary get no column.
            if col is not None:
                matrix[row, col] = 1
    return matrix


def _one_hot_labels(raw_labels, label_names):
    """Return a (len(raw_labels), len(label_names)) one-hot matrix of the labels."""
    raw = np.array(raw_labels)
    encoded = np.zeros((len(raw_labels), len(label_names)))
    for col, name in enumerate(label_names):
        encoded[:, col] = raw == name
    return encoded


def read_csv_nn(scicite_dir=None):
    """
    Read the train and test TSV files and encode them as numpy matrices.

    The feature vocabulary (columns of X) and the label set (columns of y) are
    both taken from the training data and reused for the test data. Features
    of test instances that never occur in the training data are ignored.

    Both vocabularies are sorted. The feature columns previously followed the
    iteration order of a ``set`` of strings, which can change from one
    interpreter run to the next, so a saved model could meet a different
    column layout. Sorting assumes the feature names are mutually comparable
    (presumably strings - TODO confirm against the reader).

    :param scicite_dir: currently unused; the relative paths 'data/tsv/train.tsv'
                        and 'data/tsv/test.tsv' are always read
    :return: tuple ``(X_train, y_train, X_test, y_test)`` of 0/1 matrices
    """
    train_file_path = 'data/tsv/train.tsv'
    test_file_path = 'data/tsv/test.tsv'
    train_raw = read_csv_file(train_file_path, '\t')
    # The test file is read only once (it used to be read twice).
    test_raw = read_csv_file(test_file_path, '\t')

    train_features = [x.features for x in train_raw]
    features_unique = sorted(set(chain.from_iterable(train_features)))

    train_labels = [x.true_label for x in train_raw]
    y_unique = sorted(set(train_labels))

    X_train = _one_hot_features(train_features, features_unique)
    y_train = _one_hot_labels(train_labels, y_unique)

    X_test = _one_hot_features([x.features for x in test_raw], features_unique)
    y_test = _one_hot_labels([x.true_label for x in test_raw], y_unique)

    return X_train, y_train, X_test, y_test
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@ -1,33 +0,0 @@
|
|||||||
import numpy as np
|
|
||||||
from itertools import chain
|
|
||||||
from utils.csv import read_csv_file
|
|
||||||
|
|
||||||
|
|
||||||
def _one_hot_features(feature_lists, feature_names):
    """Return a (len(feature_lists), len(feature_names)) 0/1 matrix marking the features of each instance."""
    column_of = {name: col for col, name in enumerate(feature_names)}
    matrix = np.zeros((len(feature_lists), len(feature_names)))
    for row, instance_features in enumerate(feature_lists):
        for name in instance_features:
            col = column_of.get(name)
            if col is not None:
                matrix[row, col] = 1
    return matrix


def read_csv_nn_dev(scicite_dir=None):
    """
    Read the dev TSV file and encode it as numpy matrices.

    The feature vocabulary and the label set are derived from the dev data
    itself and are sorted. The feature columns previously followed the
    iteration order of a ``set`` of strings, which can change from one
    interpreter run to the next. Sorting assumes the feature names are
    mutually comparable (presumably strings - TODO confirm).

    NOTE(review): because the vocabulary comes from the dev data, the columns
    only line up with matrices built from another split when both splits
    contain exactly the same feature names - verify before feeding X_dev to a
    model trained on a different split.

    :param scicite_dir: currently unused; 'data/tsv/dev.tsv' is always read
    :return: tuple ``(X_dev, y_dev)`` of 0/1 matrices
    """
    dev_file_path = 'data/tsv/dev.tsv'
    dev_raw = read_csv_file(dev_file_path, '\t')

    features = [x.features for x in dev_raw]
    features_unique = sorted(set(chain.from_iterable(features)))
    X_dev = _one_hot_features(features, features_unique)

    y_dev_raw = np.array([x.true_label for x in dev_raw])
    y_unique = sorted(set(y_dev_raw))
    y_dev = np.zeros((len(features), len(y_unique)))
    for col, label in enumerate(y_unique):
        y_dev[:, col] = y_dev_raw == label

    return X_dev, y_dev
|
|
||||||
|
|
||||||
@ -1,97 +0,0 @@
|
|||||||
from typing import Iterable
|
|
||||||
|
|
||||||
import jsonlines
|
|
||||||
from allennlp.data import Instance
|
|
||||||
from allennlp.data.dataset_readers import DatasetReader
|
|
||||||
from allennlp.data.fields import TextField, LabelField
|
|
||||||
from allennlp.data.token_indexers import SingleIdTokenIndexer, ELMoTokenCharactersIndexer
|
|
||||||
from allennlp.data.tokenizers import SpacyTokenizer
|
|
||||||
from overrides import overrides
|
|
||||||
|
|
||||||
from utils.data import Citation
|
|
||||||
|
|
||||||
|
|
||||||
@DatasetReader.register("citation_dataset_reader")  # type for config files
class CitationDataSetReader(DatasetReader):
    """
    DatasetReader subclass that turns a JSON Lines dataset (train|dev|test)
    into AllenNLP Instances.

    Texts are tokenized with a default SpacyTokenizer. The class is registered
    under the name "citation_dataset_reader" so that config files can refer to
    it through their "type" entry.
    """

    def __init__(self):
        super().__init__()
        # default Spacy Tokenizer
        self.tokenizer = SpacyTokenizer()

    @overrides
    def _read(self, file_path: str) -> Iterable[Instance]:
        """
        Lazily read the JSON Lines file and yield one Instance per citation.

        :param file_path: path of the JSON Lines file
        :return: iterable of Instances, each with tokens and a label
        """
        for citation in DataReaderJsonLines(file_path).read():
            yield self.text_to_instance(citation_text=citation.text, intent=citation.intent)

    @overrides
    def text_to_instance(self, citation_text: str,
                         intent: str) -> Instance:
        """
        Build an Instance from a citation text and its label.

        :param citation_text: text from the data point
        :param intent: true label of the data instance
        :return: Instance with a 'tokens' TextField and a 'label' LabelField
        """
        # The tokens are indexed twice: as ELMo character ids and as single token ids.
        tokens_field = TextField(
            self.tokenizer.tokenize(citation_text),
            {"elmo": ELMoTokenCharactersIndexer(),
             "tokens": SingleIdTokenIndexer()},
        )
        return Instance({'tokens': tokens_field,
                         'label': LabelField(intent)})
|
|
||||||
|
|
||||||
|
|
||||||
class DataReaderJsonLines:
    """
    Small helper that reads jsonl (JSON Lines) files.
    """

    def __init__(self, file_path):
        self.file_path = file_path

    def read(self):
        """
        Open the file and lazily convert every line with ``read_json_line``.

        The file stays open only while the generator is being consumed.

        :return: generator of the objects produced by ``read_json_line``
        """
        with jsonlines.open(self.file_path) as jl_reader:
            yield from (read_json_line(entry) for entry in jl_reader)
|
|
||||||
|
|
||||||
|
|
||||||
def read_json_line(line):
    """
    Convert one parsed JSON line into a Citation.

    :param line: dictionary of one JSON line; the keys 'string', 'citingPaperId',
                 'citedPaperId', 'sectionName', 'label' and 'id' are read
    :return: Citation object filled from those keys
    """
    # Citation argument name -> key in the JSON line (looked up in this order)
    argument_keys = (
        ('text', 'string'),
        ('citing_paper_id', 'citingPaperId'),
        ('cited_paper_id', 'citedPaperId'),
        ('section_title', 'sectionName'),
        ('intent', 'label'),
        ('citation_id', 'id'),
    )
    return Citation(**{argument: line[key] for argument, key in argument_keys})
|
|
||||||
Loading…
Reference in new issue