Compare commits
No commits in common. 'master' and 'pavan' have entirely different histories.
@@ -1,2 +0,0 @@
classifier
utils
Binary file not shown.
@@ -1,3 +0,0 @@
from .nn import *
from utils.reader import *
from .intent_predictor import *
@@ -1,105 +0,0 @@
from typing import Tuple

from allennlp.common import JsonDict
from allennlp.data import Instance
from allennlp.predictors import Predictor
from overrides import overrides
from allennlp.models import Model
from allennlp.data.dataset_readers import DatasetReader
from allennlp.models.archival import load_archive
from utils.reader import DataReaderJsonLines, CitationDataSetReader

import os


@Predictor.register('citation_intent_predictor')
class IntentClassificationPredictor(Predictor):
    """
    Predictor for the Citation Intent Classifier.

    This is a thin wrapper around an AllenNLP Model, used for making
    predictions from the trained/saved model.
    """

    def predict(self, text: str, intent: str):
        """
        Called for each data point from the test dataset. Takes the citation
        text and the target intent as parameters and returns the output
        dictionary from :func:`~classifier.nn.BiLstmClassifier.forward`.

        :param text: citation text from the test data
        :param intent: target intent of the data point
        :return: output dictionary from the model's forward method
        """
        return self.predict_json({"string": text, "label": intent})

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        """
        AllenNLP's Predictor calls back into this method, passing the JsonDict
        that we handed to predict_json above. The callback must return an
        AllenNLP Instance with tokens and the target label.

        :param json_dict: json dictionary with the text and intent label
        :return: AllenNLP Instance with (ELMo) tokens and the target label
        """
        return self._dataset_reader.text_to_instance(json_dict["string"], json_dict["label"])


def make_predictions(model: Model, dataset_reader: DatasetReader, dataset_file_path: str) -> Tuple[list, list]:
    """
    Takes the pre-trained (saved) model, a DatasetReader and a dataset file
    path, and returns a tuple of (prediction list, gold/true label list).

    - Creates a predictor object from the pre-trained model and dataset reader.
    - Reads the data from the given dataset file path and, for each data point,
      uses the predictor to predict the intent.

    :param model: a trained/saved AllenNLP Model
    :param dataset_reader: dataset reader (tokenizes text and creates Instances)
    :param dataset_file_path: path of the dataset file to predict on
    :return: tuple of (prediction list, true-label list)
    """
    # Create the predictor object
    predictor = IntentClassificationPredictor(model, dataset_reader)

    prediction_list = []
    true_list = []

    # Read the JSON Lines file and iterate through each data point to predict
    jsonl_reader = DataReaderJsonLines(dataset_file_path)
    for citation in jsonl_reader.read():
        true_list.append(citation.intent)
        output = predictor.predict(citation.text, citation.intent)
        prediction_list.append(output['prediction'])

    # Return the (predictions, gold labels) tuple
    return prediction_list, true_list


def load_model_and_predict_test_data(saved_model_dir: str):
    """
    Loads the saved model from the specified directory and calls
    make_predictions on the test set.

    :param saved_model_dir: path of the saved AllenNLP model (typically on the IMS common space)
    :return: tuple of (prediction list, true-label list)
    """
    project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    # dev path kept for convenience; only the test set is predicted below
    dev_file_path = project_root + '/data/jsonl/dev.jsonl'
    test_file_path = project_root + '/data/jsonl/test.jsonl'

    # Load the archived/saved model
    model_archive = load_archive(os.path.join(saved_model_dir, 'model.tar.gz'))

    # Create the dataset reader object
    citation_dataset_reader = CitationDataSetReader()

    # Make predictions
    y_pred, y_true = make_predictions(model_archive.model, citation_dataset_reader, test_file_path)

    return y_pred, y_true
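For orientation, a minimal usage sketch (the saved-model directory name is illustrative; any directory containing a model.tar.gz produced by training should work):

from classifier.intent_predictor import load_model_and_predict_test_data
from sklearn.metrics import f1_score

# hypothetical path; point this at a real archive directory
y_pred, y_true = load_model_and_predict_test_data('saved_models/experiment_4')
print('Macro F1:', f1_score(y_true, y_pred, average='macro'))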
@@ -1,240 +0,0 @@
from utils.csv import DataInstance
from feature_extraction.features import FEATURE_LIST, THETA_BIAS_FEATURE
from collections import OrderedDict
import random


class Perceptron:

    """
    The perceptron is an algorithm for supervised learning of binary classifiers:
    it decides whether or not an input (a set of features) belongs to a specific
    class. It is a linear classifier, making predictions by combining weights
    with the feature vector.
    """

    def __init__(self, label: str, weights: dict, theta_bias: float):
        """
        :type label: str
        :type weights: dict
        :type theta_bias: float

        :param label: label for this perceptron (useful for the multi-class perceptron)
        :param weights: dictionary mapping feature names to (randomly initialized) feature weights
        :param theta_bias: value of the theta bias variable, i.e. the threshold weight
        """
        self.label = label
        self.weights = weights
        self.theta_bias = theta_bias

    def score(self, features: list):
        """
        Takes a list of features and computes the score by summing the weights
        that correspond to those features.

        :type features: list

        :param features: list of features from a DataInstance
        :return: the computed score
        """
        score_val = 0
        for feature in features:
            score_val += self.weights[feature]

        return score_val

    def update_weights(self, features: list, learning_rate: float = 1.0, penalize: bool = False, reward: bool = False):
        """
        Updates the weights while training the perceptron classifier. Takes a
        list of features and increases or decreases the weight of each of those
        features according to the learning rate.

        :param features: list of features from the input DataInstance
        :param learning_rate: default is 1.0
        :param penalize: if True, decreases the weight of each feature. Default is False.
        :param reward: if True, increases the weight of each feature. Default is False.

        - If both penalize and reward are False, the weights are not updated.
        - If both penalize and reward are True, the reward assignment overwrites the
          penalty (both are computed from the same starting weight), so the net
          effect is a reward.
        """
        for feature in features:
            feature_weight = self.weights[feature]
            if penalize:
                self.weights[feature] = round(feature_weight - learning_rate, 5)
            if reward:
                self.weights[feature] = round(feature_weight + learning_rate, 5)


class MultiClassPerceptron:
    """
    The perceptron is a binary classifier and can only separate two classes.
    A multi-class perceptron can be used where one of several labels must be
    assigned to each data instance.

    The multi-class perceptron creates one Perceptron per label. During training
    it takes the score of each label's Perceptron, and the label with the highest
    score is the predicted label.

    If the predicted label differs from the instance's true label, the model
    updates the weights as follows:
    - decrease the weights of the predicted label's Perceptron (penalize)
    - increase the weights of the true label's Perceptron (reward)

    The model also shuffles the training data after each epoch.
    """
    def __init__(self, epochs: int = 5, learning_rate: float = 1.0, random_state: int = 42):
        """
        :type epochs: int
        :type learning_rate: float
        :type random_state: int

        :param epochs: number of training iterations
        :param learning_rate: learning rate for updating weights. Default is 1.0.
        :param random_state: random state for shuffling the data, useful for reproducing
                             results. Default is 42.
        """
        self.random_state = random_state
        self.perceptron_dict = OrderedDict()  # maps label -> Perceptron object for that label
        self.epochs = epochs
        self.learning_rate = learning_rate

    def fit(self, X_train: list, labels: list):
        """
        Takes the training data and labels and trains the model.

        :type X_train: list[DataInstance]
        :type labels: list[str]

        :param X_train: list of training DataInstances
        :param labels: list of classes
        """

        # Raise an exception if the labels parameter is empty
        if labels is None or len(labels) <= 0:
            raise Exception('The labels parameter must contain at least one label')

        # Raise an exception if the training data is empty
        if X_train is None or len(X_train) <= 0:
            raise Exception('Training data can\'t be empty')

        # Check the data type of the training instances
        if not isinstance(X_train, list) or not isinstance(X_train[0], DataInstance):
            raise Exception('Training data must be a list of type DataInstance (model)')

        train_len = len(X_train)

        # Create a new Perceptron object for each label and store it in the label -> Perceptron dictionary
        for label in labels:
            sample_weights = get_sample_weights_with_features(theta_bias=-0.25, random_state=self.random_state)
            self.perceptron_dict[label] = Perceptron(label, sample_weights, theta_bias=-0.25)

        # Training iterations
        for epoch in range(self.epochs):

            print('Training Epoch :: (', (epoch + 1), '/', self.epochs, ')')

            for i in range(train_len):

                # Pick the next instance from the (shuffled) training data
                inst = X_train[i]

                perceptron_scores = []  # perceptron score for each label
                for label, perceptron in self.perceptron_dict.items():
                    perceptron_scores.append(perceptron.score(inst.features))

                # find the max score in the list of scores
                max_score = max(perceptron_scores)

                # find the label that corresponds to the max score
                label_max_score = labels[perceptron_scores.index(max_score)]

                # If the highest-scoring label differs from this instance's label,
                # decrease the weights (penalize) of the highest-scoring label's Perceptron
                # and increase the weights (reward) of the true label's Perceptron
                if inst.true_label != label_max_score:
                    # decrease weights
                    self.perceptron_dict[label_max_score].update_weights(inst.features, self.learning_rate, penalize=True)
                    # increase weights
                    self.perceptron_dict[inst.true_label].update_weights(inst.features, self.learning_rate, reward=True)

            # It's important to shuffle the training data after every epoch
            random.Random(self.random_state).shuffle(X_train)

    def predict(self, X_test: list):
        """
        Takes the test instances and assigns each a predicted label.

        Takes the score from each Perceptron; the label with the highest score
        is the predicted label.

        :param X_test: list of test data instances
        :return: list of predicted labels
        """

        if X_test is None or len(X_test) <= 0:
            raise Exception('Testing data cannot be empty')

        print('Predicting..... ')

        y_test = []
        labels = list(self.perceptron_dict.keys())
        for test_inst in X_test:
            perceptron_scores = []  # perceptron score for each label
            for label in labels:
                perceptron_scores.append(self.perceptron_dict[label].score(test_inst.features))
            # find the max score in the list of scores
            max_score = max(perceptron_scores)

            label_max_score = labels[perceptron_scores.index(max_score)]
            y_test.append(label_max_score)

        return y_test

    def get_class_scores(self, X_test: list):
        """
        Takes the test instances and returns the raw per-label scores for each
        instance (the scores are not normalized probabilities).

        :param X_test: list of test data instances
        :return: list of per-label score lists, one per instance
        """

        if X_test is None or len(X_test) <= 0:
            raise Exception('Testing data cannot be empty')

        print('Predicting..... ')

        y_test = []
        labels = list(self.perceptron_dict.keys())
        for test_inst in X_test:
            perceptron_scores = []  # perceptron score for each label
            for label in labels:
                perceptron_scores.append(self.perceptron_dict[label].score(test_inst.features))
            y_test.append(perceptron_scores)

        return y_test


def get_sample_weights_with_features(theta_bias: float = 0.0, random_state: int = 42):
    """
    Creates a dictionary with each feature name as key and a random float (the
    feature weight) as value. Each feature weight is a float between -1 and 1.

    :type theta_bias: float
    :type random_state: int

    :param theta_bias: value of the theta bias variable
    :param random_state: random seed for reproducing results

    :return: a dictionary of random weights, one per feature
    """
    weights = {THETA_BIAS_FEATURE: theta_bias}
    # Seed once, outside the loop; reseeding on every iteration would give
    # every feature the identical "random" weight.
    random.seed(random_state)
    for feature in FEATURE_LIST:
        weights[feature] = round(random.uniform(-1.0, 1.0), 5)

    return weights
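A toy training sketch (hand-built instances; the `features` attribute is normally attached by utils.csv.read_csv_file, so it is set by hand here, using names from FEATURE_LIST so the weight lookup succeeds):

from classifier.linear_model import MultiClassPerceptron
from feature_extraction.features import THETA_BIAS_FEATURE
from utils.csv import DataInstance

a = DataInstance(1, 'we use the method of X', 'method')
a.features = [THETA_BIAS_FEATURE, 'USE', 'APPROACH']
b = DataInstance(2, 'results show a clear increase', 'result')
b.features = [THETA_BIAS_FEATURE, 'RESULT', 'INCREASE']

clf = MultiClassPerceptron(epochs=3)
clf.fit([a, b], labels=['method', 'result'])
print(clf.predict([a, b]))  # ideally ['method', 'result'] once the updates separate the two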
@@ -1,211 +0,0 @@
from typing import Dict

import numpy as np
import torch
from allennlp.common.checks import ConfigurationError
from allennlp.data import Vocabulary
from allennlp.models import Model
from allennlp.modules import TextFieldEmbedder, Seq2SeqEncoder, FeedForward, Elmo
from allennlp.nn import util
from allennlp.training.metrics import CategoricalAccuracy, F1Measure
from overrides import overrides
from torch.nn import Parameter


@Model.register("basic_bilstm_classifier")
class BiLstmClassifier(Model):

    """
    Two things to note first:
    - This BiLstmClassifier is a subclass of AllenNLP's Model class.
    - The class registers the type "basic_bilstm_classifier" via the @Model.register()
      decorator; this is required for the config file to identify the model class.

    An AllenNLP Model is similar to a PyTorch Module: it implements a forward() method
    and returns an output dictionary with the loss, logits and more.

    The constructor parameters must match the configuration in the config file; the
    Vocabulary is built by the library/train pipeline after reading the data with the
    dataset reader.

    This model uses ELMo embeddings, a 1-layer BiLSTM (the encoder) and two
    feed-forward layers. The train command/pipeline calls the forward method on each
    batch of Instances, and forward returns the output dictionary with the loss,
    logits, label and F1 metrics.
    """

    def __init__(self, vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 classifier_feedforward: FeedForward,
                 elmo: Elmo = None,
                 use_input_elmo: bool = False):
        super().__init__(vocab)
        self.elmo = elmo
        self.use_elmo = use_input_elmo
        self.text_field_embedder = text_field_embedder
        self.num_classes = self.vocab.get_vocab_size("labels")
        self.encoder = encoder
        self.classifier_feed_forward = classifier_feedforward
        self.label_accuracy = CategoricalAccuracy()

        self.label_f1_metrics = {}

        # create an F1 measure for each class
        for i in range(self.num_classes):
            self.label_f1_metrics[vocab.get_token_from_index(index=i, namespace="labels")] = \
                F1Measure(positive_label=i)

        self.loss = torch.nn.CrossEntropyLoss()

        self.attention = Attention(encoder.get_output_dim())

    @overrides
    def forward(self, tokens: Dict[str, torch.LongTensor],
                label: torch.LongTensor) -> Dict[str, torch.LongTensor]:

        """
        The training loop passes each batch of Instances to this forward method.

        :param tokens: tokens from the Instance
        :param label: label from the data Instance

        :return: the output dictionary produced by running the inputs through the model
        """

        input_elmo = None
        # pop the "elmo" key and add it back later
        elmo_tokens = tokens.pop("elmo", None)

        embedded_text = self.text_field_embedder(tokens)
        text_mask = util.get_text_field_mask(tokens)

        if elmo_tokens is not None:
            tokens["elmo"] = elmo_tokens

        # Create ELMo embeddings if applicable
        if self.elmo:
            if elmo_tokens is not None:
                # get the ELMo representations from the tokens
                elmo_representations = self.elmo(elmo_tokens["elmo_tokens"])["elmo_representations"]
                if self.use_elmo:
                    input_elmo = elmo_representations.pop()
                    assert not elmo_representations
            else:
                raise ConfigurationError("Model was built to use Elmo, but input text is not tokenized for Elmo.")

        if self.use_elmo:
            if embedded_text is not None:
                embedded_text = torch.cat([embedded_text, input_elmo], dim=-1)
            else:
                embedded_text = input_elmo

        # pass the embedded text through the LSTM encoder
        encoded_text = self.encoder(embedded_text, text_mask)

        # Attention
        attn_dist, encoded_text = self.attention(encoded_text, return_attn_distribution=True)

        output_dict = {}
        if label is not None:
            logits = self.classifier_feed_forward(encoded_text)

            # probabilities from the softmax
            class_probabilities = torch.nn.functional.softmax(logits, dim=1)

            output_dict["logits"] = logits

            # loss calculation
            loss = self.loss(logits, label)
            output_dict["loss"] = loss

            # compute F1 per label
            for i in range(self.num_classes):
                metric = self.label_f1_metrics[self.vocab.get_token_from_index(index=i, namespace="labels")]
                metric(class_probabilities, label)
            output_dict['label'] = label

        output_dict['tokens'] = tokens['tokens']

        return output_dict

    @overrides
    def make_output_human_readable(self, output_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
        """
        The predict command/pipeline calls this method with the output dictionary
        from the forward() method.

        The returned output dictionary is also printed to the console when the
        predict command is executed.

        :param output_dict: the output dictionary
        :return: a human-readable output dictionary
        """
        class_probabilities = torch.nn.functional.softmax(output_dict['logits'], dim=-1)
        predictions = class_probabilities.cpu().data.numpy()
        argmax_indices = np.argmax(predictions, axis=-1)

        # get the labels from the vocabulary
        label = [self.vocab.get_token_from_index(x, namespace="labels")
                 for x in argmax_indices]
        output_dict['probabilities'] = class_probabilities
        output_dict['positive_label'] = label
        output_dict['prediction'] = label

        # return the output dictionary
        return output_dict

    @overrides
    def get_metrics(self, reset: bool = False) -> Dict[str, float]:

        """
        This method is called by the train pipeline, and the returned metrics
        dictionary is printed to the console during training.

        The metrics dictionary contains the class-wise F1 scores, the average F1
        score and the loss.

        :param reset: whether to reset the metric accumulators

        :return: a metrics dictionary with the class-level F1 scores and the average F1
        """

        metric_dict = {}

        sum_f1 = 0.0
        for name, metric in self.label_f1_metrics.items():
            metric_val = metric.get_metric(reset)
            metric_dict[name + '_F1'] = metric_val[2]
            if name != 'none':  # do not include the `none` label in the F1 average
                sum_f1 += metric_val[2]

        names = list(self.label_f1_metrics.keys())
        total_len = len(names) if 'none' not in names else len(names) - 1
        average_f1 = sum_f1 / total_len
        metric_dict['AVG_F1_Score'] = average_f1

        return metric_dict


def new_parameter(*size):
    out = Parameter(torch.FloatTensor(*size))
    torch.nn.init.xavier_normal_(out)
    return out


class Attention(torch.nn.Module):
    """ Simple multiplicative attention """

    def __init__(self, attention_size):
        super(Attention, self).__init__()
        self.attention = new_parameter(attention_size, 1)

    def forward(self, x_in, reduction_dim=-2, return_attn_distribution=False):
        # calculate the attention scores
        attn_score = torch.matmul(x_in, self.attention).squeeze()
        # add one dimension at the end and get a distribution out of the scores
        attn_distrib = torch.nn.functional.softmax(attn_score.squeeze(), dim=-1).unsqueeze(-1)
        scored_x = x_in * attn_distrib
        weighted_sum = torch.sum(scored_x, dim=reduction_dim)
        if return_attn_distribution:
            return attn_distrib.reshape(x_in.shape[0], -1), weighted_sum
        else:
            return weighted_sum
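A quick shape check for the attention module (a standalone sketch with random tensors, assuming the file lives at classifier/nn.py as the package __init__ suggests; 200 matches the BiLSTM output size used in the config below):

import torch
from classifier.nn import Attention

x = torch.randn(4, 12, 200)  # (batch, seq_len, encoder_output_dim)
attention = Attention(200)
dist, pooled = attention(x, return_attn_distribution=True)
print(dist.shape)    # torch.Size([4, 12])  - one weight per time step
print(pooled.shape)  # torch.Size([4, 200]) - attention-weighted sum over time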
@@ -1,128 +0,0 @@
"""
Simple feed-forward neural network in PyTorch for baseline results on Scicite data.
Created: July 5th, 2020
"""

import torch
from utils.nn_reader import read_csv_nn
import numpy as np


class FeedForward(torch.nn.Module):
    """
    Creates and trains a basic feedforward neural network.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """ Sets up all basic elements of the NN. """
        super(FeedForward, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.fc1 = torch.nn.Linear(self.input_size, self.hidden_size)
        self.tanh = torch.nn.Tanh()
        self.fc2 = torch.nn.Linear(self.hidden_size, self.output_size)
        self.sigmoid = torch.nn.Sigmoid()
        self.softmax = torch.nn.Softmax(dim=1)
        self.read_data()

    def forward(self, x):
        """ Computes the output for a given input x. """
        hidden = self.fc1(x)
        tanh = self.tanh(hidden)
        output = self.fc2(tanh)
        output = self.softmax(output)
        return output

    def read_data(self):
        """ Reads in training and test data and converts it to the proper format. """
        self.X_train_, self.y_train_, self.X_test, self.y_test_ = read_csv_nn()
        self.X_test = torch.FloatTensor(self.X_test)
        # map one-hot label rows to class indices 0/1/2
        yclass = np.array([(x[1] == 1) + 2 * (x[2] == 1) for x in self.y_train_])
        is0 = yclass == 0
        is1 = yclass == 1
        is2 = yclass == 2
        self.X0 = torch.FloatTensor(self.X_train_[is0])
        self.X1 = torch.FloatTensor(self.X_train_[is1])
        self.X2 = torch.FloatTensor(self.X_train_[is2])
        self.y0 = torch.LongTensor(np.zeros((sum(is0),)))
        self.y1 = torch.LongTensor(np.ones((sum(is1),)))
        self.y2 = torch.LongTensor(2 * np.ones((sum(is2),)))
        self.l0 = sum(is0)
        self.l1 = sum(is1)
        self.l2 = sum(is2)
        self.y_test = (self.y_test_[:, 1] == 1) + 2 * (self.y_test_[:, 2] == 1)

    def fit(self, epochs=100, batch_size=16, lr=0.01, samples=(1000, 1000, 1000)):
        """ Trains the model, using cross-entropy loss and the SGD optimizer. """
        self.criterion = torch.nn.CrossEntropyLoss()
        self.optimizer = torch.optim.SGD(self.parameters(), lr)
        self.samples0, self.samples1, self.samples2 = samples

        self.eval()  # put into eval mode

        # initialize training data
        self.shuffle()
        y_pred = self.forward(self.X_train)
        before_train = self.criterion(y_pred, self.y_train)
        print('Train loss before training', before_train.item())

        # set up the batches (the stride was hard-coded to 16; use batch_size)
        l = self.samples0 + self.samples1 + self.samples2  # total length
        batch_indices = list(zip(list(range(0, l, batch_size))[:-1], list(range(batch_size, l, batch_size))))
        batch_indices[-1] = (batch_indices[-1][0], l)

        # train the model
        self.train()  # put into training mode
        for epoch in range(epochs):
            batch = 0
            for a, b in batch_indices:
                self.optimizer.zero_grad()

                # forward pass
                y_pred = self.forward(self.X_train[a:b])
                loss = self.criterion(y_pred, self.y_train[a:b])

                # backward pass
                loss.backward()
                self.optimizer.step()
                batch += 1

            # get the loss after the epoch
            y_pred = self.forward(self.X_train)
            loss = self.criterion(y_pred, self.y_train)
            print('Epoch {}: train loss: {}'.format(epoch, loss.item()))

            # shuffle the dataset
            self.shuffle()

        # display the final loss
        self.eval()  # back to eval mode
        y_pred = self.forward(self.X_train)
        after_train = self.criterion(y_pred, self.y_train)
        print('Training loss after training', after_train.item())

    def predict(self):
        """ Generates predictions from the model, using the test data. """

        # post-process to get predictions & get back to np format
        y_pred = self.forward(self.X_test)
        y_pred_np = y_pred.detach().numpy()
        predmax = np.amax(y_pred_np, axis=1)
        self.preds = 1 * (y_pred_np[:, 1] == predmax) + 2 * (y_pred_np[:, 2] == predmax)
        self.probs = y_pred_np

    def shuffle(self):
        """ Samples and shuffles the training data. """

        # create permutations for shuffling
        p0 = torch.randperm(self.l0)
        p1 = torch.randperm(self.l1)
        p2 = torch.randperm(self.l2)
        n = self.samples0 + self.samples1 + self.samples2
        p = torch.randperm(n)

        # sample and shuffle the data
        self.X_train = \
            torch.cat((self.X0[p0][:self.samples0], self.X1[p1][:self.samples1], self.X2[p2][:self.samples2]))[p]
        self.y_train = torch.cat((self.y0[:self.samples0], self.y1[:self.samples1], self.y2[:self.samples2]))[p]
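A design note, inferred from the code above rather than stated in it: shuffle() draws samples0/samples1/samples2 instances per class, so each epoch trains on a freshly sampled, class-balanced subset of the training data rather than on the full, skewed training set.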
@@ -1,57 +0,0 @@
{
  "dataset_reader": {
    "type": "citation_dataset_reader"
  },
  "train_data_path": "data/jsonl/train.jsonl",
  "validation_data_path": "data/jsonl/dev.jsonl",
  "test_data_path": "data/jsonl/test.jsonl",
  "evaluate_on_test": true,
  "model": {
    "type": "basic_bilstm_classifier",
    "text_field_embedder": {
      "token_embedders": {
        "tokens": {
          "pretrained_file": "/mount/arbeitsdaten/studenten1/team-lab-nlp/mandavsi_rileyic/glove.6B.100d.txt.gz",
          "type": "embedding",
          "embedding_dim": 100,
          "trainable": false
        }
      }
    },
    "encoder": {
      "type": "lstm",
      "input_size": 1124,
      "hidden_size": 100,
      "num_layers": 1,
      "bidirectional": true
    },
    "elmo": {
      "options_file": "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json",
      "weight_file": "/mount/arbeitsdaten/studenten1/team-lab-nlp/mandavsi_rileyic/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5",
      "do_layer_norm": true,
      "dropout": 0.5,
      "num_output_representations": 1
    },
    "use_input_elmo": true,
    "classifier_feedforward": {
      "input_dim": 200,
      "num_layers": 2,
      "hidden_dims": [20, 3],
      "activations": ["linear", "linear"]
    }
  },
  "data_loader": {
    "batch_sampler": {
      "type": "bucket",
      "batch_size": 16
    }
  },
  "trainer": {
    "optimizer": {
      "type": "adagrad",
      "lr": 0.005
    },
    "num_epochs": 10,
    "cuda_device": 3
  }
}
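A note on how the numbers in this config line up (inferred from the values above, not stated in the file): the encoder's "input_size" of 1124 is the 100-dim GloVe embedding concatenated with the 1024-dim ELMo representation (100 + 1024 = 1124), and the bidirectional 100-unit LSTM yields 2 x 100 = 200 outputs per step, which matches the "input_dim" of 200 in "classifier_feedforward".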
@@ -1,14 +0,0 @@
from allennlp.modules.elmo import Elmo, batch_to_ids

weights_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'
options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json'

elmo = Elmo(options_file, weights_file, 1, dropout=0)
text = ['Backgammon', 'is', 'one', 'of', 'the', 'oldest', 'known', 'board', 'games']

# batch_to_ids expects a batch of tokenized sentences, i.e. a list of token lists
batch = batch_to_ids([text])
print(batch)

# avoid shadowing the built-in `dict`
output = elmo(batch)

# a list with num_output_representations tensors of shape (batch, seq_len, 1024)
print(output['elmo_representations'])
@@ -1,48 +0,0 @@
import feature_extraction.lexicons as lexicons
from utils.constants import REGEX_CONSTANTS

""" List of supported features for feature extraction from an input string """
FEATURE_LIST = ['COMPARE', 'CONTRAST', 'RESULT', 'INCREASE', 'CHANGE', 'USE', 'PRESENT',
                'IMPORTANT', 'RESEARCH', 'APPROACH', 'PUBLIC', 'BEFORE', 'BETTER_SOLUTION',
                'PROFESSIONALS', 'MEDICINE', 'MATH', 'COMPUTER_SCIENCE', 'CITATION',
                'ACRONYM', 'CONTAINS_YEAR', 'SEQUENCE', 'REFERENCE', 'PERCENTAGE',
                'CONTAINS_URL', 'ENDS_WITH_RIDE', 'ENDS_WITH_RINE', 'ENDS_WITH_ETHYL']

""" Feature name for the theta bias -- it must be added to the feature list of every data instance """
THETA_BIAS_FEATURE = 'THETA_BIAS'


def extract_features_from_text(text: str):
    """
    Takes a text string as input, and extracts and returns a list of features by
    checking each word against :`~feature_extraction.lexicons.ALL_LEXICONS`.

    :param text: the input text
    :return: the list of features extracted from the text (always contains at least the theta bias feature)
    """

    # ALL_LEXICONS
    lexicon_dict = lexicons.ALL_LEXICONS

    # Initialize the feature list with the theta bias feature, which must be added to every data instance
    text_feature_list = [THETA_BIAS_FEATURE]

    # Iterate through the feature list; for lexicon features, get the word list from the
    # lexicon dictionary and add the feature if any of its words appears in the input text
    for feature in FEATURE_LIST:

        # If the feature is a regex pattern match, get the pattern from :`~utils.constants.REGEX_CONSTANTS`
        # and match it against the input text
        if feature in REGEX_CONSTANTS:
            pattern = REGEX_CONSTANTS[feature]
            if bool(pattern.search(text)):
                text_feature_list.append(feature)
            continue

        # Otherwise the feature is lexicon-based: get its word list from the lexicon dictionary
        word_list = lexicon_dict[feature]
        for word in word_list:
            if word in text.lower():
                text_feature_list.append(feature)
                break

    return text_feature_list
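A hedged example of what the extractor returns (the exact feature set depends on the lexicons below; this shows the kind of output to expect, not a recorded run):

from feature_extraction.features import extract_features_from_text

feats = extract_features_from_text('We use the method of Smith et al. and report a 25% gain.')
print(feats)
# something like: ['THETA_BIAS', 'USE', 'PRESENT', 'APPROACH', 'CITATION', 'PERCENTAGE']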
@@ -1,74 +0,0 @@
"""
Dictionary of lexicons used for feature extraction
"""
ALL_LEXICONS = {

    'COMPARE': ['compar', 'compet', 'evaluat', 'test', 'superior', 'inferior', 'better', 'best', 'good', 'low',
                'wors', 'great', 'larger', 'faster', 'high', 'measur', 'between', 'another', 'similar'],

    'CONTRAST': ['contrast', 'different', 'distinct', 'conflict', 'disagree', 'oppose', 'distinguish', 'contrary'],

    'RESULT': ['estimate', 'evidence', 'experiment', 'find', 'progress', 'observation', 'outcome', 'result', 'performance'],

    'INCREASE': ['increase', 'grow', 'intensify', 'build up', 'explode'],

    'CHANGE': ['adapt', 'adjust', 'augment', 'combine', 'change', 'decrease', 'elaborate', 'expand', 'expand on',
               'extend', 'derive', 'incorporate', 'increase', 'manipulate', 'modify', 'optimize', 'optimise', 'refine',
               'render', 'replace', 'revise', 'substitute', 'tailor', 'upgrade', 'grow'],

    'USE': ['use', 'using', 'apply', 'applied', 'employ', 'make use', 'utilize', 'implement'],

    'PRESENT': ['describe', 'discuss', 'give', 'introduce', 'note', 'notice', 'present', 'propose', 'recapitulate',
                'demonstrate', 'remark', 'report', 'say', 'show', 'sketch', 'state', 'suggest', 'figure', 'indicate',
                'specify', 'explain'],

    'IMPORTANT': ['important', 'main', 'key', 'basic', 'central', 'crucial', 'critical', 'essential', 'fundamental',
                  'great', 'largest', 'major', 'overall', 'primary', 'principle', 'serious', 'substantial', 'ultimate',
                  'significant', 'remarkable', 'noteworthy', 'crucial', 'emerge'],

    'RESEARCH': ['research', 'paper', 'study', 'studie', 'apply', 'analyze', 'characteri', 'formali', 'investigat',
                 'implement', 'interpret', 'examin', 'observ', 'predict', 'verify', 'work on', 'empirical', 'determin',
                 'experiment', 'exploratory', 'ongoing', 'quantitative', 'qualitative', 'preliminary', 'statistical',
                 'knowledge', 'underway', 'discuss', 'reference', 'publish', 'document', 'orientation',
                 'literature', 'experience'],

    'APPROACH': ['approach', 'account', 'algorithm', 'analys', 'approach', 'application', 'architecture', 'characteri',
                 'component', 'design', 'extension', 'formali', 'framework', 'implement', 'investigat', 'machine',
                 'method', 'methodology', 'module', 'process', 'procedure', 'program', 'prototype', 'strateg',
                 'system', 'technique', 'theory', 'tool', 'treatment'],

    'PUBLIC': ['acknowledge', 'admit', 'agree', 'assert', 'claim', 'complain', 'declare', 'deny', 'explain',
               'hint', 'insist', 'mention', 'proclaim', 'promise', 'protest', 'remark', 'reply', 'report', 'say',
               'suggest', 'swear', 'write'],

    'BEFORE': ['earlier', 'initial', 'past', 'previous', 'prior'],

    'BETTER_SOLUTION': ['boost', 'enhance', 'defeat', 'improve', 'perform better', 'outperform', 'outweigh', 'surpass'],

    'PROFESSIONALS': ['colleagues', 'community', 'computer scientists', 'computational linguists', 'discourse analysts',
                      'expert', 'investigators', 'linguists', 'philosophers', 'psycholinguists',
                      'psychologists', 'researchers', 'scholars', 'semanticists', 'scientists'],

    'MEDICINE': ['medicine', 'tissue', 'gene', 'inflammatory', 'mutant', 'neuro', 'digest', 'ortho', 'kinase',
                 'clinical', 'therap', 'kidney', 'receptor', 'cancer', 'synthesis', 'protein', 'syndrom', 'toxin', 'death',
                 'pharma', 'heart', 'disease', 'vitamin', 'tumor', 'blind', 'symptom', 'medical', 'vaccin', 'molecule',
                 'biotic', 'patient', 'cells', 'immune', 'blood', 'plasma', 'diagnos', 'neura', 'reproductive', 'plasm', 'drug',
                 'membrane', 'muscle', 'contagious', 'inflam', 'physician', 'dna', 'genome', 'bacteria', 'cavity', 'injury',
                 'antibodies', 'liver', 'treatment', 'pcr', 'acid', 'chronic', 'respirat', 'oxygen', 'stroke', 'antioxidant', 'obesity',
                 'metabolic', 'transmission', 'endogenous', 'syndrome', 'ultrasound', 'pathogen', 'inject', 'laparoscop',
                 'circulat', 'ventricle', 'tract', 'pneumonia', 'calcium', 'rna', 'organism', 'biolog', 'x-ray'],

    'MATH': ['matrix', 'gaussian', 'variance', 'radius', 'function', 'comput', 'once', 'twice', 'thrice', 'diagram', 'mean',
             'vector', 'rectangle', 'logic', 'amount', 'maxim', 'minim', 'linear', 'magnitude', 'theorem', 'gradient', 'median',
             'exponential', 'complex', 'graph', 'mean', 'equation', 'offset', 'calculat', 'coefficient', 'discrete', 'equation',
             'frequen', 'math', 'correlation', 'outcome', 'divergence', 'differentiation', 'statistic', 'parameter',
             'probabilit', 'multivariate', 'negative', 'positive', 'regression', 'digit'],

    'COMPUTER_SCIENCE': ['database', 'software', 'evaluation', 'framework', 'computer', 'network',
                         'algorithm', 'dataset', 'data sets', 'technology', 'kernel', 'metrics', 'nlp', 'xml',
                         'corpus', 'uml', 'system', 'security', 'protocol', 'classification', 'data transform',
                         'memory', 'java', 'python', 'cluster', 'epoch', 'training', 'deadlock', 'technique'],

    'CITATION': ['et al']

}
[5 plot images removed (PNG, 32-34 KiB each); not shown]
Binary file not shown.
@@ -1,10 +0,0 @@
allennlp==1.0.0
jsonlines==1.2.0
matplotlib==3.3.0
numpy==1.19.0
overrides==3.0.0
scikit-learn==0.23.1
six==1.15.0
spacy==2.2.4
torch==1.5.1
torchvision==0.6.1
@@ -1,2 +0,0 @@
from utils.reader import *
from classifier.nn import *
@@ -1,14 +0,0 @@
import classifier.intent_predictor as pred

import eval.metrics as metrics

saved_model_dir = '/mount/arbeitsdaten/studenten1/team-lab-nlp/mandavsi_rileyic/saved_models/experiment_4'
y_pred, y_true = pred.load_model_and_predict_test_data(saved_model_dir)

confusion_matrix = metrics.get_confusion_matrix(y_true, y_pred)

print("Confusion Matrix :: ")
print(confusion_matrix)

plot_file_path = saved_model_dir + '/confusion_matrix_plot.png'
metrics.plot_confusion_matrix(confusion_matrix, "BiLSTM Classifier + Attention with ELMo", plot_file_path)
@@ -1,27 +0,0 @@
import os
from utils.csv import read_csv_file

project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

train_file_path = project_root + '/data/tsv/train.tsv'
test_file_path = project_root + '/data/tsv/test.tsv'

print(train_file_path)

data = read_csv_file(csv_file_path=train_file_path, delimiter='\t')

i = 0
feature_dict = {}
for inst in data[:20]:
    inst.print()
# print('Data Points without Features :: ', i)

# tokens = inst.text.split()
# for token in tokens:
#     if token not in feature_dict:
#         feature_dict[token] = 1
#         continue
#     feature_dict[token] += 1
#
# for key in sorted(feature_dict, key=feature_dict.get, reverse=True):
#     print(key, ' -> ', feature_dict.get(key))
@@ -1,34 +0,0 @@
import sys
import os
sys.path.append(os.getcwd())
from classifier.nn_ff import FeedForward
from sklearn.metrics import f1_score
from eval.metrics import plot_confusion_matrix, get_confusion_matrix

project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))

clf = FeedForward(28, 9, 3)
clf.fit()
clf.predict()

# predict
y_test = clf.preds
y_true = clf.y_test

# Model evaluation
labels = set(['background', 'method', 'result'])
f1_score_micro = f1_score(y_true, y_test, average='micro')
f1_score_macro = f1_score(y_true, y_test, average='macro')

# Print the F1 scores
print('F1 score (micro): ', f1_score_micro)
print('F1 score (macro): ', f1_score_macro)

# plot the confusion matrix
classdict = {0: 'background', 1: 'method', 2: 'result'}
y_test = [classdict[x] for x in y_test]
y_true = [classdict[x] for x in y_true]
plot_path = project_root + '/plots/confusion_matrix_plot_ff.png'
plot_confusion_matrix(get_confusion_matrix(y_true, y_test), 'Feed-forward NN Classifier (Baseline)', plot_path)
@@ -1,45 +0,0 @@
from classifier.linear_model import MultiClassPerceptron
from utils.csv import read_csv_file
from eval.metrics import f1_score, plot_confusion_matrix, get_confusion_matrix
import utils.constants as const
import os

project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
train_file_path = project_root + '/data/tsv/train.tsv'
test_file_path = project_root + '/data/tsv/test.tsv'

# Read the training dataset
X_train_inst = read_csv_file(train_file_path, '\t')

# set of labels from the training data
labels = set([inst.true_label for inst in X_train_inst])

# Read the test dataset
X_test_inst = read_csv_file(test_file_path, '\t')

# number of training iterations
epochs = 50

# create the MultiClassPerceptron classifier object
clf = MultiClassPerceptron(epochs=epochs, learning_rate=0.7, random_state=101)

# train the model
clf.fit(X_train=X_train_inst, labels=list(labels))

# predict
y_test = clf.predict(X_test_inst)

y_true = [inst.true_label for inst in X_test_inst]

# Model evaluation
f1_score_micro = f1_score(y_true, y_test, labels, const.AVG_MICRO)
f1_score_macro = f1_score(y_true, y_test, labels, const.AVG_MACRO)
f1_score_none = f1_score(y_true, y_test, labels, None)

# Print the F1 scores
for result in f1_score_micro + f1_score_macro + f1_score_none:
    result.print_result()

# plot the confusion matrix
plot_path = project_root + '/plots/confusion_matrix_plot.png'
plot_confusion_matrix(get_confusion_matrix(y_true, y_test), 'Perceptron Classifier (Baseline)', plot_path)
@@ -1,66 +0,0 @@
#import os
#os.chdir('/Users/iriley/code/citation-analysis')
import sys
sys.path.append('/Users/iriley/code/citation-analysis')
from classifier.linear_model import MultiClassPerceptron
from sklearn.metrics import confusion_matrix as cm
from utils.csv import read_csv_file
from eval.metrics import f1_score
import utils.constants as const
import pandas as pd
import numpy as np


train_file_path = '/Users/iriley/code/citation-analysis/data/tsv/train.tsv'
dev_file_path = '/Users/iriley/code/citation-analysis/data/tsv/test.tsv'


# Read the training dataset
X_train_inst = read_csv_file(train_file_path, '\t')

# set of labels from the training data
labels = set([inst.true_label for inst in X_train_inst])

# Read the test dataset
X_dev_inst = read_csv_file(dev_file_path, '\t')

# number of training iterations
epochs = 50

# create the MultiClassPerceptron classifier object
clf = MultiClassPerceptron(epochs=epochs, learning_rate=0.5, random_state=101)

# train the model
clf.fit(X_train=X_train_inst, labels=list(labels))

# predict
y_pred = clf.predict(X_dev_inst)
y_scores = np.array(clf.get_class_scores(X_dev_inst))

y_true = [inst.true_label for inst in X_dev_inst]

labeldict = {'background': 0, 'method': 1, 'result': 2}
y_pred = np.array([labeldict[x] for x in y_pred])
y_true = np.array([labeldict[x] for x in y_true])

conmat = cm(y_true, y_pred)

df = pd.DataFrame()
df['pred'] = y_pred
df['true'] = y_true
df['correct'] = y_pred == y_true
df['score0'] = np.round(y_scores[:, 0], 3)
df['score1'] = np.round(y_scores[:, 1], 3)
df['score2'] = np.round(y_scores[:, 2], 3)

df.to_csv('/Users/iriley/code/machine_learning/lab2020/y_pred_model1.csv', index=False)

## Model Evaluation
#f1_score_micro = f1_score(y_true, y_pred, labels, const.AVG_MICRO)
#f1_score_macro = f1_score(y_true, y_pred, labels, const.AVG_MACRO)
#f1_score_none = f1_score(y_true, y_pred, labels, None)

## Print F1 Score
#for result in f1_score_micro + f1_score_macro + f1_score_none:
#    result.print_result()
@@ -1,37 +1,2 @@
import re


AVG_MICRO = 'MICRO'
AVG_MACRO = 'MACRO'

REGEX_CONSTANTS = {

    # Regex for matching acronym patterns -> COVID-19 / SEKA / SMY2 / EAP1 / SCP16 / ASC1 / DENV-2
    # 'ACRONYM': re.compile(r"[m0-9\W^]([A-Z]{2,})[s\.,:\-$]"),
    'ACRONYM': re.compile(r"^[A-Z]{2,}[\.,:;\b\s]|[\s\b]m?[A-Z]{2,}[\.,:;\b\s]"),

    # Regex for matching years in the text -> 1995 / 2020 / 2019
    'CONTAINS_YEAR': re.compile(r"(?<=[^0-9])1[8-9][0-9]{2}(?=[^0-9$])|(?<=[^0-9])20[0-2][0-9](?=[^0-9$])"),

    # Regex for matching number sequences in the text -> (15) / (10, 11, 112, 113) / (1,7,8,10-14)
    'SEQUENCE': re.compile(r"\([\d.*]\)"),

    # Regex for matching references in the text -> [4] / [ 10-17, 19, 20] / [123, 500]
    'REFERENCE': re.compile(r"\[\d.*\]"),

    # Regex for matching percentages in the text -> 99% / 99.99% / 10 % / 23.98% / 10-20% / 25%-30%
    'PERCENTAGE': re.compile(r"\d[\d\.\-]+%"),

    # Regex for matching URLs -> http://www.phrap.org/, http://www. , http://carcfordjournals.
    'CONTAINS_URL': re.compile(r"https?://\S+"),

    'ENDS_WITH_RIDE': re.compile(r"ride\b"),

    'ENDS_WITH_RINE': re.compile(r"rine\b"),

    'ENDS_WITH_ETHYL': re.compile(r"ethyl\b")

}

CLASS_LABELS = {"background": 0, "method": 1, "result": 2}
CLASS_LABELS_LIST = ['background', 'method', 'result']
AVG_MACRO = 'MACRO'
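A quick sanity check of the regex features (a standalone sketch; the sample strings mirror the examples in the comments above):

from utils.constants import REGEX_CONSTANTS

print(bool(REGEX_CONSTANTS['PERCENTAGE'].search('a 25%-30% gain')))           # True
print(bool(REGEX_CONSTANTS['REFERENCE'].search('as shown in [10-17, 19]')))   # True
print(bool(REGEX_CONSTANTS['CONTAINS_YEAR'].search('(Smith, 1995)')))         # True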
@@ -1,17 +0,0 @@
class Citation(object):
    """ Class representing a citation object """

    def __init__(self,
                 text,
                 citing_paper_id,
                 cited_paper_id,
                 section_title=None,
                 intent=None,
                 citation_id=None
                 ):
        self.text = text
        self.citing_paper_id = citing_paper_id
        self.cited_paper_id = cited_paper_id
        self.section_title = section_title
        self.intent = intent
        self.citation_id = citation_id
@@ -0,0 +1,13 @@

class DataInstance:
    """
    Model class for carrying training and testing data from a tsv/csv file
    """

    def __init__(self, r_id, text, true_label):
        self.did = r_id
        self.text = text
        self.true_label = true_label

    def print(self):
        print('True Label :: ', self.true_label, ' Text :: ', self.text)
@@ -1,57 +0,0 @@
import numpy as np
from itertools import chain
from utils.csv import read_csv_file

# TODO: clean up, transform into class, allow for command-line arguments

def read_csv_nn(scicite_dir=None):

    train_file_path = 'data/tsv/train.tsv'
    test_file_path = 'data/tsv/test.tsv'
    train_raw = read_csv_file(train_file_path, '\t')
    test_raw = read_csv_file(test_file_path, '\t')

    features = [x.features for x in train_raw]
    features_unique = list(set(chain.from_iterable(features)))
    nobs = len(features)
    nfeats = len(features_unique)

    # binary feature matrix for the training data
    X_train = np.zeros((nobs, nfeats))

    for j in range(nfeats):
        f = features_unique[j]
        for i in range(nobs):
            if f in features[i]:
                X_train[i, j] = 1

    y_train_raw = np.array([x.true_label for x in train_raw])
    y_unique = sorted(list(set(y_train_raw)))
    y_dim = len(y_unique)
    y_train = np.zeros((nobs, y_dim))

    # one-hot encode the training labels
    for j in range(y_dim):
        y_train[:, j] = y_train_raw == y_unique[j]

    # the test data was already read above; project its features onto the
    # *training* feature space so that train and test share the same columns
    features = [x.features for x in test_raw]
    nobs = len(features)
    nfeats = len(features_unique)

    X_test = np.zeros((nobs, nfeats))
    for j in range(nfeats):
        f = features_unique[j]
        for i in range(nobs):
            if f in features[i]:
                X_test[i, j] = 1

    y_test_raw = np.array([x.true_label for x in test_raw])
    y_test = np.zeros((nobs, y_dim))

    for j in range(y_dim):
        y_test[:, j] = y_test_raw == y_unique[j]

    return X_train, y_train, X_test, y_test
@@ -1,33 +0,0 @@
import numpy as np
from itertools import chain
from utils.csv import read_csv_file


def read_csv_nn_dev(scicite_dir=None):

    dev_file_path = 'data/tsv/dev.tsv'
    dev_raw = read_csv_file(dev_file_path, '\t')

    features = [x.features for x in dev_raw]
    features_unique = list(set(chain.from_iterable(features)))
    nobs = len(features)
    nfeats = len(features_unique)

    X_dev = np.zeros((nobs, nfeats))

    for j in range(nfeats):
        f = features_unique[j]
        for i in range(nobs):
            if f in features[i]:
                X_dev[i, j] = 1

    y_dev_raw = np.array([x.true_label for x in dev_raw])
    y_unique = sorted(list(set(y_dev_raw)))
    y_dim = len(y_unique)
    y_dev = np.zeros((nobs, y_dim))

    for j in range(y_dim):
        y_dev[:, j] = y_dev_raw == y_unique[j]

    return X_dev, y_dev
@@ -1,97 +0,0 @@
from typing import Iterable

import jsonlines
from allennlp.data import Instance
from allennlp.data.dataset_readers import DatasetReader
from allennlp.data.fields import TextField, LabelField
from allennlp.data.token_indexers import SingleIdTokenIndexer, ELMoTokenCharactersIndexer
from allennlp.data.tokenizers import SpacyTokenizer
from overrides import overrides

from utils.data import Citation


@DatasetReader.register("citation_dataset_reader")  # type name for the config files
class CitationDataSetReader(DatasetReader):
    """
    We implement this CitationDataSetReader class by subclassing DatasetReader
    and overriding the required superclass methods.

    The class reads the datasets (train|dev|test) and converts them into a
    collection of Instances. We use the default SpacyTokenizer for this project.

    The dataset reader must also be registered so that the config files can
    refer to this class.
    """

    def __init__(self):
        super().__init__()
        # default Spacy tokenizer
        self.tokenizer = SpacyTokenizer()

    @overrides
    def _read(self, file_path: str) -> Iterable[Instance]:
        """
        Reads the JSON Lines file, tokenizes the text of each data point and
        yields a collection of Instances, each with tokens and a label.

        :param file_path: the dataset file path
        :return: a collection of Instances
        """
        ds_reader = DataReaderJsonLines(file_path)
        for citation in ds_reader.read():
            yield self.text_to_instance(citation_text=citation.text, intent=citation.intent)

    @overrides
    def text_to_instance(self, citation_text: str,
                         intent: str) -> Instance:

        """
        :param citation_text: text of the data point
        :param intent: true label of the data instance
        :return: an Instance with tokens & label fields
        """

        citation_tokens = self.tokenizer.tokenize(citation_text)
        # use the ELMo token characters indexer alongside single-id tokens
        token_indexers = {"elmo": ELMoTokenCharactersIndexer(),
                          "tokens": SingleIdTokenIndexer()}

        fields = {'tokens': TextField(citation_tokens, token_indexers),
                  'label': LabelField(intent)}

        return Instance(fields)


class DataReaderJsonLines:
    """
    Helper class for reading jsonl (JSON Lines) files
    """
    def __init__(self, file_path):
        self.file_path = file_path

    def read(self):
        """
        Opens the file, reads every line and yields one object per line.

        :return: a collection of Citation objects with the required data
        """
        with jsonlines.open(self.file_path) as jl_reader:
            for line in jl_reader:
                yield read_json_line(line)


def read_json_line(line):

    """
    :param line: the json line dictionary
    :return: a Citation object
    """
    citation = Citation(
        text=line['string'],
        citing_paper_id=line['citingPaperId'],
        cited_paper_id=line['citedPaperId'],
        section_title=line['sectionName'],
        intent=line['label'],
        citation_id=line['id'])

    return citation
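For reference, a minimal example of the JSON Lines record this reader expects (field names taken from read_json_line above; the values are made up):

{"string": "We follow the approach of ...", "citingPaperId": "abc123", "citedPaperId": "def456", "sectionName": "Introduction", "label": "background", "id": "abc123>def456"}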