Compare commits


99 Commits

Author SHA1 Message Date
Pavan Mandava 7cb854863e Fixed paths in README
3 years ago
yelircaasi 3d6f24bf5b Update README.md
5 years ago
yelircaasi 5833cf1b31 fixed plot paths
5 years ago
yelircaasi a327fa2ee3 edited README.md
5 years ago
yelircaasi 6aa59d0712 improved test file for FFNN
5 years ago
yelircaasi 69f913c801 cleaned up FeedForward class to pass test
5 years ago
Pavan Mandava 49dec048c8 Added some more Code level comments to AllenNLP Model
5 years ago
Pavan Mandava 891c6f2828 Uploaded final submission to GitHub
5 years ago
Pavan Mandava b6fe81b588 README doc - IMG size increased
5 years ago
Pavan Mandava cf20b9169a JSON Dictionary Keys changed
5 years ago
Pavan Mandava 66496f85fc Missing \ character
5 years ago
Pavan Mandava 50b3409578 intent_predictor moved to classifier package
5 years ago
Pavan Mandava 0467eeee5f Merge remote-tracking branch 'origin/master'
5 years ago
Pavan Mandava 9b80e3d272 WIP : README Documentation - Added Report to MD
5 years ago
yelircaasi ca2b2edae6 fixed small errors
5 years ago
yelircaasi 9bc4b27945 fixed small errors
5 years ago
yelircaasi b972c02bde Merge remote-tracking branch 'origin/master'
5 years ago
yelircaasi 3afe028659 fixed small errors
5 years ago
Pavan Mandava 3988033311 Merge remote-tracking branch 'origin/master'
5 years ago
Pavan Mandava 57a08e53cb WIP : README Documentation - Added References
5 years ago
yelircaasi 6e0fd99357 Merge remote-tracking branch 'origin/master'
5 years ago
yelircaasi 108653411c refactored ff_nn, added test file
5 years ago
Pavan Mandava 0f2e8343f1 WIP : README Documentation - Spelling Errors & Notes
5 years ago
Pavan Mandava 9e47b20bed WIP : README Documentation - Added Environment & Setup, Finished AllenNLP doc
5 years ago
Pavan Mandava a89075a5a3 WIP : README Documentation - Plot IMG Aspect Ration changed
5 years ago
Pavan Mandava d31d2c5e7b WIP : README Documentation - AllenNLP Model & plot
5 years ago
Pavan Mandava e24fad7be5 WIP : README Documentation - Added Headers
5 years ago
Pavan Mandava e5dc46ab64 WIP : README Documentation - Relative Path Fix
5 years ago
Pavan Mandava de20cb6542 WIP : README Documentation - Img size fix
5 years ago
Pavan Mandava ea747c886b WIP : README Documentation - Relative Path fixes
5 years ago
Pavan Mandava dabe7fc5f2 WIP : README Documentation - Added plot & changed dirs
5 years ago
Pavan Mandava 517eb53cd2 WIP : README Documentation
5 years ago
Pavan Mandava ec2f290b1a added requirements.txt file
5 years ago
Pavan Mandava 804533bc23 Code documentation/comments for predictor
5 years ago
yelircaasi 038e612e4f minor changes to output file
5 years ago
yelircaasi 02baf00dc5 Merge remote-tracking branch 'origin/master'
5 years ago
yelircaasi 2870975861 added balanced classes to ff model
5 years ago
Pavan Mandava 2b4b09864d plot Threshold color fix
5 years ago
Pavan Mandava bd559e0d8a plot path print
5 years ago
Pavan Mandava 87efce8f82 Saving Confusion Matrix Plot PNG
5 years ago
Pavan Mandava 244e27bee6 Added plot.show(block = True)
5 years ago
Pavan Mandava 3089662a0a Improved plot confusion matrix
5 years ago
Pavan Mandava 52efebe53e WIP : Matplotlib confusion matrix plot has issues
5 years ago
Pavan Mandava d18593f869 printing confusion matrix
5 years ago
Pavan Mandava 98e582444b added confusion matrix and plot
5 years ago
Sai Pavan Mandava 64f787b5d7 Removed Debug Code and Prints
5 years ago
Pavan Mandava 6946077d94 Commented unnecessary code
5 years ago
Sai Pavan Mandava 3455af4e40 Wrong Test file path fixed
5 years ago
Pavan Mandava 3916df452f Added JSONL open check
5 years ago
Pavan Mandava 7d52d0629b Added Predictor code
5 years ago
yelircaasi 0faf344a00 fixed spacing errors and interpreter configs
5 years ago
yelircaasi a1ac7e6cfa added files for paper results
5 years ago
Pavan Mandava 281205b0df WIP : Code Documentation & README Documentation
5 years ago
yelircaasi 12c9610f0b added a few comments
5 years ago
Pavan Mandava eb8a225c9b changed metric prints
6 years ago
Pavan Mandava c792a8783a Deleted LateX old files from the repo
6 years ago
Sai Pavan Mandava 9ddc0df6bb Config file changes for IMS Machines
6 years ago
Pavan Mandava a6793e1585 minor fixes
6 years ago
Pavan Mandava b8fd2e047f missed nn.py
6 years ago
Pavan Mandava 5daea1a2a8 moved Isaac's commit to the current package
6 years ago
Isaac Riley e9b1f31c49 finished basic ff model and torch data reader
6 years ago
Isaac Riley 18b7847bcf ffnn and folder for sota model
6 years ago
Pavan Mandava 931c99602d Added basic model class for LSTMs and config file for basic classifier
6 years ago
Pavan Mandava 9f9a271bc0 data readers added
6 years ago
Pavan Mandava 05ccf02bb2 epochs logic changed - old commit
6 years ago
Pavan Mandava d888673d00 Move DataInstance to utils.csv,
6 years ago
Pavan Mandava cdce93e5be Download Code added to presentation.pdf
6 years ago
Pavan Mandava efd1fe6b5c Added Sample Dataset in the slides
6 years ago
Pavan Mandava 089f883428 Minor changes to model and Added Results to Presentation Slides
6 years ago
Pavan Mandava 75055a6ba8 Finished Presentation Slides
6 years ago
Pavan Mandava 52f0d45def Finished Features Slide
6 years ago
Pavan Mandava cc1e00cb12 Added few slides
6 years ago
Pavan Mandava 9ff34905c4 Merge branch 'master' into isaac
6 years ago
Isaac Riley 37c42f27c7 added presentation folder
6 years ago
Pavan Mandava 3eb3f0f35e theta bias changed
6 years ago
Pavan Mandava 6575ba0952 Random state still not working
6 years ago
Pavan Mandava 3c0e4a411d Added more regex features and random_state to the Classifier
6 years ago
Isaac Riley 52f853d796 acronym regex
6 years ago
Isaac Riley 80ff54ad7d Merge branch 'isaac'
6 years ago
Isaac Riley 1bdda3ad31 simplified reference and sequence regexes
6 years ago
Pavan Mandava eaecb60962 Training print added
6 years ago
Isaac Riley 7ac62ab66d changed 'match' to 'search' - should work now
6 years ago
Isaac Riley 6aed6963fa fixed regular expression errors
6 years ago
Isaac Riley 9c0959f0ef fixed regular expression errors
6 years ago
Isaac Riley ecdce0116e fixed regular expressions
6 years ago
Pavan Mandava d43794d572 feature testing code changed
6 years ago
Pavan Mandava c6440b2553 Added some more Lexicons
6 years ago
Pavan Mandava ce8b6684f7 Model Testing Code added
6 years ago
Pavan Mandava 89f6cfdf88 Perceptron and Multi-Class Perceptron done
6 years ago
Pavan Mandava c915db6fc5 Added some more words to lexicon dictionary
6 years ago
Pavan Mandava d41c674b49 Added few comments and Regex Patterns
6 years ago
Pavan Mandava 190f9f35e6 Merge remote-tracking branch 'origin/master'
6 years ago
Pavan Mandava 7cd79a4b21 Added some more Lexicons and Regex for Feature Extraction
6 years ago
Isaac Riley a69ca18ccc Merge remote-tracking branch 'origin/isaac'
6 years ago
Isaac Riley 5d0ccaf111 created basic skeleton for perceptron (single & multi)
6 years ago
Pavan Mandava cc77b3a755 Feature Extraction with LEXICONS, Need to add more Lexicons and improve feature representation
6 years ago
Isaac Riley 87989f223a Merge branch 'master' into isaac
6 years ago
Pavan Mandava 3455c34601 Added Structure for Perceptron and Multi-Class Perceptron
6 years ago
Isaac Riley a3a3043bbb added micro-f1 code and assert statement
6 years ago

@ -0,0 +1,2 @@
classifier
utils

Binary file not shown.

@ -1,10 +1,105 @@
# citation-analysis
Project repo for Computational Linguistics Team Laboratory at the University of Stuttgart
# Citation Intent Classification
Project repo for Computational Linguistics Team Lab at the University of Stuttgart.
## Introduction
This repository contains code and datasets for classifying citation intents in research papers.
### **Evaluation**
We plan to implement and use the ***f1_score*** metric to evaluate our classifier.
We implemented three classifiers and evaluated them on the test dataset:
- Perceptron Classifier - Baseline model (Implemented from scratch)
- Feedforward Neural Network Classifier (using [PyTorch](https://pytorch.org/))
- BiLSTM + Attention with ELMo Embeddings (using [AllenNLP](https://allennlp.org/) library)
This README focuses on running the code base, training the models, and making predictions. For more information about our project work, model results, and detailed error analysis, check [this](/14-final-report-Mandava-Riley.pdf) report. Slides from our mid-term presentation are available [here](/presentation.pdf).<br/>
For more information on Citation Intent Classification in Scientific Publications, follow this [link](https://arxiv.org/pdf/1904.01608.pdf) to the original published paper and its [GitHub repo](https://github.com/allenai/scicite).
## Environment & Setup
This project requires **Python 3.5 or greater**. We need to install `virtualenv` and create a virtual environment to run this project.
#### Installing virtualenv
```shell
python3 -m pip install --user virtualenv
```
#### Creating a virtual environment
**venv** (for Python 3) allows us to manage separate package installations for different projects.
```shell
python3 -m venv citation-env
```
#### Activating the virtual environment
Before we start installing or using packages in the virtual environment, we need to _activate_ it.
```shell
source citation-env/bin/activate
```
#### Leaving the virtual environment
To leave the virtual environment, simply run:
```shell
deactivate
```
After activating the Virtual Environment, the console should look like this:
```shell
(citation-env) [user@server ~]$
```
#### Cloning the Repository
```shell
git clone https://github.com/yelircaasi/citation-analysis.git
```
Now change the current working directory to the project root folder (`> cd citation-analysis`). <br />
**Note:** Stay in the Project root folder while running all the experiments.
#### Installing Packages
Now we can install all the packages required to run this project, listed in the [requirements.txt](/requirements.txt) file.
```shell
(citation-env) [user@server citation-analysis]$ pip install -r requirements.txt
```
#### Environment Variable for Saved Models Path
Run the line below in the console; we'll use this variable later on.
```shell
export SAVED_MODELS_PATH=/mount/arbeitsdaten/studenten1/team-lab-nlp/mandavsi_rileyic/saved_models
```
## Data
This project uses a large dataset of citation intents provided by the `SciCite` [GitHub repo](https://github.com/allenai/scicite). It can be downloaded from this [link](https://s3-us-west-2.amazonaws.com/ai2-s2-research/scicite/scicite.tar.gz). <br />
The dataset has three different intents/classes:
- background (background information)
- method (use of methods)
- result (comparing results)

**Dataset class distribution:**

| | background | method | result |
|:---|:---:|:---:|:---:|
| train | 4.8 K | 2.3 K | 1.1 K |
| dev | 0.5 K | 0.3 K | 0.1 K |
| test | 1 K | 0.6 K | 0.2 K |
## Methods (Classification)
### 1) Perceptron Classifier (Baseline Classifier)
We implemented the [Perceptron](https://en.wikipedia.org/wiki/Perceptron) as a baseline classifier, from scratch (including evaluation). The perceptron is a supervised learning algorithm for classification. It is a linear, binary classifier: it decides whether or not an input feature vector belongs to a specific class, and it is only capable of learning linearly separable patterns.
```python
class Perceptron:
def __init__(self, label: str, weights: dict, theta_bias: float):
def score(self, features: list):
def update_weights(self, features: list, learning_rate: float, penalize: bool, reward: bool):
class MultiClassPerceptron:
    def __init__(self, epochs: int = 5000, learning_rate: float = 1.0, random_state: int = 42):
    def fit(self, X_train: list, labels: list):
    def predict(self, X_test: list):
```
Since we have three different classes, we create a Perceptron object for each class. Each Perceptron has score and update functions. During training, for each data instance the model takes the score from each label's Perceptron and assigns the label with the maximum score. It then compares the assigned label with the true label and decides whether or not to update the weights (scaled by the learning rate).
Check the source [code](classifier/linear_model.py) for more details on the implementation of the Perceptron classifier.
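The decision rule is an argmax over per-label scores. A minimal sketch, assuming a `perceptron_dict` that maps each label to its trained `Perceptron` (as in `classifier/linear_model.py`); the helper name `predict_label` is ours:
```python
def predict_label(perceptron_dict: dict, features: list) -> str:
    # score the feature set under each label's perceptron
    scores = {label: p.score(features) for label, p in perceptron_dict.items()}
    # the predicted label is the one with the maximum score
    return max(scores, key=scores.get)
```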
### Running the Model
```shell
(citation-env) [user@server citation-analysis]$ python3 -m testing.model_testing
```
[Link](testing/model_testing.py) to the test source code. All the hyperparameters can be modified for experimentation.
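For example, `testing/model_testing.py` builds and runs the classifier as follows (`train_file_path` and `test_file_path` point at the TSV files under `data/tsv/`):
```python
from classifier.linear_model import MultiClassPerceptron
from utils.csv import read_csv_file

X_train_inst = read_csv_file(train_file_path, '\t')
labels = set(inst.true_label for inst in X_train_inst)
X_test_inst = read_csv_file(test_file_path, '\t')

clf = MultiClassPerceptron(epochs=50, learning_rate=0.7, random_state=101)
clf.fit(X_train=X_train_inst, labels=list(labels))
y_test = clf.predict(X_test_inst)
```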
### Evaluation
We used the ***f1_score*** metric for the evaluation of our baseline classifier.
> F1 score is the harmonic mean of precision and recall.
> The formula for F1 Score is:
> F1 = 2 * (precision * recall) / (precision + recall)
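A quick worked check with made-up numbers:
```python
precision, recall = 0.5, 0.4
f1 = 2 * (precision * recall) / (precision + recall)
print(round(f1, 3))  # 0.444
```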
@ -12,8 +107,128 @@ we plan to implement and use ***f1_score*** metric for evaluation of our classif
```python
eval.metrics.f1_score(y_true, y_pred, labels, average)
```
#### Parameters:
**Parameters**:
**y_true** : 1-d array or list of gold class values
**y_pred** : 1-d array or list of estimated values returned by a classifier
**labels** : list of labels/classes
**average** : string, one of [None, 'micro', 'macro']. If None, the scores for each class are returned.
[Link](eval/metrics.py) to the metrics source code.
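A usage sketch, borrowing the toy labels from `testing/model_testing.py`:
```python
from eval.metrics import f1_score

y_true = ['positive', 'positive', 'negative', 'negative']
y_pred = ['positive', 'negative', 'negative', 'positive']
# average=None returns one Result object per class
for result in f1_score(y_true, y_pred, ['positive', 'negative'], None):
    result.print_result()
```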
### Results
<img src="plots/perceptron/confusion_matrix_plot.png" width="600" height = "450" alt = "Confusion Matrix Plot" />
### 2) Feed-forward Neural Network Classifier (Baseline Classifier)
A feed-forward neural network classifier with a single hidden layer containing 9 units. While clearly not the ideal architecture for sequential text data, it provides a second baseline. Its input remains the same as the perceptron's; only the third model is suitable for more complex inputs such as word embeddings.
```python
class FeedForward(torch.nn.Module):
def __init__(self, input_size: int, hidden_size: int, output_size: int):
def forward(self, x: torch.nn.FloatTensor):
def read_data(self):
    def fit(self, epochs: int = 100, batch_size: int = 16, lr: float = 0.01,
samples: tuple = (1000, 1000, 1000)):
def predict(self):
def shuffle(self):
```
Check the source [code](classifier/nn_ff.py) for more details on the implementation of the feed-forward neural network.
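A minimal usage sketch, mirroring `testing/ff_model_testing.py` (28 input features, 9 hidden units, 3 classes):
```python
from classifier.nn_ff import FeedForward

clf = FeedForward(28, 9, 3)  # input, hidden, output sizes
clf.fit()                    # trains on the data loaded by read_data()
clf.predict()                # fills clf.preds with predicted class indices
```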
### Running the Model
```shell
(citation-env) [user@server citation-analysis]$ python3 -m testing.ff_model_testing
```
[Link](testing/ff_model_testing.py) to the test source code. All the hyperparameters can be modified for experimentation.
### Evaluation
As with the perceptron classifier, we used the ***f1_score*** metric for the evaluation of this baseline classifier.
### Results
<img src="plots/ffnn_model/confusion_matrix_plot_ff.png" width="600" height = "450" alt = "Confusion Matrix Plot" />
### 3) BiLSTM + Attention with ELMo (AllenNLP Model)
The Bi-directional Long Short-Term Memory (BiLSTM) model is built using the [AllenNLP](https://allennlp.org/) library. For word representations, we used 100-dimensional [GloVe](https://nlp.stanford.edu/projects/glove/) vectors trained on a corpus of 6B tokens from Wikipedia. For contextual representations, we used [ELMo](https://allennlp.org/elmo) embeddings, which were trained on a dataset of 5.5B tokens. This model uses the entire input text, as opposed to selected features in the text as in the first two models. It has a single-layer BiLSTM with a hidden dimension size of 50 for each direction.
We used AllenNLP's [Config Files](https://guide.allennlp.org/using-config-files) to build our model; we only need to implement a model and a dataset reader (plus a JSON config file).
Our BiLSTM AllenNLP model contains 4 major components:
1. Dataset Reader - [CitationDatasetReader](utils/reader.py)
- It reads the data from the file, tokenizes the input text and creates AllenNLP `Instances`
- Each `Instance` contains a dictionary of `tokens` and `label`
2. Model - [BiLstmClassifier](classifier/nn.py)
- The model's `forward()` method is called for every data instance by passing `tokens` and `label`
- The signature of `forward()` needs to match with field names of the `Instance` created by the DatasetReader
- This Model uses [ELMo](https://allennlp.org/elmo) deep contextualised embeddings.
- The `forward()` method finally returns an output dictionary with the predicted label, loss, softmax probabilities and so on...
3. Config File - [basic_model.json](configs/basic_model.json?raw=true)
- The AllenNLP Configuration file takes the constructor parameters for various objects (Model, DatasetReader, Predictor, ...)
- We can provide a number of Hyperparameters in this Config file.
- Depth and Width of the Network
- Number of Epochs
- Optimizer & Learning Rate
- Batch Size
- Dropout
- Embeddings
- All the classes that the config file uses must be registered using Python decorators (for example, `@Model.register('basic_bilstm_classifier')`); see the sketch after this list.
4. Predictor - [IntentClassificationPredictor](classifier/intent_predictor.py)
- AllenNLP uses `Predictor`, a wrapper around the trained model, for making predictions.
- The Predictor uses a pre-trained/saved model and dataset reader to predict new Instances
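A minimal sketch of the registration pattern these components rely on; the class and type names are the ones used in this repo, with the bodies elided:
```python
from allennlp.data.dataset_readers import DatasetReader
from allennlp.models import Model
from allennlp.predictors import Predictor

@DatasetReader.register('citation_dataset_reader')
class CitationDataSetReader(DatasetReader):
    ...  # reads JSON Lines and yields Instances with `tokens` and `label`

@Model.register('basic_bilstm_classifier')
class BiLstmClassifier(Model):
    ...  # forward(tokens, label) returns an output dict with loss and logits

@Predictor.register('citation_intent_predictor')
class IntentClassificationPredictor(Predictor):
    ...  # wraps the trained model for making predictions
```
The type strings must match the `"type"` entries in the config file (and the `--predictor` name passed on the command line).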
### Running the Model
AllenNLP provides `train`, `evaluate` and `predict` commands to interact with the models from the command line.
#### Training
```shell
$ allennlp train \
configs/basic_model.json \
-s $SAVED_MODELS_PATH/experiment_10 \
--include-package classifier
```
We ran a few experiments on this model; the run configurations, results and archived models are available in the `SAVED_MODELS_PATH` directory. <br />
**Note:** If no GPU cores are available, set `"cuda_device"` to `-1` in the [config file](configs/basic_model.json?raw=true); otherwise set it to the index of an available GPU core.
### Evaluation
To evaluate the model, simply run:
```shell
$ allennlp evaluate \
$SAVED_MODELS_PATH/experiment_4/model.tar.gz \
data/jsonl/test.jsonl \
--cuda-device 3 \
--include-package classifier
```
### Predictions
To make predictions, simply run:
```shell
$ allennlp predict \
$SAVED_MODELS_PATH/experiment_4/model.tar.gz \
data/jsonl/test.jsonl \
--cuda-device 3 \
--include-package classifier \
--predictor citation_intent_predictor
```
There is also another way to make predictions, without using the `allennlp predict` command. It returns the prediction list, softmax probabilities and further details useful for error analysis. Simply run the following command:
```shell
(citation-env) [user@server citation-analysis]$ python3 -m testing.bilstm_predict
```
Modify [this](testing/bilstm_predict.py) source file to run predictions on different experiments. It also saves the confusion matrix plot (as shown below) after prediction.
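The relevant lines in `testing/bilstm_predict.py` look roughly like this; the experiment directory below is a placeholder for your own saved model:
```python
import classifier.intent_predictor as pred
import eval.metrics as metrics

saved_model_dir = '/path/to/saved_models/experiment_4'  # change per experiment
y_pred, y_true = pred.load_model_and_predict_test_data(saved_model_dir)
confusion_matrix = metrics.get_confusion_matrix(y_true, y_pred)
metrics.plot_confusion_matrix(confusion_matrix,
                              "BiLSTM Classifier + Attention with ELMo",
                              saved_model_dir + '/confusion_matrix_plot.png')
```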
### Results
<img src="plots/bilstm_model/confusion_matrix_plot.png" width="600" height = "450" alt = "Confusion Matrix Plot" />
## References
[\[1\]](https://github.com/allenai/scicite) SciCite GitHub Repository<br />
This repository contains datasets and code for classifying citation intents; our project is based on this repository. <br /><br />
[\[2\]](https://s3-us-west-2.amazonaws.com/ai2-s2-research/scicite/scicite.tar.gz) SciCite Dataset <br />
Large dataset of citation intents <br /> <br />
[\[3\]](https://allennlp.org/tutorials) AllenNLP Library.<br />
An open-source NLP research library, built on PyTorch. <br /><br />
[\[4\]](https://allennlp.org/elmo) ELMo Embeddings<br />
Deep Contextualized word representations. <br /><br />
[\[5\]](https://guide.allennlp.org/) AllenNLP Guide<br />
A Guide to Natural Language Processing With AllenNLP. <br /><br />

@ -0,0 +1,3 @@
from .nn import *
from utils.reader import *
from .intent_predictor import *

@ -0,0 +1,105 @@
from typing import Dict, List, Tuple
from allennlp.common import JsonDict
from allennlp.data import Instance
from allennlp.predictors import Predictor
from overrides import overrides
from allennlp.models import Model
from allennlp.data.dataset_readers import DatasetReader
from allennlp.models.archival import load_archive
from utils.reader import DataReaderJsonLines, CitationDataSetReader
import os
@Predictor.register('citation_intent_predictor')
class IntentClassificationPredictor(Predictor):
"""
~~~Predictor for Citation Intent Classifier~~~
- This is just a wrapper class around AllenNLP Model
used for making predictions from the trained/saved model
"""
def predict(self, text: str, intent: str):
"""
This function can be called for each data point from the test dataset,
takes citation text and the target intent as parameters and
returns output dictionary from :func: `~classifier.nn.BiLstmClassifier.forward` method
:param text: Citation text from test data
:param intent: target intent of the data point
:return: returns output dictionary from Model's forward method
"""
return self.predict_json({"string": text, "label": intent})
@overrides
def _json_to_instance(self, json_dict: JsonDict) -> Instance:
"""
        We get a callback to this method from the AllenNLP Predictor,
        which passes a JsonDict with the data we passed to the predict_json function earlier.
        This callback should return an AllenNLP Instance with tokens and the target label.
:param json_dict: json dictionary data with text and intent label
:return: returns AllenNLP Instance with tokens(ELMo) and target label
"""
return self._dataset_reader.text_to_instance(json_dict["string"], json_dict["label"])
def make_predictions(model: Model, dataset_reader: DatasetReader, dataset_file_path: str) -> Tuple[list, list]:
"""
This function takes the pre-trained(saved) Model and DatasetReader(and dataset file path) as arguments
and returns a Tuple of prediction list and gold/true list.
- Creates a predictor object with the pre-trained model and dataset reader.
- Read the data from the passed dataset file path and for each data point, use predictor to predict the intent
:param model: a trained/saved AllenNLP Model
:param dataset_reader: Dataset reader object (for tokenizing text and creating Instances)
:param dataset_file_path: a dataset file path to make predictions
:return: returns a Tuple of prediction list and true labels list
"""
# Create predictor class object
predictor = IntentClassificationPredictor(model, dataset_reader)
prediction_list = []
true_list = []
# read JSON Lines file and Iterate through each datapoint to predict
jsonl_reader = DataReaderJsonLines(dataset_file_path)
for citation in jsonl_reader.read():
true_list.append(citation.intent)
output = predictor.predict(citation.text, citation.intent)
prediction_list.append(output['prediction'])
# returns prediction list and gold labels list - Tuple
return prediction_list, true_list
def load_model_and_predict_test_data(saved_model_dir: str):
"""
This function loads the saved model from the specified directory and calls make_predictions function.
:param saved_model_dir: path of the saved AllenNLP model (typically from IMS common space)
    :return: returns a tuple of the prediction list and the true-label list
"""
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
dev_file_path = project_root + '/data/jsonl/dev.jsonl'
test_file_path = project_root + '/data/jsonl/test.jsonl'
# load the archived/saved model
model_archive = load_archive(os.path.join(saved_model_dir, 'model.tar.gz'))
# create dataset reader object
citation_dataset_reader = CitationDataSetReader()
# make predictions
y_pred, y_true = make_predictions(model_archive.model, citation_dataset_reader, test_file_path)
return y_pred, y_true

@ -0,0 +1,240 @@
from utils.csv import DataInstance
from feature_extraction.features import FEATURE_LIST, THETA_BIAS_FEATURE
from collections import OrderedDict
import random
class Perceptron:
"""
Perceptron is an algorithm for supervised learning of binary classifiers,
    which can decide whether or not an input (set of features) belongs to some specific class.
It's a linear classifier, which makes predictions by combining weights with feature vector.
"""
def __init__(self, label: str, weights: dict, theta_bias: float):
"""
:type label: str
:type weights: dict
:type theta_bias: float
:param label: Label for the Perceptron Classifier (useful while dealing with Multi-Class Perceptron)
:param weights: dictionary of feature name and feature weights(random number)
:param theta_bias: value of the theta bias variable, threshold weight in other words
"""
self.label = label
self.weights = weights
self.theta_bias = theta_bias
def score(self, features: list):
"""
This function takes the list of features as parameter and
computes score by adding all the weights that corresponds to these features
:type features: list
:param features: list of features from a DataInstance
:return: returns the computed score
"""
score_val = 0
for feature in features:
score_val += self.weights[feature]
return score_val
def update_weights(self, features: list, learning_rate: float = 1.0, penalize: bool = False, reward: bool = False):
"""
This function is used to update weights during the training of the Perceptron Classifier.
It takes a list of features as parameter and updates(either increase or decrease) the
weights for these individual features based on learning rate parameter
:param features: list of features from Input DataInstance
:param learning_rate: Default is 1.0
:param penalize: If True, decreases the weights for each feature. Default is False
:param reward: If True, increases the weights for each feature. Default is False
- If both penalize and reward params are False, weights will not get updated.
- If both penalize and reward are True without a learning rate(or learning rate 1),
weights for the features remain the same.
"""
for feature in features:
feature_weight = self.weights[feature]
if penalize:
self.weights[feature] = round(feature_weight - (learning_rate * 1), 5)
if reward:
self.weights[feature] = round(feature_weight + (learning_rate * 1), 5)
class MultiClassPerceptron:
"""
    A Perceptron is a binary classifier: it can only separate two classes.
    A Multi-Class Perceptron can be used where each data instance takes one of several labels.
    The Multi-Class Perceptron creates one Perceptron classifier per label; during training
    it takes the score for each label (from that label's Perceptron) and
    predicts the label with the highest score.
    If the predicted label differs from the true label of the data instance,
    the model updates the weights as follows:
- decrease the weights for the Perceptron Classifier of predicted label (penalize)
- increase the weights for the Perceptron Classifier of true label (reward)
This model also shuffles the training data after each epoch.
"""
def __init__(self, epochs: int = 5, learning_rate: float = 1.0, random_state: int = 42):
"""
:type epochs: int
:type learning_rate: float
:type random_state: int
:param epochs: number of training iterations
:param learning_rate: learning rate for updating weights, Default is 1
:param random_state: random state for shuffling the data, useful for reproducing the results.
Default is 42.
"""
self.random_state = random_state
self.perceptron_dict = OrderedDict() # contains Key : label and value : Perceptron Object for label
self.epochs = epochs
self.learning_rate = learning_rate
def fit(self, X_train: list, labels: list):
"""
This function takes the training data and labels as parameters and trains the model
:type X_train: list[DataInstance]
:type labels: list[str]
:param X_train: list of training Data Instances
:param labels: list of classes
"""
# Check if labels parameter is empty and raise Exception
if labels is None or len(labels) <= 0:
raise Exception('The labels parameter must contain at least one label')
# Check if Training Data is empty and raise Exception
if X_train is None or len(X_train) <= 0:
raise Exception('Training data can\'t be Empty')
# Check the data type of training Instances
if not isinstance(X_train, list) and not isinstance(X_train[0], DataInstance):
raise Exception('Training Data must be a list of type DataInstance(model)')
train_len = len(X_train)
# Dictionary for storing label->Perceptron() objects, Create a new Perceptron object for each label
for label in labels:
sample_weights = get_sample_weights_with_features(theta_bias=-0.25, random_state=self.random_state)
self.perceptron_dict[label] = Perceptron(label, sample_weights, theta_bias=-0.25)
# Training Iterations
for epoch in range(self.epochs):
print('Training Epoch :: (', (epoch+1), '/', self.epochs, ')')
for i in range(train_len):
                # take the next training instance (the list is reshuffled after each epoch)
inst = X_train[i]
perceptron_scores = [] # list for storing perceptron scores for each label
for label, perceptron in self.perceptron_dict.items():
perceptron_scores.append(perceptron.score(inst.features))
# find the max score from the list of scores
max_score = max(perceptron_scores)
# find the label that corresponds to max score
label_max_score = labels[perceptron_scores.index(max_score)]
# if the label with max score is different from the label of this data instance,
# then decrease the weights(penalize) for the Perceptron of label with max score
# and increase the weights(reward) for the Perceptron of data instance label
if inst.true_label != label_max_score:
# decrease weights
self.perceptron_dict[label_max_score].update_weights(inst.features, self.learning_rate, penalize=True)
# increase weights
self.perceptron_dict[inst.true_label].update_weights(inst.features, self.learning_rate, reward=True)
# It's important to shuffle the list during every epoch
random.Random(self.random_state).shuffle(X_train)
def predict(self, X_test: list):
"""
This function takes testing instances as parameters and assigns a predicted label.
Takes the score from each Perceptron Classifier, label with the highest score is the predicted label
:param X_test: list of test data instances
:return: list of predicted labels
"""
if X_test is None or len(X_test) <= 0:
raise Exception('Testing Data cannot be empty')
print('Predicting..... ')
y_test = []
labels = list(self.perceptron_dict.keys())
for test_inst in X_test:
perceptron_scores = [] # list for storing perceptron scores for each label
for label in labels:
perceptron_scores.append(self.perceptron_dict[label].score(test_inst.features))
# find the max score from the list of scores
max_score = max(perceptron_scores)
label_max_score = labels[perceptron_scores.index(max_score)]
y_test.append(label_max_score)
return y_test
def get_class_scores(self, X_test: list):
"""
        This function takes testing instances as parameters and returns the raw
        per-label perceptron scores (not normalized probabilities) for each instance.
        :param X_test: list of test data instances
        :return: list of per-label score lists
"""
if X_test is None or len(X_test) <= 0:
raise Exception('Testing Data cannot be empty')
print('Predicting..... ')
y_test = []
labels = list(self.perceptron_dict.keys())
for test_inst in X_test:
perceptron_scores = [] # list for storing perceptron scores for each label
for label in labels:
perceptron_scores.append(self.perceptron_dict[label].score(test_inst.features))
# find the max score from the list of scores
#max_score = max(perceptron_scores)
#label_max_score = labels[perceptron_scores.index(max_score)]
y_test.append(perceptron_scores)
return y_test
def get_sample_weights_with_features(theta_bias: float = 0.0, random_state: int = 42):
"""
This function creates a dictionary with feature as a key and a random floating number (feature weight) as value.
Weights for each feature is a floating number between -1 and 1
:type theta_bias: float
:type random_state: int
:param theta_bias: value of theta bias variable
:param random_state: random seed number for reproducing the results
:return: returns a dictionary of random weights for each feature
"""
weights = {THETA_BIAS_FEATURE: theta_bias}
    # seed once so the whole weight sequence is reproducible; re-seeding inside
    # the loop would assign every feature the same weight
    random.seed(random_state)
    for feature in FEATURE_LIST:
        weights[feature] = round(random.uniform(-1.0, 1.0), 5)
return weights

@ -0,0 +1,211 @@
from typing import Dict
import numpy as np
import torch
from allennlp.common.checks import ConfigurationError
from allennlp.data import Vocabulary
from allennlp.models import Model
from allennlp.modules import TextFieldEmbedder, Seq2SeqEncoder, FeedForward, Elmo
from allennlp.nn import util
from allennlp.training.metrics import CategoricalAccuracy, F1Measure
from overrides import overrides
from torch.nn import Parameter
@Model.register("basic_bilstm_classifier")
class BiLstmClassifier(Model):
"""
Two things to note first:
- This BiLstmClassifier is a subclass of AllenNLP's Model class
- This class registers the type "basic_bilstm_classifier" using @Model.register() decorator,
this is required for the Config file to identify the Model class.
AllenNLP Model is similar to PyTorch Module, it implements forward() method and returns an output dictionary
with loss, logits and more....
The constructor parameters should match with configuration in the config file, the Vocabulary is composed by
the library or train pipeline after reading data using Dataset Reader.
In this model, we used Elmo embeddings, 1 layer BiLSTM (encoder) and 2 Feed-forward layers.
The train command/pipeline calls the forward method for a batch of Instances,
and the forward method returns the output dictionary with loss, logits, label and F1 metrics
"""
def __init__(self, vocab: Vocabulary,
text_field_embedder: TextFieldEmbedder,
encoder: Seq2SeqEncoder,
classifier_feedforward: FeedForward,
elmo: Elmo = None,
use_input_elmo: bool = False):
super().__init__(vocab)
self.elmo = elmo
self.use_elmo = use_input_elmo
self.text_field_embedder = text_field_embedder
self.num_classes = self.vocab.get_vocab_size("labels")
self.encoder = encoder
self.classifier_feed_forward = classifier_feedforward
self.label_accuracy = CategoricalAccuracy()
self.label_f1_metrics = {}
# create F1 Measures for each class
for i in range(self.num_classes):
self.label_f1_metrics[vocab.get_token_from_index(index=i, namespace="labels")] = \
F1Measure(positive_label=i)
self.loss = torch.nn.CrossEntropyLoss()
self.attention = Attention(encoder.get_output_dim())
@overrides
def forward(self, tokens: Dict[str, torch.LongTensor],
label: torch.LongTensor) -> Dict[str, torch.LongTensor]:
"""
The training loop takes a batch of Instances and passes it to the forward method
:param tokens: tokens from the Instance
:param label: label from the data Instance
:return: returns an output dictionary after forwarding inputs to the model
"""
input_elmo = None
# pop the "elmo" key and add it later
elmo_tokens = tokens.pop("elmo", None)
embedded_text = self.text_field_embedder(tokens)
text_mask = util.get_text_field_mask(tokens)
if elmo_tokens is not None:
tokens["elmo"] = elmo_tokens
# Create ELMo embeddings if applicable
if self.elmo:
if elmo_tokens is not None:
# get elmo representations from Tokens
elmo_representations = self.elmo(elmo_tokens["elmo_tokens"])["elmo_representations"]
if self.use_elmo:
input_elmo = elmo_representations.pop()
assert not elmo_representations
else:
raise ConfigurationError("Model was built to use Elmo, but input text is not tokenized for Elmo.")
if self.use_elmo:
if embedded_text is not None:
embedded_text = torch.cat([embedded_text, input_elmo], dim=-1)
else:
embedded_text = input_elmo
# pass the embedded text to the LSTM encoder
encoded_text = self.encoder(embedded_text, text_mask)
# Attention
attn_dist, encoded_text = self.attention(encoded_text, return_attn_distribution=True)
output_dict = {}
if label is not None:
logits = self.classifier_feed_forward(encoded_text)
# Probabilities from Softmax
class_probabilities = torch.nn.functional.softmax(logits, dim=1)
output_dict["logits"] = logits
# loss calculation
loss = self.loss(logits, label)
output_dict["loss"] = loss
# compute F1 per label
for i in range(self.num_classes):
metric = self.label_f1_metrics[self.vocab.get_token_from_index(index=i, namespace="labels")]
metric(class_probabilities, label)
output_dict['label'] = label
output_dict['tokens'] = tokens['tokens']
return output_dict
@overrides
def make_output_human_readable(self, output_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
"""
The predict command/pipeline calls this method with the output dictionary from forward() method.
The returned output dictionary will also be printed in the console when the predict command is executed
:param output_dict: output dictionary
:return: returns human readable output dictionary
"""
class_probabilities = torch.nn.functional.softmax(output_dict['logits'], dim=-1)
predictions = class_probabilities.cpu().data.numpy()
argmax_indices = np.argmax(predictions, axis=-1)
# get the label from vocabulary
label = [self.vocab.get_token_from_index(x, namespace="labels")
for x in argmax_indices]
output_dict['probabilities'] = class_probabilities
output_dict['positive_label'] = label
output_dict['prediction'] = label
        # return output dictionary
return output_dict
@overrides
def get_metrics(self, reset: bool = False) -> Dict[str, float]:
"""
This method gets a call from the train pipeline,
and the returned metrics dictionary will be printed in the Console while Training.
The returned metrics dictionary contains class-wise F1 Scores, Average F1 score and loss
:param reset: boolean
:return: returns a metrics dictionary with Class Level F1 scores and losses
"""
metric_dict = {}
sum_f1 = 0.0
for name, metric in self.label_f1_metrics.items():
metric_val = metric.get_metric(reset)
metric_dict[name + '_F1'] = metric_val[2]
if name != 'none': # do not consider `none` label in averaging F1
sum_f1 += metric_val[2]
names = list(self.label_f1_metrics.keys())
total_len = len(names) if 'none' not in names else len(names) - 1
average_f1 = sum_f1 / total_len
metric_dict['AVG_F1_Score'] = average_f1
return metric_dict
def new_parameter(*size):
out = Parameter(torch.FloatTensor(*size))
torch.nn.init.xavier_normal_(out)
return out
class Attention(torch.nn.Module):
""" Simple multiplicative attention"""
def __init__(self, attention_size):
super(Attention, self).__init__()
self.attention = new_parameter(attention_size, 1)
def forward(self, x_in, reduction_dim=-2, return_attn_distribution=False):
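        # x_in: (batch, seq_len, hidden); each position is scored against a learned attention vector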
# calculate attn weights
attn_score = torch.matmul(x_in, self.attention).squeeze()
# add one dimension at the end and get a distribution out of scores
attn_distrib = torch.nn.functional.softmax(attn_score.squeeze(), dim=-1).unsqueeze(-1)
scored_x = x_in * attn_distrib
weighted_sum = torch.sum(scored_x, dim=reduction_dim)
if return_attn_distribution:
return attn_distrib.reshape(x_in.shape[0], -1), weighted_sum
else:
return weighted_sum

@ -0,0 +1,128 @@
"""
Simple feed-forward neural network in PyTorch for baseline results on Scicite data.
Created: July 5th, 2020
"""
import torch
from utils.nn_reader import read_csv_nn
import numpy as np
class FeedForward(torch.nn.Module):
"""
Creates and trains a basic feedforward neural network.
"""
def __init__(self, input_size, hidden_size, output_size):
""" Sets up all basic elements of NN. """
super(FeedForward, self).__init__()
self.input_size = input_size
self.hidden_size = hidden_size
self.output_size = output_size
self.fc1 = torch.nn.Linear(self.input_size, self.hidden_size)
self.tanh = torch.nn.Tanh()
self.fc2 = torch.nn.Linear(self.hidden_size, self.output_size)
self.sigmoid = torch.nn.Sigmoid()
self.softmax = torch.nn.Softmax(dim=1)
self.read_data()
def forward(self, x):
""" Computes output from a given input x. """
hidden = self.fc1(x)
tanh = self.tanh(hidden)
output = self.fc2(tanh)
output = self.softmax(output)
return output
def read_data(self):
"""" Reads in training and test data and converts it to proper format. """
self.X_train_, self.y_train_, self.X_test, self.y_test_ = read_csv_nn()
self.X_test = torch.FloatTensor(self.X_test)
yclass = np.array([(x[1] == 1) + 2 * (x[2] == 1) for x in self.y_train_])
is0 = yclass == 0
is1 = yclass == 1
is2 = yclass == 2
self.X0 = torch.FloatTensor(self.X_train_[is0])
self.X1 = torch.FloatTensor(self.X_train_[is1])
self.X2 = torch.FloatTensor(self.X_train_[is2])
self.y0 = torch.LongTensor(np.zeros((sum(is0),)))
self.y1 = torch.LongTensor(np.ones((sum(is1),)))
self.y2 = torch.LongTensor(2 * np.ones((sum(is2),)))
self.l0 = sum(is0)
self.l1 = sum(is1)
self.l2 = sum(is2)
self.y_test = (self.y_test_[:, 1] == 1) + 2 * (self.y_test_[:, 2] == 1)
def fit(self, epochs=100, batch_size=16, lr=0.01, samples=(1000, 1000, 1000)):
""" Trains model, using cross entropy loss and SGD optimizer. """
self.criterion = torch.nn.CrossEntropyLoss()
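        # note: forward() applies softmax, while CrossEntropyLoss applies log-softmax
        # internally, so the loss here is effectively computed on softmax outputs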
self.optimizer = torch.optim.SGD(self.parameters(), lr)
self.samples0, self.samples1, self.samples2 = samples
self.eval() # put into eval mode
# initialize training data
self.shuffle()
y_pred = self.forward(self.X_train)
before_train = self.criterion(y_pred, self.y_train)
        print('Training loss before training', before_train.item())
# setup for batches
        l = self.samples0 + self.samples1 + self.samples2  # total number of sampled instances
        # build (start, end) index pairs for the mini-batches
        batch_indices = list(zip(list(range(0, l, batch_size))[:-1], list(range(batch_size, l, batch_size))))
        batch_indices[-1] = (batch_indices[-1][0], l)
# train model
self.train() # put into training mode
for epoch in range(epochs):
batch = 0
for a, b in batch_indices:
self.optimizer.zero_grad()
# forward pass
y_pred = self.forward(self.X_train[a:b])
loss = self.criterion(y_pred, self.y_train[a:b])
# backward pass
loss.backward()
self.optimizer.step()
batch += 1
# get loss following epoch
y_pred = self.forward(self.X_train)
loss = self.criterion(y_pred, self.y_train)
print('Epoch {}: train loss: {}'.format(epoch, loss.item()))
# shuffle dataset
self.shuffle()
# display final loss
self.eval() # back to eval mode
y_pred = self.forward(self.X_train)
after_train = self.criterion(y_pred, self.y_train)
print('Training loss after training', after_train.item())
def predict(self):
""" Generates predictions from model, using test data. """
# post-process to get predictions & get back to np format
y_pred = self.forward(self.X_test)
y_pred_np = y_pred.detach().numpy()
predmax = np.amax(y_pred_np, axis=1)
self.preds = 1 * (y_pred_np[:, 1] == predmax) + 2 * (y_pred_np[:, 2] == predmax)
self.probs = y_pred.detach().numpy()
def shuffle(self):
""" Samples and shuffles training data. """
# create permutations for shuffling
p0 = torch.randperm(self.l0)
p1 = torch.randperm(self.l1)
p2 = torch.randperm(self.l2)
n = self.samples0 + self.samples1 + self.samples2
p = torch.randperm(n)
# sample and shuffle data
self.X_train = \
torch.cat((self.X0[p0][:self.samples0], self.X1[p1][:self.samples1], self.X2[p2][:self.samples2]))[p]
self.y_train = torch.cat((self.y0[:self.samples0], self.y1[:self.samples1], self.y2[:self.samples2]))[p]

@ -0,0 +1,57 @@
{
"dataset_reader": {
"type": "citation_dataset_reader"
},
"train_data_path": "data/jsonl/train.jsonl",
"validation_data_path": "data/jsonl/dev.jsonl",
"test_data_path": "data/jsonl/test.jsonl",
"evaluate_on_test": true,
"model": {
"type": "basic_bilstm_classifier",
"text_field_embedder": {
"token_embedders": {
"tokens": {
"pretrained_file": "/mount/arbeitsdaten/studenten1/team-lab-nlp/mandavsi_rileyic/glove.6B.100d.txt.gz",
"type": "embedding",
"embedding_dim": 100,
"trainable": false
}
}
},
"encoder": {
"type": "lstm",
"input_size": 1124,
"hidden_size": 100,
"num_layers": 1,
"bidirectional": true
},
"elmo": {
"options_file": "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json",
"weight_file": "/mount/arbeitsdaten/studenten1/team-lab-nlp/mandavsi_rileyic/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5",
"do_layer_norm": true,
"dropout": 0.5,
"num_output_representations": 1
},
"use_input_elmo": true,
"classifier_feedforward": {
"input_dim": 200,
"num_layers": 2,
"hidden_dims": [20, 3],
"activations": ["linear", "linear"]
}
},
"data_loader": {
"batch_sampler": {
"type": "bucket",
"batch_size" : 16
}
},
"trainer": {
"optimizer": {
"type": "adagrad",
"lr": 0.005
},
"num_epochs": 10,
"cuda_device": 3
}
}

@ -1,4 +1,8 @@
import utils.constants as const
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import numpy as np
import itertools
def f1_score(y_true, y_pred, labels, average):
@ -163,6 +167,54 @@ def calculate_f1_score(precision, recall):
return 2 * (precision * recall) / (precision + recall)
def get_confusion_matrix(y_true, y_pred):
"""
takes predicted labels and true labels as parameters and returns Confusion Matrix
    - uses sklearn's metrics functions
:param y_true: True labels
:param y_pred: Predicted labels
:return: returns Confusion Matrix
"""
return confusion_matrix(y_true, y_pred, labels=const.CLASS_LABELS_LIST)
def plot_confusion_matrix(confusion_mat, classifier_name, plot_file_name):
"""
Saves the confusion matrix plot with the specified file name
:param confusion_mat: takes Confusion Matrix as an argument
:param classifier_name: Classifier name
:param plot_file_name: file name (with path) to save
"""
plt.figure(figsize=(8, 6))
plt.imshow(confusion_mat, interpolation='nearest', cmap=plt.get_cmap('Blues'))
plt.title(classifier_name)
plt.colorbar()
target_names = const.CLASS_LABELS_LIST
if target_names is not None:
tick_marks = np.arange(len(target_names))
plt.xticks(tick_marks, target_names, rotation=45)
plt.yticks(tick_marks, target_names)
thresh = confusion_mat.max() / 2
for i, j in itertools.product(range(confusion_mat.shape[0]), range(confusion_mat.shape[1])):
plt.text(j, i, "{:,}".format(confusion_mat[i, j]),
horizontalalignment="center",
color="white" if confusion_mat[i, j] > thresh else "black")
plt.tight_layout(1.5)
plt.ylabel('True/Gold')
plt.xlabel('Predicted')
plt.savefig(plot_file_name)
print('Confusion Matrix Plot saved to :: ', plot_file_name)
class Result:
"""
Model Class for carrying Evaluation Data (F1 Score, Precision, Recall, ....)
@ -177,4 +229,9 @@ class Result:
def print_result(self):
""" Prints F1 Score"""
print('F1 Score :: ', self.f1_score, ' Label :: ', self.label, ' Average :: ', self.average)
print_line = 'F1 Score :: ' + str(self.f1_score)
if self.label:
print_line += ' Label :: ' + self.label
if self.average:
print_line += ' Average :: ' + self.average
print(print_line)

@ -0,0 +1,14 @@
from allennlp.modules.elmo import Elmo, batch_to_ids
weights_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'
options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json'
elmo = Elmo(options_file, weights_file, 1, dropout=0)
text = ['Backgammon', 'is', 'one', 'of', 'the', 'oldest', 'known', 'board', 'games']
# batch_to_ids expects a list of tokenized sentences, so wrap the sentence in a list
batch = batch_to_ids([text])
print(batch)
output = elmo(batch)
print(output['elmo_representations'])

@ -0,0 +1,48 @@
import feature_extraction.lexicons as lexicons
from utils.constants import REGEX_CONSTANTS
""" List of supported features for feature extraction from Input String """
FEATURE_LIST = ['COMPARE', 'CONTRAST', 'RESULT', 'INCREASE', 'CHANGE', 'USE', 'PRESENT',
'IMPORTANT', 'RESEARCH', 'APPROACH', 'PUBLIC', 'BEFORE', 'BETTER_SOLUTION',
'PROFESSIONALS', 'MEDICINE', 'MATH', 'COMPUTER_SCIENCE', 'CITATION',
'ACRONYM', 'CONTAINS_YEAR', 'SEQUENCE', 'REFERENCE', 'PERCENTAGE',
'CONTAINS_URL', 'ENDS_WITH_RIDE', 'ENDS_WITH_RINE', 'ENDS_WITH_ETHYL']
""" Feature Name for Theta Bias -- need to add it to the list of features for all data instances """
THETA_BIAS_FEATURE = 'THETA_BIAS'
def extract_features_from_text(text: str):
"""
This function takes text string as input, extracts and returns a list of features by checking each word in
:`~feature_extraction.lexicons.ALL_LEXICONS`
:param text: takes string text as param
:return: returns a list of extracted features from the text, empty list for no features
"""
# ALL_LEXICONS
lexicon_dict = lexicons.ALL_LEXICONS
# Initialize the feature list with Theta Bias feature, this feature must be added to all data instances
text_feature_list = [THETA_BIAS_FEATURE]
# Iterate through the list features and get list of words from the lexicon dictionary,
# for each word in the word list, check if it appears in input text and add it to the text feature list
for feature in FEATURE_LIST:
# If the feature is Regex Pattern Match, get the pattern from :`~utils.constants.REGEX_CONSTANTS`
# and match it with the input text
if feature in REGEX_CONSTANTS:
pattern = REGEX_CONSTANTS[feature]
if bool(pattern.search(text)):
text_feature_list.append(feature)
continue
# If the feature is not Regex Pattern Match, then get the list of dictionary words from lexicon dictionary
word_list = lexicon_dict[feature]
for word in word_list:
if word in text.lower():
text_feature_list.append(feature)
break
return text_feature_list
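
# Hypothetical usage (the exact features depend on the lexicons and regex constants above):
#   extract_features_from_text('We compare our results with Smith et al.')
#   -> ['THETA_BIAS', 'COMPARE', 'RESULT', ...]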

@ -0,0 +1,74 @@
"""
Dictionary of Lexicons used for Feature Extraction
"""
ALL_LEXICONS = {
'COMPARE': ['compar', 'compet', 'evaluat', 'test', 'superior', 'inferior', 'better', 'best', 'good', 'low',
'wors', 'great', 'larger', 'faster', 'high', 'measur', 'between', 'another', 'similar'],
    'CONTRAST': ['contrast', 'different', 'distinct', 'conflict', 'disagree', 'oppose', 'distinguish', 'contrary'],
'RESULT': ['estimate', 'evidence', 'experiment', 'find', 'progress', 'observation', 'outcome', 'result', 'performance'],
'INCREASE': ['increase', 'grow', 'intensify', 'build up', 'explode'],
'CHANGE': ['adapt', 'adjust', 'augment', 'combine', 'change', 'decrease', 'elaborate', 'expand', 'expand on',
'extend', 'derive', 'incorporate', 'increase', 'manipulate', 'modify', 'optimize', 'optimise', 'refine',
'render', 'replace', 'revise', 'substitute', 'tailor', 'upgrade', 'grow'],
'USE': ['use', 'using', 'apply', 'applied', 'employ', 'make use', 'utilize', 'implement'],
'PRESENT': ['describe', 'discuss', 'give', 'introduce', 'note', 'notice', 'present', 'propose', 'recapitulate',
'demonstrate', 'remark', 'report', 'say', 'show', 'sketch', 'state', 'suggest', 'figure', 'indicate',
'specify', 'explain'],
'IMPORTANT': ['important', 'main', 'key', 'basic', 'central', 'crucial', 'critical', 'essential', 'fundamental',
'great', 'largest', 'major', 'overall', 'primary', 'principle', 'serious', 'substantial', 'ultimate',
'significant', 'remarkable', 'noteworthy', 'crucial', 'emerge'],
'RESEARCH': ['research', 'paper', 'study', 'studie', 'apply', 'analyze', 'characteri', 'formali', 'investigat',
'implement', 'interpret', 'examin', 'observ', 'predict', 'verify', 'work on', 'empirical', 'determin',
'experiment', 'exploratory', 'ongoing', 'quantitative', 'qualitative', 'preliminary', 'statistical',
'knowledge', 'underway', 'discuss', 'reference', 'publish', 'document', 'orientation',
'literature', 'experience'],
'APPROACH': ['approach', 'account', 'algorithm', 'analys', 'approach', 'application', 'architecture', 'characteri',
'component', 'design', 'extension', 'formali', 'framework', 'implement', 'investigat', 'machine',
'method', 'methodology', 'module', 'process', 'procedure', 'program', 'prototype', 'strateg',
'system', 'technique', 'theory', 'tool', 'treatment'],
'PUBLIC': ['acknowledge', 'admit', 'agree', 'assert', 'claim', 'complain', 'declare', 'deny', 'explain',
'hint', 'insist', 'mention', 'proclaim', 'promise', 'protest', 'remark', 'reply', 'report', 'say',
'suggest', 'swear', 'write'],
'BEFORE': ['earlier', 'initial', 'past', 'previous', 'prior'],
'BETTER_SOLUTION': ['boost', 'enhance', 'defeat', 'improve', 'perform better', 'outperform', 'outweigh', 'surpass'],
'PROFESSIONALS': ['colleagues', 'community', 'computer scientists', 'computational linguists', 'discourse analysts',
'expert', 'investigators', 'linguists', 'philosophers', 'psycholinguists',
'psychologists', 'researchers', 'scholars', 'semanticists', 'scientists'],
'MEDICINE': ['medicine', 'tissue', 'gene', 'inflammatory', 'mutant', 'neuro', 'digest', 'ortho', 'kinase',
'clinical', 'therap', 'kidney', 'receptor', 'cancer', 'synthesis', 'protein', 'syndrom', 'toxin', 'death',
'pharma', 'heart', 'disease', 'vitamin', 'tumor', 'blind', 'symptom', 'medical', 'vaccin', 'molecule',
'biotic', 'patient', 'cells', 'immune', 'blood', 'plasma', 'diagnos', 'neura', 'reproductive', 'plasm', 'drug',
'membrane', 'muscle', 'contagious', 'inflam', 'physician', 'dna', 'genome', 'bacteria', 'cavity', 'injury',
'antibodies', 'liver', 'treatment', 'pcr', 'acid', 'chronic', 'respirat', 'oxygen', 'stroke', 'antioxidant', 'obesity',
'metabolic', 'transmission', 'endogenous', 'syndrome', 'ultrasound', 'pathogen', 'inject', 'laparoscop',
'circulat', 'ventricle', 'tract', 'pneumonia', 'calcium', 'rna', 'organism', 'biolog', 'x-ray'],
'MATH': ['matrix', 'gaussian', 'variance', 'radius', 'function', 'comput', 'once', 'twice', 'thrice', 'diagram', 'mean',
'vector', 'rectangle', 'logic', 'amount', 'maxim', 'minim', 'linear', 'magnitude', 'theorem', 'gradient', 'median',
'exponential', 'complex', 'graph', 'mean', 'equation', 'offset', 'calculat', 'coefficient', 'discrete', 'equation',
'frequen', 'math', 'correlation', 'outcome', 'divergence', 'differentiation', 'statistic', 'parameter',
'probabilit', 'multivariate', 'negative', 'positive', 'regression', 'digit'],
'COMPUTER_SCIENCE': ['database', 'software', 'evaluation', 'framework', 'computer', 'network',
                         'algorithm', 'dataset', 'data sets', 'technology', 'kernel', 'metrics', 'nlp', 'xml',
'corpus', 'uml', 'system', 'security', 'protocol', 'classification', 'data transform',
'memory', 'java', 'python', 'cluster', 'epoch', 'training', 'deadlock', 'technique'],
'CITATION': ['et al']
}

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

@ -0,0 +1,10 @@
allennlp==1.0.0
jsonlines==1.2.0
matplotlib==3.3.0
numpy==1.19.0
overrides==3.0.0
scikit-learn==0.23.1
six==1.15.0
spacy==2.2.4
torch==1.5.1
torchvision==0.6.1

@ -0,0 +1,2 @@
from utils.reader import *
from classifier.nn import *

@ -0,0 +1,14 @@
import classifier.intent_predictor as pred
import eval.metrics as metrics
saved_model_dir = '/mount/arbeitsdaten/studenten1/team-lab-nlp/mandavsi_rileyic/saved_models/experiment_4'
y_pred, y_true = pred.load_model_and_predict_test_data(saved_model_dir)
confusion_matrix = metrics.get_confusion_matrix(y_true, y_pred)
print("Confusion Matrix :: ")
print(confusion_matrix)
plot_file_path = saved_model_dir+'/confusion_matrix_plot.png'
metrics.plot_confusion_matrix(confusion_matrix, "BiLSTM Classifier + Attention with ELMo", plot_file_path)

@ -1,13 +1,11 @@
from eval.metrics import f1_score
import utils.constants as const
from sklearn.metrics import f1_score as f1
import os
from utils.csv import read_csv_file
y_true = ['positive', 'positive', 'negative', 'negative', 'positive', 'positive', 'negative', 'negative', 'positive', 'positive', 'negative', 'negative', 'positive', 'positive', 'negative', 'negative']
y_pred = ['positive', 'negative', 'negative', 'positive', 'positive', 'negative', 'negative', 'positive', 'positive', 'negative', 'negative', 'positive', 'positive', 'negative', 'negative', 'negative']
result_list = f1_score(y_true, y_pred, ['positive', 'negative'], const.AVG_MICRO)
result_list = f1_score(y_true, y_pred, ['positive', 'negative'], None)
for result in result_list:
result.print_result()
@ -21,12 +19,3 @@ for result in result_list:
print('SK Learn F1 Score (MACRO):: ', f1(y_true, y_pred, ['positive', 'negative'], average='macro'))
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
train_file_path = project_root+'/data/tsv/train.tsv'
print(train_file_path)
data = read_csv_file(csv_file_path=train_file_path, delimiter='\t')
for inst in data[:5]:
inst.print()

@ -0,0 +1,27 @@
import os
from utils.csv import read_csv_file
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
train_file_path = project_root+'/data/tsv/train.tsv'
test_file_path = project_root+'/data/tsv/test.tsv'
print(train_file_path)
data = read_csv_file(csv_file_path=train_file_path, delimiter='\t')
i = 0
feature_dict = {}
for inst in data[:20]:
inst.print()
# print('Data Points without Features :: ', i)
# tokens = inst.text.split()
# for token in tokens:
# if token not in feature_dict:
# feature_dict[token] = 1
# continue
# feature_dict[token] += 1
#
# for key in sorted(feature_dict, key=feature_dict.get, reverse=True):
# print(key, ' -> ', feature_dict.get(key))

@ -0,0 +1,34 @@
import sys
import os
sys.path.append(os.getcwd())
from classifier.nn_ff import FeedForward
from sklearn.metrics import f1_score
from eval.metrics import plot_confusion_matrix, get_confusion_matrix
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
clf = FeedForward(28, 9, 3)
clf.fit()
clf.predict()
# predict
y_test = clf.preds
y_true = clf.y_test
# Model Evaluation
labels = set(['background', 'method', 'result'])
f1_score_micro = f1_score(y_true, y_test, average='micro')
f1_score_macro = f1_score(y_true, y_test, average='macro')
# Print F1 Score
print('F1 score (micro): ', f1_score_micro)
print('F1 score (macro): ', f1_score_macro)
# plot confusion matrix
classdict = {0: 'background', 1: 'method', 2: 'result'}
y_test = [classdict[x] for x in y_test]
y_true = [classdict[x] for x in y_true]
plot_path = project_root + '/plots/confusion_matrix_plot_ff.png'
plot_confusion_matrix(get_confusion_matrix(y_true, y_test), 'Feed-forward NN Classifier (Baseline)', plot_path)

@ -0,0 +1,45 @@
from classifier.linear_model import MultiClassPerceptron
from utils.csv import read_csv_file
from eval.metrics import f1_score, plot_confusion_matrix, get_confusion_matrix
import utils.constants as const
import os
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
train_file_path = project_root + '/data/tsv/train.tsv'
test_file_path = project_root + '/data/tsv/test.tsv'
# Read the training dataset
X_train_inst = read_csv_file(train_file_path, '\t')
# set of labels from Training data
labels = {inst.true_label for inst in X_train_inst}
# Read test data set
X_test_inst = read_csv_file(test_file_path, '\t')
# number of training iterations
epochs = 50
# create MultiClassPerceptron classifier object
clf = MultiClassPerceptron(epochs=epochs, learning_rate=0.7, random_state=101)
# train the model
clf.fit(X_train=X_train_inst, labels=list(labels))
# predict
y_test = clf.predict(X_test_inst)
y_true = [inst.true_label for inst in X_test_inst]
# Model Evaluation
f1_score_micro = f1_score(y_true, y_test, labels, const.AVG_MICRO)
f1_score_macro = f1_score(y_true, y_test, labels, const.AVG_MACRO)
f1_score_none = f1_score(y_true, y_test, labels, None)
# Print F1 Score
for result in f1_score_micro + f1_score_macro + f1_score_none:
    result.print_result()
# plot confusion matrix
plot_path = project_root + '/plots/confusion_matrix_plot.png'
plot_confusion_matrix(get_confusion_matrix(y_true, y_test), 'Perceptron Classifier (Baseline)', plot_path)

@ -0,0 +1,66 @@
#import os
#os.chdir('/Users/iriley/code/citation-analysis')
import sys
sys.path.append('/Users/iriley/code/citation-analysis')
from classifier.linear_model import MultiClassPerceptron
from sklearn.metrics import confusion_matrix as cm
from utils.csv import read_csv_file
from eval.metrics import f1_score
import utils.constants as const
import pandas as pd
import numpy as np
train_file_path = '/Users/iriley/code/citation-analysis/data/tsv/train.tsv'
dev_file_path = '/Users/iriley/code/citation-analysis/data/tsv/test.tsv'
# Read the training dataset
X_train_inst = read_csv_file(train_file_path, '\t')
# set of labels from Training data
labels = {inst.true_label for inst in X_train_inst}
# Read test data set
X_dev_inst = read_csv_file(dev_file_path, '\t')
# number of training iterations
epochs = 50
# create MultiClassPerceptron classifier object
clf = MultiClassPerceptron(epochs=epochs, learning_rate=0.5, random_state=101)
# train the model
clf.fit(X_train=X_train_inst, labels=list(labels))
# predict
y_pred = clf.predict(X_dev_inst)
y_scores = np.array(clf.get_class_scores(X_dev_inst))
y_true = [inst.true_label for inst in X_dev_inst]
labeldict = {'background': 0, 'method': 1, 'result': 2}
y_pred = np.array([labeldict[x] for x in y_pred])
y_true = np.array([labeldict[x] for x in y_true])
conmat = cm(y_true, y_pred)
df = pd.DataFrame()
df['pred'] = y_pred
df['true'] = y_true
df['correct'] = y_pred==y_true
df['score0'] = np.round(y_scores[:,0],3)
df['score1'] = np.round(y_scores[:,1],3)
df['score2'] = np.round(y_scores[:,2],3)
df.to_csv('/Users/iriley/code/machine_learning/lab2020/y_pred_model1.csv', index=False)
## Model Evaluation
#f1_score_micro = f1_score(y_true, y_pred, labels, const.AVG_MICRO)
#f1_score_macro = f1_score(y_true, y_pred, labels, const.AVG_MACRO)
#f1_score_none = f1_score(y_true, y_pred, labels, None)
## Print F1 Score
#for result in f1_score_micro + f1_score_macro + f1_score_none:
# result.print_result()

@ -1,2 +1,37 @@
import re
AVG_MICRO = 'MICRO'
AVG_MACRO = 'MACRO'
REGEX_CONSTANTS = {
    # Regex for matching Acronym Patterns -> COVID-19 / SEKA / SMY2 / EAP1 / SCP16 / ASC1 / DENV-2
    # 'ACRONYM': re.compile(r"[m0-9\W^]([A-Z]{2,})[s\.,:\-$]"),
    # (note: \b inside a character class matches a literal backspace, so plain \s is used instead)
    'ACRONYM': re.compile(r"^[A-Z]{2,}[\.,:;\s]|\sm?[A-Z]{2,}[\.,:;\s]"),
    # Regex for matching Years in the text -> 1995 / 2020 / 2019
    'CONTAINS_YEAR': re.compile(r"(?<![0-9])(?:1[89][0-9]{2}|20[0-2][0-9])(?![0-9])"),
    # Regex for matching Number Sequences in the text -> (15) / (10, 11, 112, 113) / (1,7,8,10-14)
    'SEQUENCE': re.compile(r"\(\d.*?\)"),
    # Regex for matching References in the text -> [4] / [ 10-17, 19, 20] / [123, 500]
    'REFERENCE': re.compile(r"\[\s*\d.*?\]"),
    # Regex for matching percentages in the text -> 99% / 99.99% / 10 % / 23.98% / 10-20% / 25%-30%
    'PERCENTAGE': re.compile(r"\d[\d\.\-]*\s?%"),
    # Regex for matching URLs -> http://www.phrap.org/, http://www. , http://carcfordjournals.
    'CONTAINS_URL': re.compile(r"https?://\S+"),
    'ENDS_WITH_RIDE': re.compile(r"ride\b"),
    'ENDS_WITH_RINE': re.compile(r"rine\b"),
    'ENDS_WITH_ETHYL': re.compile(r"ethyl\b")
}
CLASS_LABELS = {"background": 0, "method": 1, "result": 2}
CLASS_LABELS_LIST = ['background', 'method', 'result']
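As a quick illustration of how these patterns behave, the following sketch (sample sentence invented for illustration) prints which features fire on a given text:

import utils.constants as const

sample = "BLAST was run as in [4]; accuracy improved by 12.5% (Smith, 1998)."
for name, pattern in const.REGEX_CONSTANTS.items():
    if pattern.search(sample):
        print('feature fired ::', name)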

@ -1,5 +1,5 @@
import csv
from utils.models import DataInstance
from feature_extraction.features import extract_features_from_text
def read_csv_file(csv_file_path, delimiter='\t'):
@ -17,3 +17,21 @@ def read_csv_file(csv_file_path, delimiter='\t'):
    for row in file_data:
        data.append(DataInstance(row[0], row[2], row[3]))
    return data
class DataInstance:
    """
    Model Class for carrying Training and Testing data from tsv/csv file.
    Also carries the extracted features.
    """
    def __init__(self, r_id, text, true_label):
        self.did = r_id
        self.text = text
        self.true_label = true_label
        self.predicted_label = None
        self.features = extract_features_from_text(text)

    def print(self):
        print('\nTrue Label :: ', self.true_label, ' Text :: ', self.text)
        print('Features :: ', self.features)
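A minimal sketch of constructing a DataInstance directly (the row values here are invented for illustration):

inst = DataInstance('42', 'This method was proposed in [4].', 'method')
inst.print()  # prints the true label, the text, and the regex features extracted from it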

@ -0,0 +1,17 @@
class Citation(object):
    """ Class representing a citation object """

    def __init__(self,
                 text,
                 citing_paper_id,
                 cited_paper_id,
                 section_title=None,
                 intent=None,
                 citation_id=None
                 ):
        self.text = text
        self.citing_paper_id = citing_paper_id
        self.cited_paper_id = cited_paper_id
        self.section_title = section_title
        self.intent = intent
        self.citation_id = citation_id

@ -1,13 +0,0 @@
class DataInstance:
    """
    Model Class for carrying Training and Testing data from tsv/csv file
    """
    def __init__(self, r_id, text, true_label):
        self.did = r_id
        self.text = text
        self.true_label = true_label

    def print(self):
        print('True Label :: ', self.true_label, ' Text :: ', self.text)

@ -0,0 +1,57 @@
import numpy as np
from itertools import chain
from utils.csv import read_csv_file

# TODO: clean up, transform into class, allow for command-line arguments
def read_csv_nn(scicite_dir=None):
    train_file_path = 'data/tsv/train.tsv'
    test_file_path = 'data/tsv/test.tsv'
    train_raw = read_csv_file(train_file_path, '\t')
    test_raw = read_csv_file(test_file_path, '\t')
    # binary design matrix: one column per unique feature seen in training
    features = [x.features for x in train_raw]
    features_unique = list(set(chain.from_iterable(features)))
    nobs = len(features)
    nfeats = len(features_unique)
    X_train = np.zeros((nobs, nfeats))
    for j in range(nfeats):
        f = features_unique[j]
        for i in range(nobs):
            if f in features[i]:
                X_train[i, j] = 1
    # one-hot encode the training labels
    y_train_raw = np.array([x.true_label for x in train_raw])
    y_unique = sorted(list(set(y_train_raw)))
    y_dim = len(y_unique)
    y_train = np.zeros((nobs, y_dim))
    for j in range(y_dim):
        y_train[:, j] = y_train_raw == y_unique[j]
    # build the test matrix over the *training* feature space so that columns align
    features = [x.features for x in test_raw]
    nobs = len(features)
    X_test = np.zeros((nobs, nfeats))
    for j in range(nfeats):
        f = features_unique[j]
        for i in range(nobs):
            if f in features[i]:
                X_test[i, j] = 1
    y_test_raw = np.array([x.true_label for x in test_raw])
    y_test = np.zeros((nobs, y_dim))
    for j in range(y_dim):
        y_test[:, j] = y_test_raw == y_unique[j]
    return X_train, y_train, X_test, y_test
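A minimal usage sketch, assuming this helper lives in utils.reader (as the package __init__ above suggests) and that the script runs from the project root:

from utils.reader import read_csv_nn

X_train, y_train, X_test, y_test = read_csv_nn()
print(X_train.shape, y_train.shape)  # (n_train, n_features), (n_train, 3)
print(X_test.shape, y_test.shape)    # (n_test, n_features), (n_test, 3)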

@ -0,0 +1,33 @@
import numpy as np
from itertools import chain
from utils.csv import read_csv_file

def read_csv_nn_dev(scicite_dir=None):
    dev_file_path = 'data/tsv/dev.tsv'
    dev_raw = read_csv_file(dev_file_path, '\t')
    features = [x.features for x in dev_raw]
    # note: the feature space here is derived from the dev split itself,
    # so columns are not guaranteed to align with the training matrix
    features_unique = list(set(chain.from_iterable(features)))
    nobs = len(features)
    nfeats = len(features_unique)
    X_dev = np.zeros((nobs, nfeats))
    for j in range(nfeats):
        f = features_unique[j]
        for i in range(nobs):
            if f in features[i]:
                X_dev[i, j] = 1
    y_dev_raw = np.array([x.true_label for x in dev_raw])
    y_unique = sorted(list(set(y_dev_raw)))
    y_dim = len(y_unique)
    y_dev = np.zeros((nobs, y_dim))
    for j in range(y_dim):
        y_dev[:, j] = y_dev_raw == y_unique[j]
    return X_dev, y_dev

@ -0,0 +1,97 @@
from typing import Iterable
import jsonlines
from allennlp.data import Instance
from allennlp.data.dataset_readers import DatasetReader
from allennlp.data.fields import TextField, LabelField
from allennlp.data.token_indexers import SingleIdTokenIndexer, ELMoTokenCharactersIndexer
from allennlp.data.tokenizers import SpacyTokenizer
from overrides import overrides
from utils.data import Citation


@DatasetReader.register("citation_dataset_reader")  # type for config files
class CitationDataSetReader(DatasetReader):
    """
    We implement this CitationDataSetReader class by subclassing DatasetReader
    and overriding some of its methods.
    It reads the datasets (train|dev|test) and converts them to a collection of Instances.
    We use the default SpacyTokenizer for this project.
    The reader is registered so that config files can refer to this class by name.
    """

    def __init__(self):
        super().__init__()
        # default Spacy Tokenizer
        self.tokenizer = SpacyTokenizer()

    @overrides
    def _read(self, file_path: str) -> Iterable[Instance]:
        """
        Reads the JSON Lines file, tokenizes the text of each data point
        and yields Instances, each with tokens and a label.
        :param file_path: path to the dataset file
        :return: a generator of Instances
        """
        ds_reader = DataReaderJsonLines(file_path)
        for citation in ds_reader.read():
            yield self.text_to_instance(citation_text=citation.text, intent=citation.intent)

    @overrides
    def text_to_instance(self, citation_text: str,
                         intent: str) -> Instance:
        """
        :param citation_text: text from the data point
        :param intent: true label of the data instance
        :return: an Instance with tokens & label fields
        """
        citation_tokens = self.tokenizer.tokenize(citation_text)
        # use the ELMo token-characters indexer alongside single-id tokens
        token_indexers = {"elmo": ELMoTokenCharactersIndexer(),
                          "tokens": SingleIdTokenIndexer()}
        fields = {'tokens': TextField(citation_tokens, token_indexers),
                  'label': LabelField(intent)}
        return Instance(fields)


class DataReaderJsonLines:
    """
    Helper class for reading jsonl (JSON Lines) files
    """

    def __init__(self, file_path):
        self.file_path = file_path

    def read(self):
        """
        Opens the file, reads every line and yields the parsed citations
        :return: a generator of Citation objects
        """
        with jsonlines.open(self.file_path) as jl_reader:
            for line in jl_reader:
                yield read_json_line(line)


def read_json_line(line):
    """
    :param line: the json line as a dictionary
    :return: a Citation object
    """
    citation = Citation(
        text=line['string'],
        citing_paper_id=line['citingPaperId'],
        cited_paper_id=line['citedPaperId'],
        section_title=line['sectionName'],
        intent=line['label'],
        citation_id=line['id'])
    return citation
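A minimal usage sketch of the reader, assuming a scicite-style jsonl file at a hypothetical path:

reader = CitationDataSetReader()
for instance in reader.read('data/jsonl/dev.jsonl'):  # hypothetical path
    print(instance['label'].label, '::', instance['tokens'])
    break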