From 3c0e4a411d3fc578a773d25eaa7deea1191ee5fa Mon Sep 17 00:00:00 2001
From: Pavan Mandava <mspavan04@gmail.com>
Date: Sun, 17 May 2020 22:51:24 +0200
Subject: [PATCH 1/3] Added more regex features and random_state to the
 Classifier

---
 classifier/linear_model.py     | 10 +++++++---
 feature_extraction/features.py |  3 ++-
 feature_extraction/lexicons.py | 29 +++++++++++++++++------------
 testing/model_testing.py       | 10 ++++++----
 utils/constants.py             | 13 ++++++++++---
 5 files changed, 42 insertions(+), 23 deletions(-)

diff --git a/classifier/linear_model.py b/classifier/linear_model.py
index 4275a35..3de308a 100644
--- a/classifier/linear_model.py
+++ b/classifier/linear_model.py
@@ -85,14 +85,18 @@ class MultiClassPerceptron:
 
     """
 
-    def __init__(self, epochs: int = 2000, learning_rate: float = 1):
+    def __init__(self, epochs: int = 5000, learning_rate: float = 1, random_state: int = 4):
         """
         :type epochs: int
         :type learning_rate: float
+        :type random_state: int
 
         :param epochs: number of training iterations
         :param learning_rate: learning rate for updating weights, Default is 1
+        :param random_state: random state for shuffling the data, useful for reproducing the results.
+                    Default is 4.
         """
+        self.random_state = random_state
         self.perceptron_dict = OrderedDict()  # contains Key : label and value : Perceptron Object for label
         self.epochs = epochs
         self.learning_rate = learning_rate
@@ -124,7 +128,7 @@ class MultiClassPerceptron:
 
         # Dictionary for storing label->Perceptron() objects, Create a new Perceptron object for each label
         for label in labels:
-            self.perceptron_dict[label] = Perceptron(label, get_sample_weights_with_features(-0.5), -0.5)
+            self.perceptron_dict[label] = Perceptron(label, get_sample_weights_with_features(theta_bias=-0.5), theta_bias=-0.5)
 
         next_print = int(self.epochs/10)
 
@@ -160,7 +164,7 @@ class MultiClassPerceptron:
                 self.perceptron_dict[inst.true_label].update_weights(inst.features, self.learning_rate, reward=True)
 
             # It's important to shuffle the data during every epoch
-            random.shuffle(X_train)
+            random.Random(self.random_state).shuffle(X_train)
 
     def predict(self, X_test: list):
         """
diff --git a/feature_extraction/features.py b/feature_extraction/features.py
index 3b2e360..d6241f7 100644
--- a/feature_extraction/features.py
+++ b/feature_extraction/features.py
@@ -5,7 +5,8 @@ from utils.constants import REGEX_CONSTANTS
 FEATURE_LIST = ['COMPARE', 'CONTRAST', 'RESULT', 'INCREASE', 'CHANGE', 'USE', 'PRESENT',
                 'IMPORTANT', 'RESEARCH', 'APPROACH', 'PUBLIC', 'BEFORE', 'BETTER_SOLUTION',
                 'PROFESSIONALS', 'MEDICINE', 'MATH', 'COMPUTER_SCIENCE', 'CITATION',
-                'ACRONYM', 'CONTAINS_YEAR', 'SEQUENCE', 'REFERENCE', 'PERCENTAGE', 'URL']
+                'ACRONYM', 'CONTAINS_YEAR', 'SEQUENCE', 'REFERENCE', 'PERCENTAGE',
+                'CONTAINS_URL', 'ENDS_WITH_RIDE', 'ENDS_WITH_RINE', 'ENDS_WITH_ETHYL']
 
 """ Feature Name for Theta Bias -- need to add it to the list of features for all data instances """
 THETA_BIAS_FEATURE = 'THETA_BIAS'
diff --git a/feature_extraction/lexicons.py b/feature_extraction/lexicons.py
index feb89db..5326eda 100644
--- a/feature_extraction/lexicons.py
+++ b/feature_extraction/lexicons.py
@@ -4,7 +4,7 @@ Dictionary of Lexicons used for Feature Extraction
 ALL_LEXICONS = {
 
     'COMPARE': ['compar', 'compet', 'evaluat', 'test', 'superior', 'inferior', 'better', 'best', 'good', 'low',
-                'worse', 'worst', 'greater', 'larger', 'faster', 'high', 'measur', 'between', 'another', 'similar'],
+                'wors', 'great', 'larger', 'faster', 'high', 'measur', 'between', 'another', 'similar'],
 
     'CONTRAST': ['contrast', 'different' 'distinct', 'conflict', 'disagree', 'oppose', 'distinguish', 'contrary'],
 
@@ -19,7 +19,8 @@ ALL_LEXICONS = {
     'USE': ['use', 'using', 'apply', 'applied', 'employ', 'make use', 'utilize', 'implement'],
 
     'PRESENT': ['describe', 'discuss', 'give', 'introduce', 'note', 'notice', 'present', 'propose', 'recapitulate',
-                'demonstrate', 'remark', 'report', 'say', 'show', 'sketch', 'state', 'suggest', 'figure'],
+                'demonstrate', 'remark', 'report', 'say', 'show', 'sketch', 'state', 'suggest', 'figure', 'indicate',
+                'specify', 'explain'],
 
     'IMPORTANT': ['important', 'main', 'key', 'basic', 'central', 'crucial', 'critical', 'essential', 'fundamental',
                   'great', 'largest', 'major', 'overall', 'primary', 'principle', 'serious', 'substantial', 'ultimate',
@@ -28,7 +29,8 @@ ALL_LEXICONS = {
     'RESEARCH': ['research', 'paper', 'study', 'studie', 'apply', 'analyze', 'characteri', 'formali', 'investigat',
                  'implement', 'interpret', 'examin', 'observ', 'predict', 'verify', 'work on', 'empirical', 'determin',
                  'experiment', 'exploratory', 'ongoing', 'quantitative', 'qualitative', 'preliminary', 'statistical',
-                 'knowledge', 'underway', 'discuss', 'reference', 'publish', 'document', 'orientation'],
+                 'knowledge', 'underway', 'discuss', 'reference', 'publish', 'document', 'orientation',
+                 'literature', 'experience'],
 
     'APPROACH': ['approach', 'account', 'algorithm', 'analys', 'approach', 'application', 'architecture', 'characteri',
                  'component', 'design', 'extension', 'formali', 'framework', 'implement', 'investigat', 'machine',
@@ -47,22 +49,25 @@ ALL_LEXICONS = {
                       'expert', 'investigators', 'linguists', 'philosophers', 'psycholinguists',
                       'psychologists', 'researchers', 'scholars', 'semanticists', 'scientists'],
 
-    'MEDICINE': ['medicine', 'tissue', 'gene', 'inflammatory', 'mutant', 'neuro', 'digest', 'ortho', 'kinase', 'pneumonia',
-                 'clinical', 'therap', 'kidney', 'receptor', 'cancer', 'synthesis', 'protein', 'syndrom', 'toxin', 'death', 'calcium',
-                 'pharma', 'heart', 'disease', 'vitamin', 'tumor', 'blind', 'symptom', 'medical', 'vaccin', 'molecule', 'rna',
+    'MEDICINE': ['medicine', 'tissue', 'gene', 'inflammatory', 'mutant', 'neuro', 'digest', 'ortho', 'kinase',
+                 'clinical', 'therap', 'kidney', 'receptor', 'cancer', 'synthesis', 'protein', 'syndrom', 'toxin', 'death',
+                 'pharma', 'heart', 'disease', 'vitamin', 'tumor', 'blind', 'symptom', 'medical', 'vaccin', 'molecule',
                  'biotic', 'patient', 'cells', 'immune', 'blood', 'plasma', 'diagnos', 'neura', 'reproductive', 'plasm', 'drug',
                  'membrane', 'muscle', 'contagious', 'inflam', 'physician', 'dna', 'genome', 'bacteria', 'cavity', 'injury',
-                 'antibodies', 'liver', 'treatment', 'pcr', 'acid', 'chronic', 'respirat', 'oxygen', 'stroke', 'antioxidant',
-                 'metabolic', 'transmission', 'endogenous', 'syndrome', 'ultrasound', 'pathogen'],
+                 'antibodies', 'liver', 'treatment', 'pcr', 'acid', 'chronic', 'respirat', 'oxygen', 'stroke', 'antioxidant', 'obesity',
+                 'metabolic', 'transmission', 'endogenous', 'syndrome', 'ultrasound', 'pathogen', 'inject', 'laparoscop',
+                 'circulat', 'ventricle', 'tract', 'pneumonia', 'calcium',  'rna', 'organism', 'biolog', 'x-ray'],
 
     'MATH': ['matrix', 'gaussian', 'variance', 'radius', 'function', 'comput', 'once', 'twice', 'thrice', 'diagram', 'mean',
              'vector', 'rectangle', 'logic', 'amount', 'maxim', 'minim', 'linear', 'magnitude', 'theorem', 'gradient', 'median',
              'exponential', 'complex', 'graph', 'mean', 'equation', 'offset', 'calculat', 'coefficient', 'discrete', 'equation',
-             'math', 'correlation', 'outcome', 'divergence', 'differentiation', 'statistic', 'parameter', 'probabilit', 'multivariate'],
+             'frequen', 'math', 'correlation', 'outcome', 'divergence', 'differentiation', 'statistic', 'parameter',
+             'probabilit', 'multivariate', 'negative', 'positive', 'regression', 'digit'],
 
-    'COMPUTER_SCIENCE': ['database', 'software', 'evaluation', 'framework', 'computer', 'network', 'algorithm',
-                         'dataset','data sets', 'technology', 'kernel', 'metrics', 'nlp', 'xml', 'corpus', 'uml', 'system',
-                         'security', 'protocol'],
+    'COMPUTER_SCIENCE': ['database', 'software', 'evaluation', 'framework', 'computer', 'network',
+                         'algorithm', 'dataset','data sets', 'technology', 'kernel', 'metrics', 'nlp', 'xml',
+                         'corpus', 'uml', 'system', 'security', 'protocol', 'classification', 'data transform',
+                         'memory', 'java', 'python', 'cluster', 'epoch', 'training', 'deadlock', 'technique'],
 
     'CITATION': ['et al'],  # TODO (for Isaac) :: Write a complex regex for finding Citations in the text
 
diff --git a/testing/model_testing.py b/testing/model_testing.py
index ba3ac5f..b640f69 100644
--- a/testing/model_testing.py
+++ b/testing/model_testing.py
@@ -15,9 +15,9 @@ labels = set([inst.true_label for inst in X_train_inst])
 
 X_test_inst = read_csv_file(test_file_path, '\t')
 
-epochs = int(len(X_train_inst)*0.75)
+epochs = int(len(X_train_inst)*0.95)
 
-clf = MultiClassPerceptron(epochs, 1)
+clf = MultiClassPerceptron(epochs=epochs, learning_rate=1, random_state=10)
 
 clf.fit(X_train=X_train_inst, labels=list(labels))
 
@@ -25,7 +25,9 @@ y_test = clf.predict(X_test_inst)
 
 y_true = [inst.true_label for inst in X_test_inst]
 
-f1_score_list = f1_score(y_true, y_test, labels, const.AVG_MICRO)
+f1_score_micro = f1_score(y_true, y_test, labels, const.AVG_MICRO)
+f1_score_macro = f1_score(y_true, y_test, labels, const.AVG_MACRO)
+f1_score_none = f1_score(y_true, y_test, labels, None)
 
-for result in f1_score_list:
+for result in f1_score_micro+f1_score_macro+f1_score_none:
     result.print_result()
diff --git a/utils/constants.py b/utils/constants.py
index 4318131..8a55fdb 100644
--- a/utils/constants.py
+++ b/utils/constants.py
@@ -7,14 +7,14 @@ AVG_MACRO = 'MACRO'
 REGEX_CONSTANTS = {
 
     # Regex for matching Acronym Patterns -> COVID-19 / SEKA / SMY2 / EAP1 / SCP16 / ASC1 / DENV-2
-    #'ACRONYM': re.compile(r"[m0-9\W^]([A-Z]{2,})[s\.,:\-$]"),
+    # 'ACRONYM': re.compile(r"[m0-9\W^]([A-Z]{2,})[s\.,:\-$]"),
     'ACRONYM': re.compile(r"^[A-Z]{2,}[\.,:;\b\s]|[\s\b]m?[A-Z]{2,}[\.,:;\b\s]"),
 
     # Regex for matching Years in the text - > 1995 / 2020 / 2019
     'CONTAINS_YEAR': re.compile(r"(?<=[^0-9])1[8-9][0-9]{2}(?=[^0-9$])|(?<=[^0-9])20[0-2][0-9](?=[^0-9$])"),
 
     # Regex for matching Number Sequences in the text -> (15) / (10, 11, 112, 113) / (1,7,8,10-14)
-    'SEQUENCE': re.compile(r"\([\d.*\)"),
+    'SEQUENCE': re.compile(r"\(\d.*\)"),
 
     # Regex for matching References in the text -> [4] / [ 10-17, 19, 20] / [123, 500]
     'REFERENCE': re.compile(r"\[\d.*\]"),
@@ -24,5 +24,12 @@ REGEX_CONSTANTS = {
 
     # Regex for matching URLs -> http://www.phrap.org/, http://www. , http://carcfordjournals. ,
     # https://www.ims.uni-stuttgart.de/
-    'URL': re.compile(r"https?://\S+")#...\S+(?=\.?,?:?[\s\"$])")
+    'CONTAINS_URL': re.compile(r"https?://\S+"),
+
+    'ENDS_WITH_RIDE': re.compile(r"ride\b"),
+
+    'ENDS_WITH_RINE': re.compile(r"rine\b"),
+
+    'ENDS_WITH_ETHYL': re.compile(r"ethyl\b")
+
 }

From 6575ba09523391e78ec2323e3c551dac3f77fc09 Mon Sep 17 00:00:00 2001
From: Pavan Mandava <mspavan04@gmail.com>
Date: Mon, 18 May 2020 00:13:22 +0200
Subject: [PATCH 2/3] Random state still not working

---
 classifier/linear_model.py | 33 +++++++++++++++++++++------------
 testing/model_testing.py   |  2 +-
 utils/constants.py         |  3 +--
 3 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/classifier/linear_model.py b/classifier/linear_model.py
index 3de308a..6e25f69 100644
--- a/classifier/linear_model.py
+++ b/classifier/linear_model.py
@@ -62,9 +62,9 @@ class Perceptron:
         for feature in features:
             feature_weight = self.weights[feature]
             if penalize:
-                self.weights[feature] = feature_weight - (learning_rate * 1)
+                self.weights[feature] = round(feature_weight - (learning_rate * 1), 5)
             if reward:
-                self.weights[feature] = feature_weight + (learning_rate * 1)
+                self.weights[feature] = round(feature_weight + (learning_rate * 1), 5)
 
 
 class MultiClassPerceptron:
@@ -85,7 +85,7 @@ class MultiClassPerceptron:
 
     """
 
-    def __init__(self, epochs: int = 5000, learning_rate: float = 1, random_state: int = 4):
+    def __init__(self, epochs: int = 5000, learning_rate: float = 1, random_state: int = 42):
         """
         :type epochs: int
         :type learning_rate: float
@@ -94,7 +94,7 @@ class MultiClassPerceptron:
         :param epochs: number of training iterations
         :param learning_rate: learning rate for updating weights, Default is 1
         :param random_state: random state for shuffling the data, useful for reproducing the results.
-                    Default is 4.
+                    Default is 42.
         """
         self.random_state = random_state
         self.perceptron_dict = OrderedDict()  # contains Key : label and value : Perceptron Object for label
@@ -128,10 +128,14 @@ class MultiClassPerceptron:
 
         # Dictionary for storing label->Perceptron() objects, Create a new Perceptron object for each label
         for label in labels:
-            self.perceptron_dict[label] = Perceptron(label, get_sample_weights_with_features(theta_bias=-0.5), theta_bias=-0.5)
+            sample_weights = get_sample_weights_with_features(theta_bias=0.9, random_state=self.random_state)
+            self.perceptron_dict[label] = Perceptron(label, sample_weights, theta_bias=0.9)
 
         next_print = int(self.epochs/10)
 
+        random.seed(self.random_state)
+        random_list = [random.randint(0, train_len-1) for i in range(self.epochs)]
+
         # Training Iterations
         for epoch in range(self.epochs):
 
@@ -139,10 +143,8 @@ class MultiClassPerceptron:
                 print('Training Multi-Class Perceptron Classifier..... (', epoch, '/', self.epochs, ')')
                 next_print = next_print + int(self.epochs/10)
 
-            # get a random number within the size of training set
-            rand_num = random.randint(0, train_len-1)
-            # pick a random data instance with the generated random number
-            inst = X_train[rand_num]
+            # Pick a number from random list
+            inst = X_train[random_list[epoch]]
 
             perceptron_scores = []  # list for storing perceptron scores for each label
             for label, perceptron in self.perceptron_dict.items():
@@ -163,7 +165,7 @@ class MultiClassPerceptron:
                 # increase weights
                 self.perceptron_dict[inst.true_label].update_weights(inst.features, self.learning_rate, reward=True)
 
-            # It's important to shuffle the data during every epoch
+            # It's important to shuffle the list during every epoch
             random.Random(self.random_state).shuffle(X_train)
 
     def predict(self, X_test: list):
@@ -196,15 +198,22 @@ class MultiClassPerceptron:
         return y_test
 
 
-def get_sample_weights_with_features(theta_bias: float = None):
+def get_sample_weights_with_features(theta_bias: float = None, random_state: int = 42):
     """
     This function creates a dictionary with feature as a key and a random floating number (feature weight) as value.
     Weights for each feature is a floating number between -1 and 1
 
+    :type theta_bias: float
+    :type random_state: int
+
+    :param theta_bias: value of theta bias variable
+    :param random_state: random seed number for reproducing the results
+
     :return: returns a dictionary of random weights for each feature
     """
     weights = {THETA_BIAS_FEATURE: theta_bias}
+    random.seed(random_state)
     for feature in FEATURE_LIST:
-        weights[feature] = round(random.uniform(-1.0, 1.0), 4)
+        weights[feature] = round(random.uniform(-1.0, 1.0), 5)
 
     return weights
diff --git a/testing/model_testing.py b/testing/model_testing.py
index b640f69..59f87b6 100644
--- a/testing/model_testing.py
+++ b/testing/model_testing.py
@@ -17,7 +17,7 @@ X_test_inst = read_csv_file(test_file_path, '\t')
 
 epochs = int(len(X_train_inst)*0.95)
 
-clf = MultiClassPerceptron(epochs=epochs, learning_rate=1, random_state=10)
+clf = MultiClassPerceptron(epochs=epochs, learning_rate=1, random_state=42)
 
 clf.fit(X_train=X_train_inst, labels=list(labels))
 
diff --git a/utils/constants.py b/utils/constants.py
index 8a55fdb..7476fb6 100644
--- a/utils/constants.py
+++ b/utils/constants.py
@@ -22,8 +22,7 @@ REGEX_CONSTANTS = {
     # Regex for matching percentages in the text -> 99% / 99.99% / 10 % / 23.98% / 10-20% / 25%-30%
     'PERCENTAGE': re.compile(r"\d[\d\.\-]+%"),
 
-    # Regex for matching URLs -> http://www.phrap.org/, http://www. , http://carcfordjournals. ,
-    # https://www.ims.uni-stuttgart.de/
+    # Regex for matching URLs -> http://www.phrap.org/, http://www. , http://carcfordjournals.
     'CONTAINS_URL': re.compile(r"https?://\S+"),
 
     'ENDS_WITH_RIDE': re.compile(r"ride\b"),

From 3eb3f0f35e9f35e14ae4697970dfaee0b927cd6e Mon Sep 17 00:00:00 2001
From: Pavan Mandava <mspavan04@gmail.com>
Date: Mon, 18 May 2020 09:57:08 +0200
Subject: [PATCH 3/3] Change theta bias to -0.25; tune epochs and learning rate

---
 classifier/linear_model.py |  4 ++--
 testing/model_testing.py   | 21 +++++++++++++++------
 2 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/classifier/linear_model.py b/classifier/linear_model.py
index 6e25f69..5c86f92 100644
--- a/classifier/linear_model.py
+++ b/classifier/linear_model.py
@@ -128,8 +128,8 @@ class MultiClassPerceptron:
 
         # Dictionary for storing label->Perceptron() objects, Create a new Perceptron object for each label
         for label in labels:
-            sample_weights = get_sample_weights_with_features(theta_bias=0.9, random_state=self.random_state)
-            self.perceptron_dict[label] = Perceptron(label, sample_weights, theta_bias=0.9)
+            sample_weights = get_sample_weights_with_features(theta_bias=-0.25, random_state=self.random_state)
+            self.perceptron_dict[label] = Perceptron(label, sample_weights, theta_bias=-0.25)
 
         next_print = int(self.epochs/10)
 
diff --git a/testing/model_testing.py b/testing/model_testing.py
index 59f87b6..aa10bee 100644
--- a/testing/model_testing.py
+++ b/testing/model_testing.py
@@ -8,26 +8,35 @@ project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 train_file_path = project_root+'/data/tsv/train.tsv'
 test_file_path = project_root+'/data/tsv/test.tsv'
 
-
+# Read the training dataset
 X_train_inst = read_csv_file(train_file_path, '\t')
 
+# set of labels from Training data
 labels = set([inst.true_label for inst in X_train_inst])
 
+# Read test data set
 X_test_inst = read_csv_file(test_file_path, '\t')
 
-epochs = int(len(X_train_inst)*0.95)
+# number of training iterations
+epochs = int(len(X_train_inst)*0.9)
 
-clf = MultiClassPerceptron(epochs=epochs, learning_rate=1, random_state=42)
+# create MultiClassPerceptron classifier object
+clf = MultiClassPerceptron(epochs=epochs, learning_rate=0.9, random_state=42)
 
+# train the model
 clf.fit(X_train=X_train_inst, labels=list(labels))
 
+
+# predict
 y_test = clf.predict(X_test_inst)
 
 y_true = [inst.true_label for inst in X_test_inst]
 
+# Model Evaluation
 f1_score_micro = f1_score(y_true, y_test, labels, const.AVG_MICRO)
-f1_score_macro = f1_score(y_true, y_test, labels, const.AVG_MACRO)
-f1_score_none = f1_score(y_true, y_test, labels, None)
+# f1_score_macro = f1_score(y_true, y_test, labels, const.AVG_MACRO)
+# f1_score_none = f1_score(y_true, y_test, labels, None)
 
-for result in f1_score_micro+f1_score_macro+f1_score_none:
+# Print F1 Score
+for result in f1_score_micro:
     result.print_result()