From 89f6cfdf88ebc165c079a56edf644cd967ea454a Mon Sep 17 00:00:00 2001
From: Pavan Mandava
Date: Fri, 15 May 2020 21:42:37 +0200
Subject: [PATCH] Perceptron and Multi-Class Perceptron done

---
 classifier/linear_model.py     | 256 +++++++++++++++++++++++----------
 feature_extraction/features.py |  11 +-
 testing/model_testing.py       |   3 +
 utils/models.py                |   1 +
 4 files changed, 196 insertions(+), 75 deletions(-)
 create mode 100644 testing/model_testing.py

diff --git a/classifier/linear_model.py b/classifier/linear_model.py
index d0d538b..bca1494 100644
--- a/classifier/linear_model.py
+++ b/classifier/linear_model.py
@@ -1,86 +1,198 @@
-# initialization procedure: https://towardsdatascience.com/weight-initialization-techniques-in-neural-networks-26c649eb3b78
+from utils.models import DataInstance
+from feature_extraction.features import FEATURE_LIST, THETA_BIAS_FEATURE
+from collections import OrderedDict
+import random
+
+
 class Perceptron:
-    def __init__(self, label, input_dim, output_dim, step_size, num_classes=1):
-        self.classifier_label = label
-        self.input_len = input_dim
-        self.output_len = output_dim
-        self.sigmoid = lambda z : 1/(1+exp(-z))
-        self.num_classes = num_classes
-        self.multi_class = num_classes > 1
-        self.vexp = np.vectorize(exp)
+    """
+    The Perceptron is an algorithm for supervised learning of binary classifiers:
+    it decides whether or not an input (a set of features) belongs to a specific
+    class. It is a linear classifier that makes its predictions by combining a
+    set of weights with the feature vector.
+    """
+
+    def __init__(self, label: str, weights: dict, theta_bias: float):
+        """
+        :type label: str
+        :type weights: dict
+        :type theta_bias: float
+
+        :param label: label of this Perceptron classifier (useful when dealing
+                      with the Multi-Class Perceptron)
+        :param weights: dictionary mapping each feature name to its (randomly
+                        initialized) weight
+        :param theta_bias: value of the theta bias variable, i.e. the threshold weight
+        """
+        self.label = label
+        self.weights = weights
+        self.theta_bias = theta_bias
+
+    def score(self, features: list):
+        """
+        Takes a list of features and computes the score by summing the weights
+        that correspond to those features.
+
+        :type features: list
+        :param features: list of features from a DataInstance
+        :return: the computed score
+        """
+        score_val = 0
+        for feature in features:
+            score_val += self.weights[feature]
+        return score_val
 
-    def fit(self, X, y, weights=None, step_size=0.01, batch_size=10):
+    def update_weights(self, features: list, learning_rate: float = 1, penalize: bool = False, reward: bool = False):
         """
-        initializes training data and hyperparameters
+        Updates the weights while training the Perceptron classifier. Takes a
+        list of features and increases or decreases the weight of each of those
+        features by the learning rate.
+
+        :param features: list of features from the input DataInstance
+        :param learning_rate: size of each weight update; default is 1
+        :param penalize: if True, decreases the weight of each feature; default is False
+        :param reward: if True, increases the weight of each feature; default is False
+
+        - If both penalize and reward are False, the weights are not updated.
+        - If both penalize and reward are True, the two updates cancel out and
+          the weights remain the same.
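+
+        A rough doctest-style sketch of score() and update_weights(); the
+        weights below are arbitrary values chosen only for illustration:
+
+        >>> p = Perceptron('RESULT', {'CITATION': 0.5, 'URL': 0.25}, -0.5)
+        >>> p.score(['CITATION', 'URL'])
+        0.75
+        >>> p.update_weights(['CITATION'], learning_rate=0.5, penalize=True)
+        >>> p.weights['CITATION']
+        0.0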
+        """
-        # init weights and step_size
-        assert X.shape[0] == y.shape[0]
-        self.train_nobs = X.shape[0]
-        if weights not None:
-            self.W = weights
-        else:
-            self.W = np.random.randn(self.input_len, self.num_classes)*sqrt(2/(1+self.input_len))
-        self.step_size = step_size
-        self.batch_size = batch_size
-        self. shuffler = np.random.randn(self.train_nobs)
-        self.X = X[self.shuffler]
-        self.y = y[self.shuffler]
-
-
-
-    def predict(self, X):
+
+        for feature in features:
+            if penalize:
+                self.weights[feature] -= learning_rate
+            if reward:
+                self.weights[feature] += learning_rate
+
+
+class MultiClassPerceptron:
+    """
+    The Perceptron is a binary classifier: it can only separate two classes.
+    When each data instance can belong to one of several classes, a Multi-Class
+    Perceptron can be used instead.
+
+    The Multi-Class Perceptron creates one Perceptron classifier per label.
+    During training it takes the score for each label (from that label's
+    Perceptron classifier), and the label with the highest score is the
+    predicted label.
+
+    If the predicted label differs from the true label of the data instance,
+    this model updates the weights as follows:
+    - decrease the weights of the Perceptron classifier of the predicted label (penalize)
+    - increase the weights of the Perceptron classifier of the true label (reward)
+
+    This model also shuffles the training data after each epoch.
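+
+    A rough usage sketch (the label names are invented here, and train_instances /
+    test_instances stand in for lists of DataInstance objects):
+
+        clf = MultiClassPerceptron(epochs=500, learning_rate=1)
+        clf.fit(train_instances, ['BACKGROUND', 'METHOD', 'RESULT'])
+        predictions = clf.predict(test_instances)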
+
+    """
+
+    def __init__(self, epochs: int = 2000, learning_rate: float = 1):
         """
-        takes a test set and returns predictions
+        :type epochs: int
+        :type learning_rate: float
+
+        :param epochs: number of training iterations
+        :param learning_rate: learning rate for the weight updates; default is 1
         """
-        if self.multi_class:
-            return self.softmax(X.dot(self.W))
-        else:
-            return self.sigmoid(X.dot(self.W))
+        self.perceptron_dict = OrderedDict()  # maps each label to the Perceptron object for that label
+        self.epochs = epochs
+        self.learning_rate = learning_rate
 
-    def train(self, num_epochs=1, cost_funct='cross_ent'):
+    def fit(self, X_train: list, labels: list):
         """
-        implements backpropagation algorithm
+        Takes the training data and the list of labels and trains the model.
+
+        :type X_train: list[DataInstance]
+        :type labels: list[str]
+
+        :param X_train: list of training DataInstances
+        :param labels: list of classes
         """
-        batches = [(n,n+self.batch_size) for n in range(self.input_len)]
-        for a,b in batches:
-            XW = X.dot(self.W)
-            preds = self.predict(self.X[a:b])
-            #cost = self.cost(self.y[a:b], preds, funct=cost_funct)
-            cost_deriv = preds - self.y
-            self.W = self.W - self.step_size *
-            if self.multi_class:
-                act_deriv = self.soft_deriv(XW)
-            else:
-                act_deriv = self.sigmoid(XW)(1-self.sigmoid(XW))
-            update = X.dot(act_deriv).dot(cost_deriv)
-            self.W = self.W - self.step_size * update
-
-
-    def softmax(self, vector):
-        denom = np.sum(self.vexp(vector))
-        return np.array(self.vexp(exp))/denom
-
-    def cost(self, y, yhat, funct='cross_ent'):
-        if funct == 'cross_ent':
-            return np.sum(np.vectorize(log)(yhat) * y)
-
-    def soft_deriv(self, inputs):
-        size = max(*inputs.shape)
-        deriv = np.zeros((size,size))
-        for i in range(size):
-            for j in range(size):
-                if i==j:
-                    deriv[i,j] = self.sigmoid(inputs[j])(1-self.sigmoid(inputs[i]))
-                else:
-                    deriv[i, j] = -self.sigmoid(inputs[j]) * self.sigmoid(inputs[i])
-        return deriv
-
-#class MultiClassPerceptron(Perceptron):
-
-#    def __init__(self):
-#        pass
+
+        # Check whether the labels parameter is empty and raise an exception
+        if labels is None or len(labels) <= 0:
+            raise ValueError('The labels parameter must contain at least one label')
+
+        # Check whether the training data is empty and raise an exception
+        if X_train is None or len(X_train) <= 0:
+            raise ValueError('Training data cannot be empty')
+
+        # Check the data type of the training instances
+        if not isinstance(X_train, list) or not isinstance(X_train[0], DataInstance):
+            raise TypeError('Training data must be a list of DataInstance objects')
+
+        train_len = len(X_train)
+
+        # Create a new Perceptron object for each label and store it in the label -> Perceptron dictionary
+        for label in labels:
+            self.perceptron_dict[label] = Perceptron(label, get_sample_weights_with_features(-0.5), -0.5)
+
+        # Training iterations
+        for epoch in range(self.epochs):
+
+            # pick a random index into the training set; randint is inclusive
+            # on both ends, so the upper bound must be train_len - 1
+            rand_num = random.randint(0, train_len - 1)
+            # pick the data instance at the generated random index
+            inst = X_train[rand_num]
+
+            perceptron_scores = []  # list storing the perceptron score for each label
+            for label, perceptron in self.perceptron_dict.items():
+                perceptron_scores.append(perceptron.score(inst.features))
+
+            # find the max score in the list of scores
+            max_score = max(perceptron_scores)
+
+            # find the label that corresponds to the max score
+            label_max_score = labels[perceptron_scores.index(max_score)]
+
+            # if the label with the max score differs from the true label of this data
+            # instance, decrease the weights (penalize) of the Perceptron of the label
+            # with the max score and increase the weights (reward) of the Perceptron
+            # of the instance's true label
+            if inst.true_label != label_max_score:
+                # decrease weights
+                self.perceptron_dict[label_max_score].update_weights(inst.features, self.learning_rate, penalize=True)
+                # increase weights
+                self.perceptron_dict[inst.true_label].update_weights(inst.features, self.learning_rate, reward=True)
+
+            # It is important to shuffle the data during every epoch
+            random.shuffle(X_train)
+
+    def predict(self, X_test: list):
+        """
+        Takes a list of test instances and predicts a label for each one.
+
+        Takes the score from each Perceptron classifier; the label with the
+        highest score is the predicted label.
+
+        :param X_test: list of test DataInstances
+        :return: list of predicted labels
+        """
+
+        if X_test is None or len(X_test) <= 0:
+            raise ValueError('Testing data cannot be empty')
+
+        y_test = []
+        labels = list(self.perceptron_dict.keys())
+        for test_inst in X_test:
+            perceptron_scores = []  # list storing the perceptron score for each label
+            for label in labels:
+                perceptron_scores.append(self.perceptron_dict[label].score(test_inst.features))
+            # find the max score in the list of scores
+            max_score = max(perceptron_scores)
+
+            label_max_score = labels[perceptron_scores.index(max_score)]
+            y_test.append(label_max_score)
+
+        return y_test
+
+
+def get_sample_weights_with_features(theta_bias: float = None):
+    """
+    Creates a dictionary with each feature name as key and a random float (the
+    feature weight) as value. The weight of each feature is a float between -1
+    and 1, rounded to 4 decimal places.
+
+    :param theta_bias: initial weight for the theta bias feature
+    :return: a dictionary of random weights for each feature
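+
+    A rough doctest-style sketch (only the theta bias entry is deterministic,
+    so that is all this example checks):
+
+    >>> sample = get_sample_weights_with_features(-0.5)
+    >>> sample[THETA_BIAS_FEATURE]
+    -0.5
+    >>> all(-1.0 <= sample[f] <= 1.0 for f in FEATURE_LIST)
+    True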
+    """
+    weights = {THETA_BIAS_FEATURE: theta_bias}
+    for feature in FEATURE_LIST:
+        weights[feature] = round(random.uniform(-1.0, 1.0), 4)
+
+    return weights
diff --git a/feature_extraction/features.py b/feature_extraction/features.py
index d33d7a6..3daa6e6 100644
--- a/feature_extraction/features.py
+++ b/feature_extraction/features.py
@@ -6,6 +6,9 @@ FEATURE_LIST = ['COMPARE', 'CONTRAST', 'RESULT', 'INCREASE', 'CHANGE', 'USE', 'P
                 'APPROACH', 'PUBLIC', 'BEFORE', 'BETTER_SOLUTION', 'PROFESSIONALS', 'MEDICINE', 'MATH',
                 'CITATION', 'ACRONYM', 'CONTAINS_YEAR', 'SEQUENCE', 'REFERENCE', 'PERCENTAGE', 'URL']
 
+""" Feature name for the theta bias -- it must be added to the feature list of every data instance """
+THETA_BIAS_FEATURE = 'THETA_BIAS'
+
 
 def extract_features_from_text(text: str):
     """
@@ -18,12 +21,14 @@ def extract_features_from_text(text: str):
     # ALL_LEXICONS
     lexicon_dict = lexicons.ALL_LEXICONS
 
-    text_feature_list = []
+    # Initialize the feature list with the theta bias feature; it must be present in every data instance
+    text_feature_list = [THETA_BIAS_FEATURE]
+
     # Iterate through the list features and get list of words from the lexicon dictionary,
     # for each word in the word list, check if it appears in input text and add it to the text feature list
     for feature in FEATURE_LIST:
 
-        # If the feature is Regex Pattern Match, get the pattern from :`~feature_extraction.lexicons.ALL_LEXICONS`
+        # If the feature is a regex pattern match, get the pattern from :`~utils.constants.REGEX_CONSTANTS`
         # and match it with the input text
         if feature in REGEX_CONSTANTS:
            pattern = REGEX_CONSTANTS[feature]
 
               text_feature_list.append(feature)
               continue
 
-        # If the feature is not a Regex Pattern Match, then get the list of dictionary words from lexicon dictionary
+        # If the feature is not a regex pattern match, get the list of dictionary words from the lexicon dictionary
         word_list = lexicon_dict[feature]
         for word in word_list:
             if word in text.lower():
diff --git a/testing/model_testing.py b/testing/model_testing.py
new file mode 100644
index 0000000..f42fbe0
--- /dev/null
+++ b/testing/model_testing.py
@@ -0,0 +1,3 @@
+from classifier.linear_model import get_sample_weights_with_features
+
+print(get_sample_weights_with_features())
diff --git a/utils/models.py b/utils/models.py
index 9ac6326..0c816e3 100644
--- a/utils/models.py
+++ b/utils/models.py
@@ -11,6 +11,7 @@ class DataInstance:
         self.did = r_id
         self.text = text
         self.true_label = true_label
+        self.predicted_label = None
         self.features = extract_features_from_text(text)
 
     def print(self):