From 6410dda0cbe95c77032f99c831ded8ac32f2fa1a Mon Sep 17 00:00:00 2001 From: Pavan Mandava Date: Mon, 4 May 2020 00:56:23 +0200 Subject: [PATCH 1/4] Micro Averaging done --- eval/metrics.py | 29 +++++++++++++++++++++-------- testing/eval_testing.py | 11 ++++++++--- utils/csv.py | 0 3 files changed, 29 insertions(+), 11 deletions(-) create mode 100644 utils/csv.py diff --git a/eval/metrics.py b/eval/metrics.py index feacd0a..5ca2331 100644 --- a/eval/metrics.py +++ b/eval/metrics.py @@ -18,8 +18,9 @@ def f1_score(y_true, y_pred, labels, average): Use :func:`~eval.metrics.Result.print_result` to print F1 Score on the Console """ + # pr_list - list of dictionaries with precision, recall, TPs, FPs and FNs for each label + pr_list = get_precision_recall(y_true, y_pred, labels) if average is None or average == const.AVG_MACRO: - pr_list = get_precision_recall(y_true, y_pred, labels) f1_score_list = [] f1_sum = 0 for item in pr_list: @@ -28,17 +29,29 @@ def f1_score(y_true, y_pred, labels, average): f_score = calculate_f1_score(precision, recall) f1_sum += f_score if average is None: - f1_score_list.append(Result(precision, recall, average, item['label'], f_score)) + f1_score_list.append(Result(precision, recall, average, item['label'], round(f_score, 3))) if average is None: return f1_score_list elif average == const.AVG_MACRO: - return [Result(None, None, average, None, f1_sum / len(pr_list))] + return [Result(None, None, average, None, round(f1_sum / len(pr_list), 3))] elif average == const.AVG_MICRO: - print('test test test') - print("another test comment") - pass + aggregate_tp = 0 + aggregate_fp = 0 + aggregate_fn = 0 + + for item in pr_list: + aggregate_tp += item['tp'] + aggregate_fp += item['fp'] + aggregate_fn += item['fn'] + + # find precision and recall for aggregate TP, FP & FN + agg_precision = get_precision(aggregate_tp, aggregate_fp) + agg_recall = get_recall(aggregate_tp, aggregate_fn) + + agg_f1_score = calculate_f1_score(agg_precision, agg_recall) + return [Result(agg_precision, agg_recall, average, None, round(agg_f1_score, 3))] return None @@ -63,7 +76,7 @@ def get_precision_recall(y_true, y_pred, labels=None): raise ValueError('Length of Gold standard labels and Predicted labels must be the same') all_labels = False - if labels is None or len(labels) is 0: + if labels is None or len(labels) == 0: # get the precision and recall for all the labels all_labels = True @@ -164,4 +177,4 @@ class Result: def print_result(self): """ Prints F1 Score""" - print('F1 Score :: ', self.f1_score, ' Label :: ', self.label) + print('F1 Score :: ', self.f1_score, ' Label :: ', self.label, ' Average :: ', self.average) diff --git a/testing/eval_testing.py b/testing/eval_testing.py index 89782fa..e6fa801 100644 --- a/testing/eval_testing.py +++ b/testing/eval_testing.py @@ -1,10 +1,15 @@ from eval.metrics import f1_score import utils.constants as const -y_true = ['positive', 'positive', 'negative', 'negative', 'positive', 'positive', 'negative', 'negative'] -y_pred = ['positive', 'negative', 'negative', 'positive', 'positive', 'negative', 'negative', 'negative'] +y_true = ['positive', 'positive', 'negative', 'negative', 'positive', 'positive', 'negative', 'negative', 'positive', 'positive', 'negative', 'negative', 'positive', 'positive', 'negative', 'negative'] +y_pred = ['positive', 'negative', 'negative', 'positive', 'positive', 'negative', 'negative', 'positive', 'positive', 'negative', 'negative', 'positive', 'positive', 'negative', 'negative', 'negative'] result_list = f1_score(y_true, 
y_pred, ['positive', 'negative'], const.AVG_MICRO) for result in result_list: - result.print_result() \ No newline at end of file + result.print_result() + +result_list = f1_score(y_true, y_pred, ['positive', 'negative'], const.AVG_MACRO) + +for result in result_list: + result.print_result() diff --git a/utils/csv.py b/utils/csv.py new file mode 100644 index 0000000..e69de29 From 3fe33ab51a64c303fffa40b7c874a699b9710ebb Mon Sep 17 00:00:00 2001 From: Pavan Mandava Date: Mon, 4 May 2020 09:04:55 +0200 Subject: [PATCH 2/4] Comparision with sklearn metrics done - testing --- eval/metrics.py | 6 +++--- testing/eval_testing.py | 5 +++++ 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/eval/metrics.py b/eval/metrics.py index 5ca2331..9719648 100644 --- a/eval/metrics.py +++ b/eval/metrics.py @@ -29,12 +29,12 @@ def f1_score(y_true, y_pred, labels, average): f_score = calculate_f1_score(precision, recall) f1_sum += f_score if average is None: - f1_score_list.append(Result(precision, recall, average, item['label'], round(f_score, 3))) + f1_score_list.append(Result(precision, recall, average, item['label'], round(f_score, 4))) if average is None: return f1_score_list elif average == const.AVG_MACRO: - return [Result(None, None, average, None, round(f1_sum / len(pr_list), 3))] + return [Result(None, None, average, None, round(f1_sum / len(pr_list), 4))] elif average == const.AVG_MICRO: aggregate_tp = 0 @@ -51,7 +51,7 @@ def f1_score(y_true, y_pred, labels, average): agg_recall = get_recall(aggregate_tp, aggregate_fn) agg_f1_score = calculate_f1_score(agg_precision, agg_recall) - return [Result(agg_precision, agg_recall, average, None, round(agg_f1_score, 3))] + return [Result(agg_precision, agg_recall, average, None, round(agg_f1_score, 4))] return None diff --git a/testing/eval_testing.py b/testing/eval_testing.py index e6fa801..acf2c7d 100644 --- a/testing/eval_testing.py +++ b/testing/eval_testing.py @@ -1,5 +1,6 @@ from eval.metrics import f1_score import utils.constants as const +from sklearn.metrics import f1_score as f1 y_true = ['positive', 'positive', 'negative', 'negative', 'positive', 'positive', 'negative', 'negative', 'positive', 'positive', 'negative', 'negative', 'positive', 'positive', 'negative', 'negative'] y_pred = ['positive', 'negative', 'negative', 'positive', 'positive', 'negative', 'negative', 'positive', 'positive', 'negative', 'negative', 'positive', 'positive', 'negative', 'negative', 'negative'] @@ -9,7 +10,11 @@ result_list = f1_score(y_true, y_pred, ['positive', 'negative'], const.AVG_MICRO for result in result_list: result.print_result() +print('SK Learn F1 Score (MICRO):: ', f1(y_true, y_pred, ['positive', 'negative'], average='micro')) + result_list = f1_score(y_true, y_pred, ['positive', 'negative'], const.AVG_MACRO) for result in result_list: result.print_result() + +print('SK Learn F1 Score (MACRO):: ', f1(y_true, y_pred, ['positive', 'negative'], average='macro')) From 0577f982a26e2f4a1516720c22d201e6fa8d1776 Mon Sep 17 00:00:00 2001 From: Pavan Mandava Date: Sun, 10 May 2020 18:05:25 +0200 Subject: [PATCH 3/4] Reading Train file done, fixed os path issues --- testing/eval_testing.py | 12 ++++++++++++ utils/csv.py | 19 +++++++++++++++++++ utils/models.py | 13 +++++++++++++ 3 files changed, 44 insertions(+) create mode 100644 utils/models.py diff --git a/testing/eval_testing.py b/testing/eval_testing.py index acf2c7d..b4353ef 100644 --- a/testing/eval_testing.py +++ b/testing/eval_testing.py @@ -1,6 +1,8 @@ from eval.metrics import f1_score import 
utils.constants as const from sklearn.metrics import f1_score as f1 +import os +from utils.csv import read_csv_file y_true = ['positive', 'positive', 'negative', 'negative', 'positive', 'positive', 'negative', 'negative', 'positive', 'positive', 'negative', 'negative', 'positive', 'positive', 'negative', 'negative'] y_pred = ['positive', 'negative', 'negative', 'positive', 'positive', 'negative', 'negative', 'positive', 'positive', 'negative', 'negative', 'positive', 'positive', 'negative', 'negative', 'negative'] @@ -18,3 +20,13 @@ for result in result_list: result.print_result() print('SK Learn F1 Score (MACRO):: ', f1(y_true, y_pred, ['positive', 'negative'], average='macro')) + + +project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + +train_file_path = project_root+'/data/tsv/train.tsv' +print(train_file_path) + +data = read_csv_file(csv_file_path=train_file_path, delimiter='\t') +for inst in data[:5]: + inst.print() diff --git a/utils/csv.py b/utils/csv.py index e69de29..51b32c2 100644 --- a/utils/csv.py +++ b/utils/csv.py @@ -0,0 +1,19 @@ +import csv +from utils.models import DataInstance + + +def read_csv_file(csv_file_path, delimiter='\t'): + """ + This function takes file path as an argument, reads the data file and + returns a list of DataInstance objects with text and true labels + + :param delimiter: Delimiter for the file. Default is Tab(\t) + :param csv_file_path: path to the TSV/CSV file + :return: returns a list of DataInstance class objects. + """ + with open(csv_file_path, 'r') as file: + file_data = csv.reader(file, delimiter=delimiter) + data = [] + for row in file_data: + data.append(DataInstance(row[0], row[2], row[3])) + return data diff --git a/utils/models.py b/utils/models.py new file mode 100644 index 0000000..f2a6753 --- /dev/null +++ b/utils/models.py @@ -0,0 +1,13 @@ + +class DataInstance: + """ + Model Class for carrying Training and Testing data from tsc/csv file + """ + + def __init__(self, r_id, text, true_label): + self.did = r_id + self.text = text + self.true_label = true_label + + def print(self): + print('True Label :: ', self.true_label, ' Text :: ', self.text) From 3455c34601c22679201154bd7169be224a8ba109 Mon Sep 17 00:00:00 2001 From: Pavan Mandava Date: Sun, 10 May 2020 19:14:24 +0200 Subject: [PATCH 4/4] Added Structure for Perceptron and Multi-Class Perceptron --- classifier/__init__.py | 0 classifier/linear_model.py | 17 +++++++++++++++++ 2 files changed, 17 insertions(+) create mode 100644 classifier/__init__.py create mode 100644 classifier/linear_model.py diff --git a/classifier/__init__.py b/classifier/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/classifier/linear_model.py b/classifier/linear_model.py new file mode 100644 index 0000000..99fd07c --- /dev/null +++ b/classifier/linear_model.py @@ -0,0 +1,17 @@ +class Perceptron: + + def __init__(self, label): + self.classifier_label = label + pass + + def fit(self, X, y, weights=None): + pass + + def predict(self, X): + pass + + +class MultiClassPerceptron: + + def __init__(self): + pass
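
A quick hand check of the averaging added in [PATCH 1/4] and compared against scikit-learn in [PATCH 2/4], using the 16-item y_true/y_pred lists from testing/eval_testing.py. The snippet below is a standalone recomputation that imports nothing from this repository; the expected values in the comments come from tallying TPs, FPs and FNs by hand for those two lists, not from a recorded program run:

# standalone check of micro vs. macro F1 for the eval_testing.py test vectors
from collections import Counter

y_true = ['positive', 'positive', 'negative', 'negative'] * 4
y_pred = ['positive', 'negative', 'negative', 'positive',
          'positive', 'negative', 'negative', 'positive',
          'positive', 'negative', 'negative', 'positive',
          'positive', 'negative', 'negative', 'negative']

counts = {label: Counter() for label in ('positive', 'negative')}
for t, p in zip(y_true, y_pred):
    if t == p:
        counts[t]['tp'] += 1
    else:
        counts[p]['fp'] += 1
        counts[t]['fn'] += 1

# per-label F1, then the unweighted mean (macro)
f1_per_label = []
for label, c in counts.items():
    precision = c['tp'] / (c['tp'] + c['fp'])
    recall = c['tp'] / (c['tp'] + c['fn'])
    f1_per_label.append(2 * precision * recall / (precision + recall))
macro_f1 = sum(f1_per_label) / len(f1_per_label)

# pooled counts over both labels, then a single F1 (micro)
tp = sum(c['tp'] for c in counts.values())   # 9  (positive 4, negative 5)
fp = sum(c['fp'] for c in counts.values())   # 7  (positive 3, negative 4)
fn = sum(c['fn'] for c in counts.values())   # 7  (positive 4, negative 3)
micro_p = tp / (tp + fp)
micro_r = tp / (tp + fn)
micro_f1 = 2 * micro_p * micro_r / (micro_p + micro_r)

print('micro F1 :: ', round(micro_f1, 4))   # 0.5625, i.e. plain accuracy (9 of 16 correct)
print('macro F1 :: ', round(macro_f1, 4))   # 0.5608, the mean of 8/15 and 10/17

The pooled totals (TP=9, FP=7, FN=7) are what the aggregate_tp, aggregate_fp and aggregate_fn counters in the new AVG_MICRO branch should end up with for this data. For single-label data scored over all labels, every mistake counts once as a false positive and once as a false negative, so pooled precision and recall both equal accuracy and micro F1 collapses to accuracy; the macro average instead weights both labels equally however often they occur.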
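
One small portability note on the scikit-learn comparison in [PATCH 2/4]: f1 is called with the label list as a third positional argument. That works on the scikit-learn releases current in 2020, but newer releases make these parameters keyword-only, so the calls may need the keyword spelling below (same f1, y_true and y_pred names as in testing/eval_testing.py):

print('SK Learn F1 Score (MICRO):: ', f1(y_true, y_pred, labels=['positive', 'negative'], average='micro'))
print('SK Learn F1 Score (MACRO):: ', f1(y_true, y_pred, labels=['positive', 'negative'], average='macro'))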
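
A note on the reader added in [PATCH 3/4]: read_csv_file builds each DataInstance from row[0], row[2] and row[3], so it assumes at least four tab-separated columns per line, with an id in column 0, the text in column 2 and the gold label in column 3. It also does not skip a header row, so a header line in train.tsv would come back as a DataInstance too. The layout of data/tsv/train.tsv is not shown in these patches, so the tiny file below is invented purely to illustrate that column assumption (the second column is ignored by the reader):

# hypothetical two-row TSV laid out the way read_csv_file indexes rows
import csv
import tempfile

from utils.csv import read_csv_file  # assumes the project root is on sys.path

rows = [
    ['1', 'ignored', 'the movie was great', 'positive'],
    ['2', 'ignored', 'the movie was terrible', 'negative'],
]
with tempfile.NamedTemporaryFile('w', suffix='.tsv', delete=False, newline='') as tmp:
    csv.writer(tmp, delimiter='\t').writerows(rows)

for inst in read_csv_file(csv_file_path=tmp.name, delimiter='\t'):
    inst.print()   # prints the true label and text of each row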
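
[PATCH 4/4] only adds the class structure; fit and predict are left as pass. Purely to illustrate the usual algorithm behind those method names, here is one way a binary perceptron with that interface could look. It assumes dense NumPy feature rows and labels encoded as +1/-1; those representation choices are mine, and nothing in the patch commits to them:

import numpy as np


class Perceptron:
    """Sketch only: binary perceptron over dense feature rows, labels in {+1, -1}."""

    def __init__(self, label):
        self.classifier_label = label
        self.weights = None
        self.bias = 0.0

    def fit(self, X, y, weights=None):
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)
        # start from the supplied weights if given, otherwise from zeros
        self.weights = np.zeros(X.shape[1]) if weights is None else np.asarray(weights, dtype=float)
        for _ in range(10):   # fixed number of passes, just for the sketch
            for xi, yi in zip(X, y):
                # classic perceptron rule: update only on a misclassified example
                if yi * (xi @ self.weights + self.bias) <= 0:
                    self.weights = self.weights + yi * xi
                    self.bias += yi
        return self

    def predict(self, X):
        scores = np.asarray(X, dtype=float) @ self.weights + self.bias
        return np.where(scores >= 0, 1, -1)


clf = Perceptron('positive').fit([[1, 0], [0, 1]], [1, -1])
print(clf.predict([[1, 0], [0, 1]]))   # -> [ 1 -1]

MultiClassPerceptron could then hold one such Perceptron per label (one-vs-rest) and predict the label whose classifier produces the largest score, but that is only one of several reasonable designs for the still-empty class.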