diff --git a/classifier/nn_ff.py b/classifier/nn_ff.py
index 177eff5..d333618 100644
--- a/classifier/nn_ff.py
+++ b/classifier/nn_ff.py
@@ -4,8 +4,10 @@
 Date: July 5th, 2020
 """
 import torch
-import os
-os.chdir('/Users/iriley/code/citation-analysis')
+#import os
+#os.chdir('/Users/iriley/code/citation-analysis')
+import sys
+sys.path.append('/Users/iriley/code/citation-analysis')
 from utils.nn_reader import read_csv_nn
 from utils.nn_reader2 import read_csv_nn_dev
 from sklearn.metrics import confusion_matrix
@@ -47,10 +49,31 @@
 if __name__=='__main__':
 
     X_train, y_train, X_test = read_csv_nn()
-    X_train = torch.FloatTensor(X_train)
+    # balance classes
+    yclass = np.array([(x[1]==1)+2*(x[2]==1) for x in y_train])
+    is0 = yclass==0
+    is1 = yclass==1
+    is2 = yclass==2
+    X0 = torch.FloatTensor(X_train[is0])
+    X1 = torch.FloatTensor(X_train[is1])
+    X2 = torch.FloatTensor(X_train[is2])
+    y0 = torch.LongTensor(np.zeros((sum(is0),)))
+    y1 = torch.LongTensor(np.ones((sum(is1),)))
+    y2 = torch.LongTensor(2*np.ones((sum(is2),)))
+    l0 = sum(is0)
+    l1 = sum(is1)
+    l2 = sum(is2)
+    p0 = torch.randperm(l0)
+    p1 = torch.randperm(l1)
+    p2 = torch.randperm(l2)
+    p = torch.randperm(3000)
+    X_train = torch.cat((X0[p0][:1000], X1[p1][:1000], X2[p2][:1000]))[p]
+    y_train = torch.cat((y0[:1000], y1[:1000], y2[:1000]))[p]
+
+    #X_train = torch.FloatTensor(X_train)
     X_test = torch.FloatTensor(X_test)
-    y_train_ = torch.FloatTensor(y_train)
-    y_train = torch.max(torch.FloatTensor(y_train_),1)[1]
+    #y_train_ = torch.FloatTensor(y_train)
+    #y_train = torch.max(torch.FloatTensor(y_train_),1)[1]
 
     model = Feedforward(28, 9, 3)
     criterion = torch.nn.CrossEntropyLoss()
@@ -61,13 +84,13 @@
     before_train = criterion(y_pred, y_train)
     print('Test loss before training' , before_train.item())
 
-    l = X_train.shape[0]
+    l = 3000 # X_train.shape[0]
    batch_indices = list(zip(list(range(0,l,16))[:-1], list(range(16,l,16))))# + [(l-l%16,l)]
     batch_indices[-1] = (batch_indices[-1][0], l)
 
     # train model
     model.train()
-    epochs = 50
+    epochs = 100
     for epoch in range(epochs):
         batch = 0
         for a,b in batch_indices:
@@ -85,9 +108,15 @@
         loss = criterion(y_pred, y_train)
         print('Epoch {}: train loss: {}'.format(epoch, loss.item()))
         # shuffle dataset
-        p = torch.randperm(l)
-        X_train = X_train[p]
-        y_train = y_train[p]
+        #p = torch.randperm(l)
+        #X_train = X_train[p]
+        #y_train = y_train[p]
+        p0 = torch.randperm(l0)
+        p1 = torch.randperm(l1)
+        p2 = torch.randperm(l2)
+        p = torch.randperm(3000)
+        X_train = torch.cat((X0[p0][:1000], X1[p1][:1000], X2[p2][:1000]))[p]
+        y_train = torch.cat((y0[:1000], y1[:1000], y2[:1000]))[p]
 
     model.eval()
     y_pred = model.forward(X_train)
@@ -102,32 +131,32 @@
     #y_train = torch.max(torch.FloatTensor(y_train_),1)[1]
 
     # get dev set to make predictions
-    X_dev, y_dev = read_csv_nn_dev()
-    X_dev = torch.FloatTensor(X_dev)
-    y_dev_pre = torch.FloatTensor(y_dev)
-    y_dev = torch.max(torch.FloatTensor(y_dev_pre),1)[1]
+    #X_dev, y_dev = read_csv_nn_dev()
+    #X_dev = torch.FloatTensor(X_dev)
+    #y_dev_pre = torch.FloatTensor(y_dev)
+    #y_dev = torch.max(torch.FloatTensor(y_dev_pre),1)[1]
 
     # post-process to get predictions & get back to np format
-    y_pred = model.forward(X_dev)
+    y_pred = model.forward(X_test)
     y_pred_np = y_pred.detach().numpy()
     predmax = np.amax(y_pred_np, axis=1)
     preds = 1*(y_pred_np[:,1]==predmax) + 2*(y_pred_np[:,2]==predmax)
-    y_dev_ = y_dev.detach().numpy()
+    #y_dev_ = y_dev.detach().numpy()
 
     # create confusion matrix
-    cm = confusion_matrix(y_dev_, preds)
-    print(cm)
+    #cm = confusion_matrix(y_dev_, preds)
+    #print(cm)
 
     # save predictions
     df = pd.DataFrame()
     df['preds'] = preds
-    df['true'] = y_dev_
+    #df['true'] = y_dev_
     probs = y_pred.detach().numpy()
     df['pr0'] = probs[:,0]
     df['pr1'] = probs[:,1]
     df['pr2'] = probs[:,2]
-    df['correct'] = df.preds==df.true
-    df.to_csv('/Users/iriley/code/machine_learning/lab2020/preds_ffnn.csv', index=False)
+    #df['correct'] = df.preds==df.true
+    df.to_csv('/Users/iriley/code/machine_learning/lab2020/y_pred_model2.csv', index=False)
diff --git a/testing/save_dev_results.py b/testing/save_dev_results.py
deleted file mode 100644
index 9e91d6c..0000000
--- a/testing/save_dev_results.py
+++ /dev/null
@@ -1,62 +0,0 @@
-import os
-from classifier.linear_model import MultiClassPerceptron
-from sklearn.metrics import confusion_matrix as cm
-from utils.csv import read_csv_file
-from eval.metrics import f1_score
-import utils.constants as const
-import pandas as pd
-import numpy as np
-
-
-train_file_path = '/Users/iriley/code/citation-analysis/data/tsv/train.tsv'
-dev_file_path = '/Users/iriley/code/citation-analysis/data/tsv/dev.tsv'
-
-
-# Read the training dataset
-X_train_inst = read_csv_file(train_file_path, '\t')
-
-# set of labels from Training data
-labels = set([inst.true_label for inst in X_train_inst])
-
-# Read test data set
-X_dev_inst = read_csv_file(dev_file_path, '\t')
-
-# number of training iterations
-epochs = 50
-
-# create MultiClassPerceptron classifier object
-clf = MultiClassPerceptron(epochs=epochs, learning_rate=0.5, random_state=101)
-
-# train the model
-clf.fit(X_train=X_train_inst, labels=list(labels))
-
-# predict
-y_pred = clf.predict(X_dev_inst)
-y_scores = np.array(clf.get_class_scores(X_dev_inst))
-
-y_true = [inst.true_label for inst in X_dev_inst]
-
-labeldict = {'background': 0, 'method': 1, 'result': 2}
-y_pred = np.array([labeldict[x] for x in y_pred])
-y_true = np.array([labeldict[x] for x in y_true])
-
-conmat = cm(y_true, y_pred)
-
-df = pd.DataFrame()
-df['pred'] = y_pred
-df['true'] = y_true
-df['correct'] = y_pred==y_true
-df['score0'] = np.round(y_scores[:,0],3)
-df['score1'] = np.round(y_scores[:,1],3)
-df['score2'] = np.round(y_scores[:,2],3)
-
-df.to_csv('/Users/iriley/code/machine_learning/lab2020/preds_perceptron.csv', index=False)
-
-## Model Evaluation
-#f1_score_micro = f1_score(y_true, y_pred, labels, const.AVG_MICRO)
-#f1_score_macro = f1_score(y_true, y_pred, labels, const.AVG_MACRO)
-#f1_score_none = f1_score(y_true, y_pred, labels, None)
-
-## Print F1 Score
-#for result in f1_score_micro + f1_score_macro + f1_score_none:
-#    result.print_result()
\ No newline at end of file
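
For reference, the class-balancing block this patch inlines in nn_ff.py (once before training and again at the end of every epoch) can be expressed as a small helper. The sketch below is illustrative only and is not part of the patch: the function name make_balanced_epoch, its signature, and the assumption that X_train / y_train are NumPy arrays with one-hot labels (class 1 in column 1, class 2 in column 2) mirror the patched script but are otherwise hypothetical.

import numpy as np
import torch

def make_balanced_epoch(X_train, y_train, n_per_class=1000, n_classes=3):
    """Return a class-balanced, shuffled (X, y) sample for one epoch.

    Mirrors the patch: recover integer labels from the one-hot rows, draw up
    to n_per_class examples per class without replacement, then shuffle the
    concatenated sample so the classes are interleaved.
    """
    # integer class labels, same expression as the patch's yclass
    yclass = np.array([(row[1] == 1) + 2 * (row[2] == 1) for row in y_train])

    xs, ys = [], []
    for c in range(n_classes):
        idx = np.flatnonzero(yclass == c)
        # random subset of class c (up to n_per_class rows, no replacement)
        pick = idx[torch.randperm(len(idx))[:n_per_class].numpy()]
        xs.append(torch.FloatTensor(X_train[pick]))
        ys.append(torch.full((len(pick),), c, dtype=torch.long))

    X = torch.cat(xs)
    y = torch.cat(ys)
    p = torch.randperm(len(y))  # joint shuffle so mini-batches mix all classes
    return X[p], y[p]

Calling this once before the training loop and again after each epoch would reproduce the resampling the diff performs, without hard-coding the 1000-per-class and 3000-total sizes.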