Merge branch 'master' into isaac
Branch: isaac
Author: Pavan Mandava (6 years ago)
Commit: 9ff34905c4

@@ -62,9 +62,9 @@ class Perceptron:
         for feature in features:
             feature_weight = self.weights[feature]
             if penalize:
-                self.weights[feature] = feature_weight - (learning_rate * 1)
+                self.weights[feature] = round(feature_weight - (learning_rate * 1), 5)
             if reward:
-                self.weights[feature] = feature_weight + (learning_rate * 1)
+                self.weights[feature] = round(feature_weight + (learning_rate * 1), 5)
 
 class MultiClassPerceptron:
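The change above rounds each updated weight to five decimal places; the update itself is still the standard reward/penalize perceptron rule (add or subtract the learning rate for every active feature). A minimal sketch of that rule, with the dict-based weights and the rounding mirroring the hunk and a plain function standing in for the class method:

```python
# Minimal sketch of the reward/penalize update with 5-decimal rounding.
# The weight dict and rounding mirror the hunk; the free function is only illustrative.
def update_weights(weights: dict, features: list, learning_rate: float,
                   penalize: bool = False, reward: bool = False) -> None:
    for feature in features:
        feature_weight = weights.get(feature, 0.0)
        if penalize:
            # push the weight down for a wrongly predicted label
            weights[feature] = round(feature_weight - (learning_rate * 1), 5)
        if reward:
            # push the weight up for the true label
            weights[feature] = round(feature_weight + (learning_rate * 1), 5)

weights = {'COMPARE': 0.123456, 'RESULT': -0.5}
update_weights(weights, ['COMPARE', 'RESULT'], learning_rate=0.9, reward=True)
print(weights)  # {'COMPARE': 1.02346, 'RESULT': 0.4}
```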
@@ -85,14 +85,18 @@ class MultiClassPerceptron:
     """
-    def __init__(self, epochs: int = 2000, learning_rate: float = 1):
+    def __init__(self, epochs: int = 5000, learning_rate: float = 1, random_state: int = 42):
         """
         :type epochs: int
         :type learning_rate: float
+        :type random_state: int
         :param epochs: number of training iterations
         :param learning_rate: learning rate for updating weights, Default is 1
+        :param random_state: random state for shuffling the data, useful for reproducing the results.
+                             Default is 42.
         """
+        self.random_state = random_state
         self.perceptron_dict = OrderedDict()  # contains Key : label and value : Perceptron Object for label
         self.epochs = epochs
         self.learning_rate = learning_rate
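With `random_state` on the constructor, two runs built from the same seed should initialise identical per-label weights and see the same instance order. A usage sketch (the import path is an assumption; the constructor arguments match the hunk):

```python
# Usage sketch; only the constructor signature is taken from the diff,
# the import path is a guess at the project layout.
from classifier.perceptron import MultiClassPerceptron  # hypothetical path

clf_a = MultiClassPerceptron(epochs=5000, learning_rate=1, random_state=42)
clf_b = MultiClassPerceptron(epochs=5000, learning_rate=1, random_state=42)
# Given the same training data, both objects should now produce the same predictions.
```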
@@ -124,10 +128,14 @@ class MultiClassPerceptron:
         # Dictionary for storing label->Perceptron() objects, Create a new Perceptron object for each label
         for label in labels:
-            self.perceptron_dict[label] = Perceptron(label, get_sample_weights_with_features(-0.5), -0.5)
+            sample_weights = get_sample_weights_with_features(theta_bias=-0.25, random_state=self.random_state)
+            self.perceptron_dict[label] = Perceptron(label, sample_weights, theta_bias=-0.25)
 
         next_print = int(self.epochs/10)
+        random.seed(self.random_state)
+        random_list = [random.randint(0, train_len-1) for i in range(self.epochs)]
 
         # Training Iterations
         for epoch in range(self.epochs):
@@ -135,10 +143,8 @@
                 print('Training Multi-Class Perceptron Classifier..... (', epoch, '/', self.epochs, ')')
                 next_print = next_print + int(self.epochs/10)
-            # get a random number within the size of training set
-            rand_num = random.randint(0, train_len-1)
-            # pick a random data instance with the generated random number
-            inst = X_train[rand_num]
+            # Pick a number from random list
+            inst = X_train[random_list[epoch]]
 
             perceptron_scores = []  # list for storing perceptron scores for each label
             for label, perceptron in self.perceptron_dict.items():
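Instead of drawing a fresh index inside the loop, `fit` now seeds the module RNG once and pre-generates one index per epoch, which makes the sampled training order reproducible. A standalone sketch of that pattern (`train_len` and the epoch count are illustrative values):

```python
import random

random_state = 42
epochs = 10
train_len = 250  # illustrative size of X_train

# Pre-generate one training-instance index per epoch, exactly once.
random.seed(random_state)
random_list = [random.randint(0, train_len - 1) for _ in range(epochs)]

# Each epoch then looks its index up instead of calling random.randint() again.
for epoch in range(epochs):
    idx = random_list[epoch]
    print(epoch, idx)  # same sequence on every run with the same random_state
```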
@@ -159,8 +165,8 @@
                     # increase weights
                     self.perceptron_dict[inst.true_label].update_weights(inst.features, self.learning_rate, reward=True)
 
-            # It's important to shuffle the data during every epoch
-            random.shuffle(X_train)
+            # It's important to shuffle the list during every epoch
+            random.Random(self.random_state).shuffle(X_train)
 
     def predict(self, X_test: list):
         """
@@ -192,15 +198,22 @@
         return y_test
 
-def get_sample_weights_with_features(theta_bias: float = None):
+def get_sample_weights_with_features(theta_bias: float = None, random_state: int = 42):
     """
     This function creates a dictionary with feature as a key and a random floating number (feature weight) as value.
     Weights for each feature is a floating number between -1 and 1
+    :type theta_bias: float
+    :type random_state: int
+    :param theta_bias: value of theta bias variable
+    :param random_state: random seed number for reproducing the results
     :return: returns a dictionary of random weights for each feature
     """
     weights = {THETA_BIAS_FEATURE: theta_bias}
+    random.seed(random_state)
     for feature in FEATURE_LIST:
-        weights[feature] = round(random.uniform(-1.0, 1.0), 4)
+        weights[feature] = round(random.uniform(-1.0, 1.0), 5)
     return weights
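Seeding before the `random.uniform` draws means every call with the same `random_state` returns the same starting weights, so all per-label perceptrons begin from an identical dictionary. A standalone sketch (the feature constants are shortened stand-ins for the project's real lists):

```python
import random

THETA_BIAS_FEATURE = 'THETA_BIAS'                 # stand-in for the utils constant
FEATURE_LIST = ['COMPARE', 'CONTRAST', 'RESULT']  # shortened stand-in list

def get_sample_weights_with_features(theta_bias: float = None, random_state: int = 42) -> dict:
    weights = {THETA_BIAS_FEATURE: theta_bias}
    random.seed(random_state)                     # same seed -> same starting weights
    for feature in FEATURE_LIST:
        weights[feature] = round(random.uniform(-1.0, 1.0), 5)
    return weights

# Called twice with the same random_state, the dictionaries are identical.
assert get_sample_weights_with_features(-0.25) == get_sample_weights_with_features(-0.25)
```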

@@ -5,7 +5,8 @@ from utils.constants import REGEX_CONSTANTS
 FEATURE_LIST = ['COMPARE', 'CONTRAST', 'RESULT', 'INCREASE', 'CHANGE', 'USE', 'PRESENT',
                 'IMPORTANT', 'RESEARCH', 'APPROACH', 'PUBLIC', 'BEFORE', 'BETTER_SOLUTION',
                 'PROFESSIONALS', 'MEDICINE', 'MATH', 'COMPUTER_SCIENCE', 'CITATION',
-                'ACRONYM', 'CONTAINS_YEAR', 'SEQUENCE', 'REFERENCE', 'PERCENTAGE', 'URL']
+                'ACRONYM', 'CONTAINS_YEAR', 'SEQUENCE', 'REFERENCE', 'PERCENTAGE',
+                'CONTAINS_URL', 'ENDS_WITH_RIDE', 'ENDS_WITH_RINE', 'ENDS_WITH_ETHYL']
 
 """ Feature Name for Theta Bias -- need to add it to the list of features for all data instances """
 THETA_BIAS_FEATURE = 'THETA_BIAS'

@@ -4,7 +4,7 @@ Dictionary of Lexicons used for Feature Extraction
 ALL_LEXICONS = {
     'COMPARE': ['compar', 'compet', 'evaluat', 'test', 'superior', 'inferior', 'better', 'best', 'good', 'low',
-                'worse', 'worst', 'greater', 'larger', 'faster', 'high', 'measur', 'between', 'another', 'similar'],
+                'wors', 'great', 'larger', 'faster', 'high', 'measur', 'between', 'another', 'similar'],
 
     'CONTRAST': ['contrast', 'different' 'distinct', 'conflict', 'disagree', 'oppose', 'distinguish', 'contrary'],
@@ -19,7 +19,8 @@ ALL_LEXICONS = {
     'USE': ['use', 'using', 'apply', 'applied', 'employ', 'make use', 'utilize', 'implement'],
 
     'PRESENT': ['describe', 'discuss', 'give', 'introduce', 'note', 'notice', 'present', 'propose', 'recapitulate',
-                'demonstrate', 'remark', 'report', 'say', 'show', 'sketch', 'state', 'suggest', 'figure'],
+                'demonstrate', 'remark', 'report', 'say', 'show', 'sketch', 'state', 'suggest', 'figure', 'indicate',
+                'specify', 'explain'],
 
     'IMPORTANT': ['important', 'main', 'key', 'basic', 'central', 'crucial', 'critical', 'essential', 'fundamental',
                   'great', 'largest', 'major', 'overall', 'primary', 'principle', 'serious', 'substantial', 'ultimate',
@@ -28,7 +29,8 @@ ALL_LEXICONS = {
     'RESEARCH': ['research', 'paper', 'study', 'studie', 'apply', 'analyze', 'characteri', 'formali', 'investigat',
                  'implement', 'interpret', 'examin', 'observ', 'predict', 'verify', 'work on', 'empirical', 'determin',
                  'experiment', 'exploratory', 'ongoing', 'quantitative', 'qualitative', 'preliminary', 'statistical',
-                 'knowledge', 'underway', 'discuss', 'reference', 'publish', 'document', 'orientation'],
+                 'knowledge', 'underway', 'discuss', 'reference', 'publish', 'document', 'orientation',
+                 'literature', 'experience'],
 
     'APPROACH': ['approach', 'account', 'algorithm', 'analys', 'approach', 'application', 'architecture', 'characteri',
                  'component', 'design', 'extension', 'formali', 'framework', 'implement', 'investigat', 'machine',
@@ -47,22 +49,25 @@ ALL_LEXICONS = {
                       'expert', 'investigators', 'linguists', 'philosophers', 'psycholinguists',
                       'psychologists', 'researchers', 'scholars', 'semanticists', 'scientists'],
 
-    'MEDICINE': ['medicine', 'tissue', 'gene', 'inflammatory', 'mutant', 'neuro', 'digest', 'ortho', 'kinase', 'pneumonia',
-                 'clinical', 'therap', 'kidney', 'receptor', 'cancer', 'synthesis', 'protein', 'syndrom', 'toxin', 'death', 'calcium',
-                 'pharma', 'heart', 'disease', 'vitamin', 'tumor', 'blind', 'symptom', 'medical', 'vaccin', 'molecule', 'rna',
+    'MEDICINE': ['medicine', 'tissue', 'gene', 'inflammatory', 'mutant', 'neuro', 'digest', 'ortho', 'kinase',
+                 'clinical', 'therap', 'kidney', 'receptor', 'cancer', 'synthesis', 'protein', 'syndrom', 'toxin', 'death',
+                 'pharma', 'heart', 'disease', 'vitamin', 'tumor', 'blind', 'symptom', 'medical', 'vaccin', 'molecule',
                  'biotic', 'patient', 'cells', 'immune', 'blood', 'plasma', 'diagnos', 'neura', 'reproductive', 'plasm', 'drug',
                  'membrane', 'muscle', 'contagious', 'inflam', 'physician', 'dna', 'genome', 'bacteria', 'cavity', 'injury',
-                 'antibodies', 'liver', 'treatment', 'pcr', 'acid', 'chronic', 'respirat', 'oxygen', 'stroke', 'antioxidant',
-                 'metabolic', 'transmission', 'endogenous', 'syndrome', 'ultrasound', 'pathogen'],
+                 'antibodies', 'liver', 'treatment', 'pcr', 'acid', 'chronic', 'respirat', 'oxygen', 'stroke', 'antioxidant', 'obesity',
+                 'metabolic', 'transmission', 'endogenous', 'syndrome', 'ultrasound', 'pathogen', 'inject', 'laparoscop',
+                 'circulat', 'ventricle', 'tract', 'pneumonia', 'calcium', 'rna', 'organism', 'biolog', 'x-ray'],
 
     'MATH': ['matrix', 'gaussian', 'variance', 'radius', 'function', 'comput', 'once', 'twice', 'thrice', 'diagram', 'mean',
              'vector', 'rectangle', 'logic', 'amount', 'maxim', 'minim', 'linear', 'magnitude', 'theorem', 'gradient', 'median',
             'exponential', 'complex', 'graph', 'mean', 'equation', 'offset', 'calculat', 'coefficient', 'discrete', 'equation',
-             'math', 'correlation', 'outcome', 'divergence', 'differentiation', 'statistic', 'parameter', 'probabilit', 'multivariate'],
+             'frequen', 'math', 'correlation', 'outcome', 'divergence', 'differentiation', 'statistic', 'parameter',
+             'probabilit', 'multivariate', 'negative', 'positive', 'regression', 'digit'],
 
-    'COMPUTER_SCIENCE': ['database', 'software', 'evaluation', 'framework', 'computer', 'network', 'algorithm',
-                         'dataset','data sets', 'technology', 'kernel', 'metrics', 'nlp', 'xml', 'corpus', 'uml', 'system',
-                         'security', 'protocol'],
+    'COMPUTER_SCIENCE': ['database', 'software', 'evaluation', 'framework', 'computer', 'network',
+                         'algorithm', 'dataset','data sets', 'technology', 'kernel', 'metrics', 'nlp', 'xml',
+                         'corpus', 'uml', 'system', 'security', 'protocol', 'classification', 'data transform',
+                         'memory', 'java', 'python', 'cluster', 'epoch', 'training', 'deadlock', 'technique'],
 
     'CITATION': ['et al'],  # TODO (for Isaac) :: Write a complex regex for finding Citations in the text
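The lexicon entries are mostly stems ('compar', 'wors', 'measur'), which suggests they are matched as substrings of lower-cased sentence text; the feature-extraction code itself is not part of this diff, so the sketch below is purely illustrative of that style of lookup:

```python
# Illustrative stem/substring lexicon lookup; not the project's actual extractor.
ALL_LEXICONS = {
    'COMPARE': ['compar', 'wors', 'great', 'similar'],
    'USE': ['use', 'using', 'employ'],
}

def lexicon_features(sentence: str) -> list:
    text = sentence.lower()
    return [name for name, stems in ALL_LEXICONS.items()
            if any(stem in text for stem in stems)]

print(lexicon_features("We compare our approach using a larger corpus."))
# ['COMPARE', 'USE'] under this toy lexicon
```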

@@ -8,24 +8,35 @@ project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 train_file_path = project_root+'/data/tsv/train.tsv'
 test_file_path = project_root+'/data/tsv/test.tsv'
 
+# Read the training dataset
 X_train_inst = read_csv_file(train_file_path, '\t')
+# set of labels from Training data
 labels = set([inst.true_label for inst in X_train_inst])
+# Read test data set
 X_test_inst = read_csv_file(test_file_path, '\t')
 
-epochs = int(len(X_train_inst)*0.75)
+# number of training iterations
+epochs = int(len(X_train_inst)*0.9)
 
-clf = MultiClassPerceptron(epochs, 1)
+# create MultiClassPerceptron classifier object
+clf = MultiClassPerceptron(epochs=epochs, learning_rate=0.9, random_state=42)
 
+# train the model
 clf.fit(X_train=X_train_inst, labels=list(labels))
+# predict
 y_test = clf.predict(X_test_inst)
 y_true = [inst.true_label for inst in X_test_inst]
 
-f1_score_list = f1_score(y_true, y_test, labels, const.AVG_MICRO)
+# Model Evaluation
+f1_score_micro = f1_score(y_true, y_test, labels, const.AVG_MICRO)
+# f1_score_macro = f1_score(y_true, y_test, labels, const.AVG_MACRO)
+# f1_score_none = f1_score(y_true, y_test, labels, None)
 
-for result in f1_score_list:
+# Print F1 Score
+for result in f1_score_micro:
     result.print_result()
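The driver keeps only the micro-averaged F1 call active. Micro averaging pools true positives, false positives and false negatives over all labels before computing precision and recall, which for single-label multi-class prediction reduces to overall accuracy. A small self-contained illustration, independent of the project's `f1_score` helper:

```python
def micro_f1(y_true, y_pred):
    """Micro-averaged F1: pool TP/FP/FN across all labels, then compute P and R."""
    tp = sum(t == p for t, p in zip(y_true, y_pred))
    fp = fn = len(y_true) - tp  # in single-label prediction every error is one FP and one FN
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    return 2 * precision * recall / (precision + recall)

print(micro_f1(['A', 'B', 'A', 'C'], ['A', 'B', 'C', 'C']))  # 0.75
```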

@@ -14,7 +14,7 @@ REGEX_CONSTANTS = {
    'CONTAINS_YEAR': re.compile(r"(?<=[^0-9])1[8-9][0-9]{2}(?=[^0-9$])|(?<=[^0-9])20[0-2][0-9](?=[^0-9$])"),
    # Regex for matching Number Sequences in the text -> (15) / (10, 11, 112, 113) / (1,7,8,10-14)
-    'SEQUENCE': re.compile(r"\([\d.*\)"),
+    'SEQUENCE': re.compile(r"\([\d.*]\)"),
    # Regex for matching References in the text -> [4] / [ 10-17, 19, 20] / [123, 500]
    'REFERENCE': re.compile(r"\[\d.*\]"),
@@ -22,7 +22,13 @@ REGEX_CONSTANTS = {
    # Regex for matching percentages in the text -> 99% / 99.99% / 10 % / 23.98% / 10-20% / 25%-30%
    'PERCENTAGE': re.compile(r"\d[\d\.\-]+%"),
-    # Regex for matching URLs -> http://www.phrap.org/, http://www. , http://carcfordjournals. ,
-    # https://www.ims.uni-stuttgart.de/
-    'URL': re.compile(r"https?://\S+")#...\S+(?=\.?,?:?[\s\"$])")
+    # Regex for matching URLs -> http://www.phrap.org/, http://www. , http://carcfordjournals.
+    'CONTAINS_URL': re.compile(r"https?://\S+"),
+    'ENDS_WITH_RIDE': re.compile(r"ride\b"),
+    'ENDS_WITH_RINE': re.compile(r"rine\b"),
+    'ENDS_WITH_ETHYL': re.compile(r"ethyl\b")
 }
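The old SEQUENCE pattern did not compile (its character class was never closed); the corrected pattern compiles, though `[\d.*]` still matches only a single character between the parentheses, which is narrower than the "(10, 11, 112, 113)" examples in the comment and may deserve a follow-up. A quick check of the fixed and newly added patterns (patterns copied from the hunk, sample strings illustrative):

```python
import re

# Patterns copied from the diff; the sample strings below are illustrative only.
SEQUENCE = re.compile(r"\([\d.*]\)")
CONTAINS_URL = re.compile(r"https?://\S+")
ENDS_WITH_RIDE = re.compile(r"ride\b")
ENDS_WITH_RINE = re.compile(r"rine\b")
ENDS_WITH_ETHYL = re.compile(r"ethyl\b")

print(bool(SEQUENCE.search("as shown in (1)")))                            # True
print(bool(SEQUENCE.search("as shown in (15)")))                           # False: one char only
print(bool(CONTAINS_URL.search("see https://www.ims.uni-stuttgart.de/")))  # True
print(bool(ENDS_WITH_RIDE.search("sodium chloride solution")))             # True
print(bool(ENDS_WITH_RINE.search("traces of chlorine")))                   # True
print(bool(ENDS_WITH_ETHYL.search("a methyl group")))                      # True
```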
