Added more regex features and random_state to the Classifier

isaac
Pavan Mandava 6 years ago
parent 52f853d796
commit 3c0e4a411d

@@ -85,14 +85,18 @@ class MultiClassPerceptron:
     """
-    def __init__(self, epochs: int = 2000, learning_rate: float = 1):
+    def __init__(self, epochs: int = 5000, learning_rate: float = 1, random_state: int = 4):
         """
         :type epochs: int
         :type learning_rate: float
+        :type random_state: int
         :param epochs: number of training iterations
         :param learning_rate: learning rate for updating weights, Default is 1
+        :param random_state: random state for shuffling the data, useful for reproducing the results.
+                             Default is 4.
         """
+        self.random_state = random_state
         self.perceptron_dict = OrderedDict() # contains Key : label and value : Perceptron Object for label
         self.epochs = epochs
         self.learning_rate = learning_rate
@@ -124,7 +128,7 @@ class MultiClassPerceptron:
         # Dictionary for storing label->Perceptron() objects, Create a new Perceptron object for each label
         for label in labels:
-            self.perceptron_dict[label] = Perceptron(label, get_sample_weights_with_features(-0.5), -0.5)
+            self.perceptron_dict[label] = Perceptron(label, get_sample_weights_with_features(theta_bias=-0.5), theta_bias=-0.5)
         next_print = int(self.epochs/10)
@@ -160,7 +164,7 @@ class MultiClassPerceptron:
                     self.perceptron_dict[inst.true_label].update_weights(inst.features, self.learning_rate, reward=True)
             # It's important to shuffle the data during every epoch
-            random.shuffle(X_train)
+            random.Random(self.random_state).shuffle(X_train)
     def predict(self, X_test: list):
         """

@@ -5,7 +5,8 @@ from utils.constants import REGEX_CONSTANTS
 FEATURE_LIST = ['COMPARE', 'CONTRAST', 'RESULT', 'INCREASE', 'CHANGE', 'USE', 'PRESENT',
                 'IMPORTANT', 'RESEARCH', 'APPROACH', 'PUBLIC', 'BEFORE', 'BETTER_SOLUTION',
                 'PROFESSIONALS', 'MEDICINE', 'MATH', 'COMPUTER_SCIENCE', 'CITATION',
-                'ACRONYM', 'CONTAINS_YEAR', 'SEQUENCE', 'REFERENCE', 'PERCENTAGE', 'URL']
+                'ACRONYM', 'CONTAINS_YEAR', 'SEQUENCE', 'REFERENCE', 'PERCENTAGE',
+                'CONTAINS_URL', 'ENDS_WITH_RIDE', 'ENDS_WITH_RINE', 'ENDS_WITH_ETHYL']
 """ Feature Name for Theta Bias -- need to add it to the list of features for all data instances """
 THETA_BIAS_FEATURE = 'THETA_BIAS'
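
For illustration only: the renamed 'CONTAINS_URL' feature and the three new ENDS_WITH_* features presumably have to match keys in REGEX_CONSTANTS (last hunk below). A hedged sanity check, assuming regex-backed features are looked up by these names; the snippet is not part of the repo:

from utils.constants import REGEX_CONSTANTS  # import path taken from the hunk context above

# Names visible in the REGEX_CONSTANTS hunks of this commit.
regex_features = ['CONTAINS_YEAR', 'SEQUENCE', 'REFERENCE', 'CONTAINS_URL',
                  'ENDS_WITH_RIDE', 'ENDS_WITH_RINE', 'ENDS_WITH_ETHYL']
missing = [name for name in regex_features if name not in REGEX_CONSTANTS]
assert not missing, f"feature names without a compiled regex: {missing}"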

@@ -4,7 +4,7 @@ Dictionary of Lexicons used for Feature Extraction
 ALL_LEXICONS = {
     'COMPARE': ['compar', 'compet', 'evaluat', 'test', 'superior', 'inferior', 'better', 'best', 'good', 'low',
-                'worse', 'worst', 'greater', 'larger', 'faster', 'high', 'measur', 'between', 'another', 'similar'],
+                'wors', 'great', 'larger', 'faster', 'high', 'measur', 'between', 'another', 'similar'],
     'CONTRAST': ['contrast', 'different' 'distinct', 'conflict', 'disagree', 'oppose', 'distinguish', 'contrary'],
@@ -19,7 +19,8 @@ ALL_LEXICONS = {
     'USE': ['use', 'using', 'apply', 'applied', 'employ', 'make use', 'utilize', 'implement'],
     'PRESENT': ['describe', 'discuss', 'give', 'introduce', 'note', 'notice', 'present', 'propose', 'recapitulate',
-                'demonstrate', 'remark', 'report', 'say', 'show', 'sketch', 'state', 'suggest', 'figure'],
+                'demonstrate', 'remark', 'report', 'say', 'show', 'sketch', 'state', 'suggest', 'figure', 'indicate',
+                'specify', 'explain'],
     'IMPORTANT': ['important', 'main', 'key', 'basic', 'central', 'crucial', 'critical', 'essential', 'fundamental',
                   'great', 'largest', 'major', 'overall', 'primary', 'principle', 'serious', 'substantial', 'ultimate',
@@ -28,7 +29,8 @@ ALL_LEXICONS = {
     'RESEARCH': ['research', 'paper', 'study', 'studie', 'apply', 'analyze', 'characteri', 'formali', 'investigat',
                  'implement', 'interpret', 'examin', 'observ', 'predict', 'verify', 'work on', 'empirical', 'determin',
                  'experiment', 'exploratory', 'ongoing', 'quantitative', 'qualitative', 'preliminary', 'statistical',
-                 'knowledge', 'underway', 'discuss', 'reference', 'publish', 'document', 'orientation'],
+                 'knowledge', 'underway', 'discuss', 'reference', 'publish', 'document', 'orientation',
+                 'literature', 'experience'],
     'APPROACH': ['approach', 'account', 'algorithm', 'analys', 'approach', 'application', 'architecture', 'characteri',
                  'component', 'design', 'extension', 'formali', 'framework', 'implement', 'investigat', 'machine',
@@ -47,22 +49,25 @@ ALL_LEXICONS = {
                       'expert', 'investigators', 'linguists', 'philosophers', 'psycholinguists',
                       'psychologists', 'researchers', 'scholars', 'semanticists', 'scientists'],
-    'MEDICINE': ['medicine', 'tissue', 'gene', 'inflammatory', 'mutant', 'neuro', 'digest', 'ortho', 'kinase', 'pneumonia',
-                 'clinical', 'therap', 'kidney', 'receptor', 'cancer', 'synthesis', 'protein', 'syndrom', 'toxin', 'death', 'calcium',
-                 'pharma', 'heart', 'disease', 'vitamin', 'tumor', 'blind', 'symptom', 'medical', 'vaccin', 'molecule', 'rna',
+    'MEDICINE': ['medicine', 'tissue', 'gene', 'inflammatory', 'mutant', 'neuro', 'digest', 'ortho', 'kinase',
+                 'clinical', 'therap', 'kidney', 'receptor', 'cancer', 'synthesis', 'protein', 'syndrom', 'toxin', 'death',
+                 'pharma', 'heart', 'disease', 'vitamin', 'tumor', 'blind', 'symptom', 'medical', 'vaccin', 'molecule',
                  'biotic', 'patient', 'cells', 'immune', 'blood', 'plasma', 'diagnos', 'neura', 'reproductive', 'plasm', 'drug',
                  'membrane', 'muscle', 'contagious', 'inflam', 'physician', 'dna', 'genome', 'bacteria', 'cavity', 'injury',
-                 'antibodies', 'liver', 'treatment', 'pcr', 'acid', 'chronic', 'respirat', 'oxygen', 'stroke', 'antioxidant',
-                 'metabolic', 'transmission', 'endogenous', 'syndrome', 'ultrasound', 'pathogen'],
+                 'antibodies', 'liver', 'treatment', 'pcr', 'acid', 'chronic', 'respirat', 'oxygen', 'stroke', 'antioxidant', 'obesity',
+                 'metabolic', 'transmission', 'endogenous', 'syndrome', 'ultrasound', 'pathogen', 'inject', 'laparoscop',
+                 'circulat', 'ventricle', 'tract', 'pneumonia', 'calcium', 'rna', 'organism', 'biolog', 'x-ray'],
     'MATH': ['matrix', 'gaussian', 'variance', 'radius', 'function', 'comput', 'once', 'twice', 'thrice', 'diagram', 'mean',
              'vector', 'rectangle', 'logic', 'amount', 'maxim', 'minim', 'linear', 'magnitude', 'theorem', 'gradient', 'median',
              'exponential', 'complex', 'graph', 'mean', 'equation', 'offset', 'calculat', 'coefficient', 'discrete', 'equation',
-             'math', 'correlation', 'outcome', 'divergence', 'differentiation', 'statistic', 'parameter', 'probabilit', 'multivariate'],
-    'COMPUTER_SCIENCE': ['database', 'software', 'evaluation', 'framework', 'computer', 'network', 'algorithm',
-                         'dataset','data sets', 'technology', 'kernel', 'metrics', 'nlp', 'xml', 'corpus', 'uml', 'system',
-                         'security', 'protocol'],
+             'frequen', 'math', 'correlation', 'outcome', 'divergence', 'differentiation', 'statistic', 'parameter',
+             'probabilit', 'multivariate', 'negative', 'positive', 'regression', 'digit'],
+    'COMPUTER_SCIENCE': ['database', 'software', 'evaluation', 'framework', 'computer', 'network',
+                         'algorithm', 'dataset','data sets', 'technology', 'kernel', 'metrics', 'nlp', 'xml',
+                         'corpus', 'uml', 'system', 'security', 'protocol', 'classification', 'data transform',
+                         'memory', 'java', 'python', 'cluster', 'epoch', 'training', 'deadlock', 'technique'],
     'CITATION': ['et al'], # TODO (for Isaac) :: Write a complex regex for finding Citations in the text
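
Entries such as 'wors', 'great', 'frequen' and 'laparoscop' are truncated stems, which only pays off if lexicon terms are matched as substrings of the sentence rather than as whole tokens. A minimal sketch of that assumption (the helper below is hypothetical, not the repo's feature extractor):

# Assumed matching scheme: a lexicon feature fires when any of its stems occurs
# as a substring of the lower-cased sentence.
def lexicon_fires(stems, sentence):
    text = sentence.lower()
    return any(stem in text for stem in stems)

compare_stems = ['compar', 'wors', 'great', 'larger']
print(lexicon_fires(compare_stems, "The new model performs worse than the baseline."))  # True, via 'wors'
print(lexicon_fires(compare_stems, "We compare both systems on three corpora."))        # True, via 'compar'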

@@ -15,9 +15,9 @@ labels = set([inst.true_label for inst in X_train_inst])
 X_test_inst = read_csv_file(test_file_path, '\t')
-epochs = int(len(X_train_inst)*0.75)
-clf = MultiClassPerceptron(epochs, 1)
+epochs = int(len(X_train_inst)*0.95)
+clf = MultiClassPerceptron(epochs=epochs, learning_rate=1, random_state=10)
 clf.fit(X_train=X_train_inst, labels=list(labels))
@@ -25,7 +25,9 @@ y_test = clf.predict(X_test_inst)
 y_true = [inst.true_label for inst in X_test_inst]
-f1_score_list = f1_score(y_true, y_test, labels, const.AVG_MICRO)
-for result in f1_score_list:
+f1_score_micro = f1_score(y_true, y_test, labels, const.AVG_MICRO)
+f1_score_macro = f1_score(y_true, y_test, labels, const.AVG_MACRO)
+f1_score_none = f1_score(y_true, y_test, labels, None)
+for result in f1_score_micro+f1_score_macro+f1_score_none:
     result.print_result()
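
Micro-F1 pools true/false positives over all labels (so frequent labels dominate), macro-F1 averages the per-label scores with equal weight, and passing None presumably returns the unaveraged per-label scores; the loop above prints all three. As an independent cross-check of the repo's f1_score, scikit-learn's implementation could be compared (illustrative only, with made-up labels):

from sklearn.metrics import f1_score as sk_f1  # cross-check only; not the project's f1_score

y_true = ['BKG', 'BKG', 'OWN', 'CTR', 'OWN']   # invented labels for illustration
y_pred = ['BKG', 'OWN', 'OWN', 'CTR', 'CTR']

print(sk_f1(y_true, y_pred, average='micro'))  # counts pooled over all labels
print(sk_f1(y_true, y_pred, average='macro'))  # unweighted mean of per-label F1
print(sk_f1(y_true, y_pred, average=None))     # one score per label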

@@ -14,7 +14,7 @@ REGEX_CONSTANTS = {
     'CONTAINS_YEAR': re.compile(r"(?<=[^0-9])1[8-9][0-9]{2}(?=[^0-9$])|(?<=[^0-9])20[0-2][0-9](?=[^0-9$])"),
     # Regex for matching Number Sequences in the text -> (15) / (10, 11, 112, 113) / (1,7,8,10-14)
-    'SEQUENCE': re.compile(r"\([\d.*\)"),
+    'SEQUENCE': re.compile(r"\([\d.*]\)"),
     # Regex for matching References in the text -> [4] / [ 10-17, 19, 20] / [123, 500]
     'REFERENCE': re.compile(r"\[\d.*\]"),
@@ -24,5 +24,12 @@ REGEX_CONSTANTS = {
     # Regex for matching URLs -> http://www.phrap.org/, http://www. , http://carcfordjournals. ,
     # https://www.ims.uni-stuttgart.de/
-    'URL': re.compile(r"https?://\S+")#...\S+(?=\.?,?:?[\s\"$])")
+    'CONTAINS_URL': re.compile(r"https?://\S+"),
+    'ENDS_WITH_RIDE': re.compile(r"ride\b"),
+    'ENDS_WITH_RINE': re.compile(r"rine\b"),
+    'ENDS_WITH_ETHYL': re.compile(r"ethyl\b")
 }
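
The three ENDS_WITH_* patterns anchor a suffix with \b, so they fire on any token ending in that suffix (e.g. 'chloride', 'chlorine', 'methyl'), while 'CONTAINS_URL' grabs everything after http:// or https:// up to the next whitespace. A quick standalone check of the same patterns:

import re

CONTAINS_URL = re.compile(r"https?://\S+")   # same patterns as above, recompiled here
ENDS_WITH_RIDE = re.compile(r"ride\b")
ENDS_WITH_ETHYL = re.compile(r"ethyl\b")

print(CONTAINS_URL.search("See https://www.ims.uni-stuttgart.de/ for details"))  # matches the URL
print(ENDS_WITH_RIDE.search("sodium chloride solution"))   # matches 'ride' at the end of 'chloride'
print(ENDS_WITH_ETHYL.search("a methyl group"))            # matches 'ethyl' at the end of 'methyl'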
