diff --git a/classifier/linear_model.py b/classifier/linear_model.py
index 4275a35..3de308a 100644
--- a/classifier/linear_model.py
+++ b/classifier/linear_model.py
@@ -85,14 +85,18 @@ class MultiClassPerceptron:
     """
 
-    def __init__(self, epochs: int = 2000, learning_rate: float = 1):
+    def __init__(self, epochs: int = 5000, learning_rate: float = 1, random_state: int = 4):
         """
         :type epochs: int
         :type learning_rate: float
+        :type random_state: int
 
         :param epochs: number of training iterations
         :param learning_rate: learning rate for updating weights, Default is 1
+        :param random_state: random seed for shuffling the training data, useful for reproducing results.
+            Default is 4.
         """
+        self.random_state = random_state
         self.perceptron_dict = OrderedDict()  # contains Key : label and value : Perceptron Object for label
         self.epochs = epochs
         self.learning_rate = learning_rate
 
@@ -124,7 +128,7 @@ class MultiClassPerceptron:
 
         # Dictionary for storing label->Perceptron() objects, Create a new Perceptron object for each label
        for label in labels:
-            self.perceptron_dict[label] = Perceptron(label, get_sample_weights_with_features(-0.5), -0.5)
+            self.perceptron_dict[label] = Perceptron(label, get_sample_weights_with_features(theta_bias=-0.5), theta_bias=-0.5)
 
         next_print = int(self.epochs/10)
 
@@ -160,7 +164,7 @@ class MultiClassPerceptron:
                     self.perceptron_dict[inst.true_label].update_weights(inst.features, self.learning_rate, reward=True)
 
             # It's important to shuffle the data during every epoch
-            random.shuffle(X_train)
+            random.Random(self.random_state).shuffle(X_train)
 
     def predict(self, X_test: list):
         """
diff --git a/feature_extraction/features.py b/feature_extraction/features.py
index 3b2e360..d6241f7 100644
--- a/feature_extraction/features.py
+++ b/feature_extraction/features.py
@@ -5,7 +5,8 @@ from utils.constants import REGEX_CONSTANTS
 
 FEATURE_LIST = ['COMPARE', 'CONTRAST', 'RESULT', 'INCREASE', 'CHANGE', 'USE', 'PRESENT', 'IMPORTANT', 'RESEARCH',
                 'APPROACH', 'PUBLIC', 'BEFORE', 'BETTER_SOLUTION', 'PROFESSIONALS', 'MEDICINE', 'MATH', 'COMPUTER_SCIENCE', 'CITATION',
-                'ACRONYM', 'CONTAINS_YEAR', 'SEQUENCE', 'REFERENCE', 'PERCENTAGE', 'URL']
+                'ACRONYM', 'CONTAINS_YEAR', 'SEQUENCE', 'REFERENCE', 'PERCENTAGE',
+                'CONTAINS_URL', 'ENDS_WITH_RIDE', 'ENDS_WITH_RINE', 'ENDS_WITH_ETHYL']
 
 """ Feature Name for Theta Bias -- need to add it to the list of features for all data instances """
 THETA_BIAS_FEATURE = 'THETA_BIAS'
diff --git a/feature_extraction/lexicons.py b/feature_extraction/lexicons.py
index feb89db..5326eda 100644
--- a/feature_extraction/lexicons.py
+++ b/feature_extraction/lexicons.py
@@ -4,7 +4,7 @@ Dictionary of Lexicons used for Feature Extraction
 ALL_LEXICONS = {
 
     'COMPARE': ['compar', 'compet', 'evaluat', 'test', 'superior', 'inferior', 'better', 'best', 'good', 'low',
-                'worse', 'worst', 'greater', 'larger', 'faster', 'high', 'measur', 'between', 'another', 'similar'],
+                'wors', 'great', 'larger', 'faster', 'high', 'measur', 'between', 'another', 'similar'],
 
     'CONTRAST': ['contrast', 'different' 'distinct', 'conflict', 'disagree', 'oppose', 'distinguish', 'contrary'],
 
@@ -19,7 +19,8 @@ ALL_LEXICONS = {
     'USE': ['use', 'using', 'apply', 'applied', 'employ', 'make use', 'utilize', 'implement'],
 
     'PRESENT': ['describe', 'discuss', 'give', 'introduce', 'note', 'notice', 'present', 'propose', 'recapitulate',
-                'demonstrate', 'remark', 'report', 'say', 'show', 'sketch', 'state', 'suggest', 'figure'],
+                'demonstrate', 'remark', 'report', 'say', 'show', 'sketch', 'state', 'suggest', 'figure', 'indicate',
+                'specify', 'explain'],
 
     'IMPORTANT': ['important', 'main', 'key', 'basic', 'central', 'crucial', 'critical', 'essential', 'fundamental',
                   'great', 'largest', 'major', 'overall', 'primary', 'principle', 'serious', 'substantial', 'ultimate',
@@ -28,7 +29,8 @@ ALL_LEXICONS = {
     'RESEARCH': ['research', 'paper', 'study', 'studie', 'apply', 'analyze', 'characteri', 'formali', 'investigat',
                  'implement', 'interpret', 'examin', 'observ', 'predict', 'verify', 'work on', 'empirical', 'determin',
                  'experiment', 'exploratory', 'ongoing', 'quantitative', 'qualitative', 'preliminary', 'statistical',
-                 'knowledge', 'underway', 'discuss', 'reference', 'publish', 'document', 'orientation'],
+                 'knowledge', 'underway', 'discuss', 'reference', 'publish', 'document', 'orientation',
+                 'literature', 'experience'],
 
     'APPROACH': ['approach', 'account', 'algorithm', 'analys', 'approach', 'application', 'architecture', 'characteri',
                  'component', 'design', 'extension', 'formali', 'framework', 'implement', 'investigat', 'machine',
@@ -47,22 +49,25 @@ ALL_LEXICONS = {
                       'expert', 'investigators', 'linguists', 'philosophers', 'psycholinguists', 'psychologists',
                       'researchers', 'scholars', 'semanticists', 'scientists'],
 
-    'MEDICINE': ['medicine', 'tissue', 'gene', 'inflammatory', 'mutant', 'neuro', 'digest', 'ortho', 'kinase', 'pneumonia',
-                 'clinical', 'therap', 'kidney', 'receptor', 'cancer', 'synthesis', 'protein', 'syndrom', 'toxin', 'death', 'calcium',
-                 'pharma', 'heart', 'disease', 'vitamin', 'tumor', 'blind', 'symptom', 'medical', 'vaccin', 'molecule', 'rna',
+    'MEDICINE': ['medicine', 'tissue', 'gene', 'inflammatory', 'mutant', 'neuro', 'digest', 'ortho', 'kinase',
+                 'clinical', 'therap', 'kidney', 'receptor', 'cancer', 'synthesis', 'protein', 'syndrom', 'toxin', 'death',
+                 'pharma', 'heart', 'disease', 'vitamin', 'tumor', 'blind', 'symptom', 'medical', 'vaccin', 'molecule',
                  'biotic', 'patient', 'cells', 'immune', 'blood', 'plasma', 'diagnos', 'neura', 'reproductive', 'plasm', 'drug',
                  'membrane', 'muscle', 'contagious', 'inflam', 'physician', 'dna', 'genome', 'bacteria', 'cavity', 'injury',
-                 'antibodies', 'liver', 'treatment', 'pcr', 'acid', 'chronic', 'respirat', 'oxygen', 'stroke', 'antioxidant',
-                 'metabolic', 'transmission', 'endogenous', 'syndrome', 'ultrasound', 'pathogen'],
+                 'antibodies', 'liver', 'treatment', 'pcr', 'acid', 'chronic', 'respirat', 'oxygen', 'stroke', 'antioxidant', 'obesity',
+                 'metabolic', 'transmission', 'endogenous', 'syndrome', 'ultrasound', 'pathogen', 'inject', 'laparoscop',
+                 'circulat', 'ventricle', 'tract', 'pneumonia', 'calcium', 'rna', 'organism', 'biolog', 'x-ray'],
 
     'MATH': ['matrix', 'gaussian', 'variance', 'radius', 'function', 'comput', 'once', 'twice', 'thrice', 'diagram', 'mean',
             'vector', 'rectangle', 'logic', 'amount', 'maxim', 'minim', 'linear', 'magnitude', 'theorem', 'gradient', 'median',
             'exponential', 'complex', 'graph', 'mean', 'equation', 'offset', 'calculat', 'coefficient', 'discrete', 'equation',
-            'math', 'correlation', 'outcome', 'divergence', 'differentiation', 'statistic', 'parameter', 'probabilit', 'multivariate'],
+            'frequen', 'math', 'correlation', 'outcome', 'divergence', 'differentiation', 'statistic', 'parameter',
+            'probabilit', 'multivariate', 'negative', 'positive', 'regression', 'digit'],
 
-    'COMPUTER_SCIENCE': ['database', 'software', 'evaluation', 'framework', 'computer', 'network', 'algorithm',
-                         'dataset','data sets', 'technology', 'kernel', 'metrics', 'nlp', 'xml', 'corpus', 'uml', 'system',
-                         'security', 'protocol'],
+    'COMPUTER_SCIENCE': ['database', 'software', 'evaluation', 'framework', 'computer', 'network',
+                         'algorithm', 'dataset','data sets', 'technology', 'kernel', 'metrics', 'nlp', 'xml',
+                         'corpus', 'uml', 'system', 'security', 'protocol', 'classification', 'data transform',
+                         'memory', 'java', 'python', 'cluster', 'epoch', 'training', 'deadlock', 'technique'],
 
     'CITATION': ['et al'],  # TODO (for Isaac) :: Write a complex regex for finding Citations in the text
 
diff --git a/testing/model_testing.py b/testing/model_testing.py
index ba3ac5f..b640f69 100644
--- a/testing/model_testing.py
+++ b/testing/model_testing.py
@@ -15,9 +15,9 @@ labels = set([inst.true_label for inst in X_train_inst])
 
 X_test_inst = read_csv_file(test_file_path, '\t')
 
-epochs = int(len(X_train_inst)*0.75)
+epochs = int(len(X_train_inst)*0.95)
 
-clf = MultiClassPerceptron(epochs, 1)
+clf = MultiClassPerceptron(epochs=epochs, learning_rate=1, random_state=10)
 
 clf.fit(X_train=X_train_inst, labels=list(labels))
 
@@ -25,7 +25,9 @@ y_test = clf.predict(X_test_inst)
 
 y_true = [inst.true_label for inst in X_test_inst]
 
-f1_score_list = f1_score(y_true, y_test, labels, const.AVG_MICRO)
+f1_score_micro = f1_score(y_true, y_test, labels, const.AVG_MICRO)
+f1_score_macro = f1_score(y_true, y_test, labels, const.AVG_MACRO)
+f1_score_none = f1_score(y_true, y_test, labels, None)
 
-for result in f1_score_list:
+for result in f1_score_micro+f1_score_macro+f1_score_none:
     result.print_result()
diff --git a/utils/constants.py b/utils/constants.py
index 4318131..8a55fdb 100644
--- a/utils/constants.py
+++ b/utils/constants.py
@@ -7,14 +7,14 @@ AVG_MACRO = 'MACRO'
 
 REGEX_CONSTANTS = {
     # Regex for matching Acronym Patterns -> COVID-19 / SEKA / SMY2 / EAP1 / SCP16 / ASC1 / DENV-2
-    #'ACRONYM': re.compile(r"[m0-9\W^]([A-Z]{2,})[s\.,:\-$]"),
+    # 'ACRONYM': re.compile(r"[m0-9\W^]([A-Z]{2,})[s\.,:\-$]"),
     'ACRONYM': re.compile(r"^[A-Z]{2,}[\.,:;\b\s]|[\s\b]m?[A-Z]{2,}[\.,:;\b\s]"),
 
     # Regex for matching Years in the text - > 1995 / 2020 / 2019
     'CONTAINS_YEAR': re.compile(r"(?<=[^0-9])1[8-9][0-9]{2}(?=[^0-9$])|(?<=[^0-9])20[0-2][0-9](?=[^0-9$])"),
 
     # Regex for matching Number Sequences in the text -> (15) / (10, 11, 112, 113) / (1,7,8,10-14)
-    'SEQUENCE': re.compile(r"\([\d.*\)"),
+    'SEQUENCE': re.compile(r"\(\d.*\)"),
 
     # Regex for matching References in the text -> [4] / [ 10-17, 19, 20] / [123, 500]
     'REFERENCE': re.compile(r"\[\d.*\]"),
@@ -24,5 +24,12 @@ REGEX_CONSTANTS = {
 
     # Regex for matching URLs -> http://www.phrap.org/, http://www. , http://carcfordjournals. ,
     # https://www.ims.uni-stuttgart.de/
-    'URL': re.compile(r"https?://\S+")#...\S+(?=\.?,?:?[\s\"$])")
+    'CONTAINS_URL': re.compile(r"https?://\S+"),
+
+    'ENDS_WITH_RIDE': re.compile(r"ride\b"),
+
+    'ENDS_WITH_RINE': re.compile(r"rine\b"),
+
+    'ENDS_WITH_ETHYL': re.compile(r"ethyl\b")
+
 }
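As a quick sanity check of the regex-driven features this patch adds or changes, the minimal sketch below runs the patched REGEX_CONSTANTS against a handful of invented sentences. It is not part of the patch: it assumes it is executed from the repository root so that utils.constants is importable, and the samples dict and its sentences are purely illustrative.

# Minimal sketch (not part of the patch): exercise the new/changed REGEX_CONSTANTS
# entries and print what each compiled pattern finds in an invented sentence.
from utils.constants import REGEX_CONSTANTS

# Hypothetical sentences, each chosen only to trigger its pattern once.
samples = {
    'CONTAINS_URL': "The assembly is available at http://www.phrap.org/ for download.",
    'ENDS_WITH_RIDE': "Sodium chloride was added to the buffer.",
    'ENDS_WITH_RINE': "Fluorine substitution changed the binding affinity.",
    'ENDS_WITH_ETHYL': "The methyl group was replaced in the second experiment.",
    'SEQUENCE': "Earlier studies (10, 11, 112, 113) reported similar findings.",
    'REFERENCE': "This agrees with prior work [4].",
}

for feature_name, sentence in samples.items():
    # Each entry is a compiled re.Pattern, so .search() is available directly.
    match = REGEX_CONSTANTS[feature_name].search(sentence)
    print(feature_name, '->', match.group(0) if match else None)

A None result for any of these sentences would indicate that the corresponding pattern does not cover the examples listed in its own comment in utils/constants.py.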