From c915db6fc52cf82daa45384adc9d3497d9ab5b90 Mon Sep 17 00:00:00 2001
From: Pavan Mandava
Date: Thu, 14 May 2020 18:19:21 +0200
Subject: [PATCH] Added some more words to lexicon dictionary

---
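Illustrative note (text between the '---' marker and the diffstat is not applied
by 'git am'): the word-matching branch now lower-cases the input before the
substring test, so the new lexicon entries are matched case-insensitively. A
minimal sketch of that branch, assuming the package is importable; the sentence
is invented:

    from feature_extraction.lexicons import ALL_LEXICONS

    text = 'The vaccine reduced tumor growth in most patients'
    matched = [feature for feature, words in ALL_LEXICONS.items()
               if any(word in text.lower() for word in words)]
    # Expected to include MEDICINE (via 'vaccin', 'tumor', 'patient') and
    # INCREASE/CHANGE (via 'grow'); regex-driven features such as PERCENTAGE
    # are handled separately through REGEX_CONSTANTS.
    print(matched)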
 feature_extraction/features.py | 13 +++++-------
 feature_extraction/lexicons.py | 39 ++++++++++++++++++++++++++--------
 testing/eval_testing.py        | 11 ----------
 testing/feature_testing.py     | 26 +++++++++++++++++++++++
 4 files changed, 61 insertions(+), 28 deletions(-)
 create mode 100644 testing/feature_testing.py

diff --git a/feature_extraction/features.py b/feature_extraction/features.py
index 1f22ca9..d33d7a6 100644
--- a/feature_extraction/features.py
+++ b/feature_extraction/features.py
@@ -2,12 +2,9 @@ import feature_extraction.lexicons as lexicons
 from utils.constants import REGEX_CONSTANTS
 
 """ List of supported features for feature extraction from Input String """
-FEATURE_LIST = ['COMPARE', 'CONTRAST', 'RESULT', 'USE', 'IMPORTANT', 'RESEARCH', 'APPROACH',
-                'PUBLIC', 'BEFORE', 'BETTER_SOLUTION', 'PROFESSIONALS', 'CITATION', 'ACRONYM',
-                'CONTAINS_YEAR', 'SEQUENCE', 'REFERENCE', 'PERCENTAGE', 'URL']
-
-""" Features with Regex Pattern Matching - For these features, get the regex pattern from constants"""
-REGEX_FEATURES = ['ACRONYM', 'CONTAINS_YEAR', 'SEQUENCE', 'REFERENCE', 'PERCENTAGE', 'URL']
+FEATURE_LIST = ['COMPARE', 'CONTRAST', 'RESULT', 'INCREASE', 'CHANGE', 'USE', 'PRESENT', 'IMPORTANT', 'RESEARCH',
+                'APPROACH', 'PUBLIC', 'BEFORE', 'BETTER_SOLUTION', 'PROFESSIONALS', 'MEDICINE', 'MATH', 'CITATION',
+                'ACRONYM', 'CONTAINS_YEAR', 'SEQUENCE', 'REFERENCE', 'PERCENTAGE', 'URL']
 
 
 def extract_features_from_text(text: str):
@@ -28,7 +25,7 @@ def extract_features_from_text(text: str):
 
         # If the feature is Regex Pattern Match, get the pattern from :`~feature_extraction.lexicons.ALL_LEXICONS`
         # and match it with the input text
-        if feature in REGEX_FEATURES:
+        if feature in REGEX_CONSTANTS:
             pattern = REGEX_CONSTANTS[feature]
             if bool(pattern.match(text)):
                 text_feature_list.append(feature)
@@ -37,7 +34,7 @@ def extract_features_from_text(text: str):
         # If the feature is not a Regex Pattern Match, then get the list of dictionary words from lexicon dictionary
         word_list = lexicon_dict[feature]
         for word in word_list:
-            if word in text:
+            if word in text.lower():
                 text_feature_list.append(feature)
                 break
 
diff --git a/feature_extraction/lexicons.py b/feature_extraction/lexicons.py
index 2b05958..50038d2 100644
--- a/feature_extraction/lexicons.py
+++ b/feature_extraction/lexicons.py
@@ -3,21 +3,32 @@ Dictionary of Lexicons used for Feature Extraction
 """
 
 ALL_LEXICONS = {
-    'COMPARE': ['compar' 'compet', 'evaluat', 'test', 'superior', 'inferior', 'better', 'best', 'worse', 'worst',
-                'greater', 'larger', 'faster', 'measur'],
+    'COMPARE': ['compar', 'compet', 'evaluat', 'test', 'superior', 'inferior', 'better', 'best', 'good', 'low',
+                'worse', 'worst', 'greater', 'larger', 'faster', 'high', 'measur', 'between', 'another', 'similar'],
 
     'CONTRAST': ['contrast', 'different' 'distinct', 'conflict', 'disagree', 'oppose', 'distinguish', 'contrary'],
 
-    'RESULT': ['evidence', 'experiment', 'find', 'progress', 'observation', 'outcome', 'result'],
+    'RESULT': ['estimate', 'evidence', 'experiment', 'find', 'progress', 'observation', 'outcome', 'result', 'performance'],
+
+    'INCREASE': ['increase', 'grow', 'intensify', 'build up', 'explode'],
+
+    'CHANGE': ['adapt', 'adjust', 'augment', 'combine', 'change', 'decrease', 'elaborate', 'expand', 'expand on',
+               'extend', 'derive', 'incorporate', 'increase', 'manipulate', 'modify', 'optimize', 'optimise', 'refine',
+               'render', 'replace', 'revise', 'substitute', 'tailor', 'upgrade', 'grow'],
 
     'USE': ['use', 'using', 'apply', 'applied', 'employ', 'make use', 'utilize', 'implement'],
 
+    'PRESENT': ['describe', 'discuss', 'give', 'introduce', 'note', 'notice', 'present', 'propose', 'recapitulate',
+                'demonstrate', 'remark', 'report', 'say', 'show', 'sketch', 'state', 'suggest', 'figure'],
+
     'IMPORTANT': ['important', 'main', 'key', 'basic', 'central', 'crucial', 'critical', 'essential', 'fundamental',
-                  'great', 'largest', 'major', 'overall', 'primary', 'principle', 'serious', 'substantial', 'ultimate'],
+                  'great', 'largest', 'major', 'overall', 'primary', 'principle', 'serious', 'substantial', 'ultimate',
+                  'significant', 'remarkable', 'noteworthy', 'crucial', 'emerge'],
 
-    'RESEARCH': ['apply', 'analyze', 'characteri', 'formali', 'investigat', 'implement', 'interpret', 'examin',
-                 'observ', 'predict', 'verify', 'work on', 'empirical', 'experiment', 'exploratory', 'ongoing',
-                 'quantitative', 'qualitative', 'preliminary', 'statistical', 'underway'],
+    'RESEARCH': ['research', 'paper', 'study', 'studie', 'apply', 'analyze', 'characteri', 'formali', 'investigat',
+                 'implement', 'interpret', 'examin', 'observ', 'predict', 'verify', 'work on', 'empirical',
+                 'experiment', 'exploratory', 'ongoing', 'quantitative', 'qualitative', 'preliminary', 'statistical',
+                 'knowledge', 'underway', 'discuss', 'reference', 'publish', 'document'],
 
     'APPROACH': ['approach', 'account', 'algorithm', 'analys', 'approach', 'application', 'architecture', 'characteri',
                  'component', 'design', 'extension', 'formali', 'framework', 'implement', 'investigat', 'machine',
@@ -27,15 +38,25 @@ ALL_LEXICONS = {
     'PUBLIC': ['acknowledge', 'admit', 'agree', 'assert', 'claim', 'complain', 'declare', 'deny', 'explain', 'hint',
                'insist', 'mention', 'proclaim', 'promise', 'protest', 'remark', 'reply', 'report', 'say', 'suggest',
                'swear', 'write'],
-
+
     'BEFORE': ['earlier', 'initial', 'past', 'previous', 'prior'],
-
+
     'BETTER_SOLUTION': ['boost', 'enhance', 'defeat', 'improve', 'perform better', 'outperform', 'outweigh', 'surpass'],
 
     'PROFESSIONALS': ['colleagues', 'community', 'computer scientists', 'computational linguists', 'discourse analysts',
                       'expert', 'investigators', 'linguists', 'logicians', 'philosophers', 'psycholinguists',
                       'psychologists', 'researchers', 'scholars', 'semanticists', 'scientists'],
 
+    'MEDICINE': ['medicine', 'tissue', 'gene', 'inflammatory', 'mutant', 'neuro', 'digest', 'ortho', 'kinase', 'pneumonia',
+                 'clinical', 'therap', 'kidney', 'receptor', 'cancer', 'synthesis', 'protein', 'syndrom', 'toxin', 'death', 'calcium',
+                 'pharma', 'heart', 'disease', 'vitamin', 'tumor', 'blind', 'symptom', 'medical', 'vaccin', 'molecule',
+                 'biotic', 'patient', 'cells', 'immune', 'blood', 'plasma', 'diagnos', 'neura', 'reproductive', 'plasm', 'drug',
+                 'membrane', 'muscle', 'contagious', 'inflam', 'physician', 'dna', 'genome', 'bacteria', 'cavity', 'antibodies'],
+
+    'MATH': ['matrix', 'gaussian', 'variance', 'radius', 'function', 'comput', 'once', 'twice', 'thrice', 'diagram',
+             'vector', 'rectangle', 'logic', 'amount', 'maxim', 'minim', 'linear', 'magnitude', 'theorem', 'gradient',
+             'exponential', 'complex', 'graph', 'mean', 'equation', 'offset', 'calculat', 'coefficient', 'discrete', 'math'],
+
     'CITATION': ['et al'],  # TODO (for Isaac) :: Write a complex regex for finding Citations in the text
 
 }
diff --git a/testing/eval_testing.py b/testing/eval_testing.py
index e8e7b16..df32d34 100644
--- a/testing/eval_testing.py
+++ b/testing/eval_testing.py
@@ -1,8 +1,6 @@
 from eval.metrics import f1_score
 import utils.constants as const
 from sklearn.metrics import f1_score as f1
-import os
-from utils.csv import read_csv_file
 
 y_true = ['positive', 'positive', 'negative', 'negative', 'positive', 'positive', 'negative', 'negative', 'positive', 'positive', 'negative', 'negative', 'positive', 'positive', 'negative', 'negative']
 y_pred = ['positive', 'negative', 'negative', 'positive', 'positive', 'negative', 'negative', 'positive', 'positive', 'negative', 'negative', 'positive', 'positive', 'negative', 'negative', 'negative']
@@ -21,12 +19,3 @@ for result in result_list:
 
 
 print('SK Learn F1 Score (MACRO):: ', f1(y_true, y_pred, ['positive', 'negative'], average='macro'))
-
-project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-
-train_file_path = project_root+'/data/tsv/train.tsv'
-print(train_file_path)
-
-data = read_csv_file(csv_file_path=train_file_path, delimiter='\t')
-for inst in data[:10]:
-    inst.print()
diff --git a/testing/feature_testing.py b/testing/feature_testing.py
new file mode 100644
index 0000000..b2a2364
--- /dev/null
+++ b/testing/feature_testing.py
@@ -0,0 +1,26 @@
+import os
+from utils.csv import read_csv_file
+
+project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+
+train_file_path = project_root+'/data/tsv/train.tsv'
+print(train_file_path)
+
+data = read_csv_file(csv_file_path=train_file_path, delimiter='\t')
+
+i = 0
+for inst in data:
+    if len(inst.features) <= 0:
+        inst.print()
+        i += 1
+print('Data Points without Features :: ', i)
+
+# tokens = inst.text.split()
+# for token in tokens:
+#     if token not in feature_dict:
+#         feature_dict[token] = 1
+#         continue
+#     feature_dict[token] += 1
+#
+# for key in sorted(feature_dict, key=feature_dict.get, reverse=True):
+#     print(key, ' -> ', feature_dict.get(key))
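
Illustrative usage (appended after the patch, not part of the commit): one way
to exercise the updated extractor against the new MEDICINE and MATH lexicons,
assuming the project root is on PYTHONPATH. The sentences are invented, and the
regex-driven features additionally depend on the patterns in REGEX_CONSTANTS.

    from feature_extraction.features import extract_features_from_text

    for sentence in ['The mutation was observed in kidney tissue of most patients',
                     'We compute the gradient of a linear function']:
        print(sentence, '->', extract_features_from_text(sentence))
    # The first sentence should pick up MEDICINE (e.g. 'kidney', 'tissue',
    # 'patient'); the second should pick up MATH (e.g. 'comput', 'gradient').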