From cc77b3a7550cb0bbd90d41c5d94a06319f174034 Mon Sep 17 00:00:00 2001
From: Pavan Mandava
Date: Mon, 11 May 2020 00:28:07 +0200
Subject: [PATCH] Feature Extraction with LEXICONS, Need to add more Lexicons
 and improve feature representation

---
 feature_extraction/__init__.py |  0
 feature_extraction/features.py | 38 ++++++++++++++++++++++++++++++++++
 feature_extraction/lexicons.py | 34 ++++++++++++++++++++++++++++++
 utils/constants.py             |  6 +++++-
 utils/models.py                |  6 +++++-
 5 files changed, 82 insertions(+), 2 deletions(-)
 create mode 100644 feature_extraction/__init__.py
 create mode 100644 feature_extraction/features.py
 create mode 100644 feature_extraction/lexicons.py

diff --git a/feature_extraction/__init__.py b/feature_extraction/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/feature_extraction/features.py b/feature_extraction/features.py
new file mode 100644
index 0000000..3710817
--- /dev/null
+++ b/feature_extraction/features.py
@@ -0,0 +1,38 @@
+from utils.constants import REGEX_CONSTANTS
+import feature_extraction.lexicons as lexicons
+import re
+
+# Features that can be extracted from an input string, in the order they are checked and reported.
+FEATURE_LIST = ['COMPARE', 'CONTRAST', 'RESULT', 'USE', 'IMPORTANT', 'RESEARCH', 'APPROACH',
+                'PUBLIC', 'BEFORE', 'BETTER_SOLUTION', 'CITATION', 'ACRONYM']
+
+# Features detected with a pattern from REGEX_CONSTANTS instead of a lexicon word list.
+REGEX_FEATURES = ['ACRONYM']
+
+
+def extract_features_from_text(text: str):
+    """
+    Extract the features from FEATURE_LIST that are present in the given text.
+
+    Lexicon features match when any of their cue words/stems from
+    ``feature_extraction.lexicons.ALL_LEXICONS`` occurs as a substring of the lowercased text;
+    regex features match when their pattern from REGEX_CONSTANTS is found anywhere in the original text.
+
+    :param text: input text to extract features from
+    :return: list of matched feature names in FEATURE_LIST order, empty list when nothing matches
+    """
+    # Lexicon entries are all lowercase, so compare them against a lowercased copy of the text.
+    lowered_text = text.lower()
+
+    text_feature_list = []
+    for feature in FEATURE_LIST:
+        if feature in REGEX_FEATURES:
+            # re.search scans the whole text; re.match would only find a match at its very start.
+            if re.search(REGEX_CONSTANTS[feature], text):
+                text_feature_list.append(feature)
+            continue
+        # One matching lexicon entry is enough, so each feature is reported at most once.
+        if any(word in lowered_text for word in lexicons.ALL_LEXICONS[feature]):
+            text_feature_list.append(feature)
+
+    return text_feature_list
diff --git a/feature_extraction/lexicons.py b/feature_extraction/lexicons.py
new file mode 100644
index 0000000..6105b94
--- /dev/null
+++ b/feature_extraction/lexicons.py
@@ -0,0 +1,34 @@
+# Feature name -> lowercase cue words/stems; a feature matches when any entry is a substring of the text.
+ALL_LEXICONS = {
+    'COMPARE': ['compar', 'compet', 'evaluat', 'test', 'superior', 'inferior', 'better', 'best', 'worse', 'worst',
+                'greater', 'larger', 'faster', 'measur'],
+
+    'CONTRAST': ['contrast', 'different', 'distinct', 'conflict', 'disagree', 'oppose', 'distinguish', 'contrary'],
+
+    'RESULT': ['evidence', 'experiment', 'find', 'progress', 'observation', 'outcome', 'result'],
+
+    'USE': ['use', 'using', 'apply', 'applied', 'employ', 'make use', 'utilize', 'implement'],
+
+    'IMPORTANT': ['important', 'main', 'key', 'basic', 'central', 'crucial', 'critical', 'essential', 'fundamental',
+                  'great', 'largest', 'major', 'overall', 'primary', 'principle', 'serious', 'substantial', 'ultimate'],
+
+    'RESEARCH': ['apply', 'analyze', 'characteri', 'formali', 'investigat', 'implement', 'interpret', 'examin',
+                 'observ', 'predict', 'verify', 'work on', 'empirical', 'experiment', 'exploratory', 'ongoing',
+                 'quantitative', 'qualitative', 'preliminary', 'statistical', 'underway'],
+
+    'APPROACH': ['approach', 'account', 'algorithm', 'analys', 'application', 'architecture', 'characteri',
+                 'component', 'design', 'extension', 'formali', 'framework', 'implement', 'investigat', 'machine',
+                 'method', 'methodology', 'module', 'process', 'procedure', 'program', 'prototype', 'strategy',
+                 'system', 'technique', 'theory', 'tool', 'treatment'],
+
+    'PUBLIC': ['acknowledge', 'admit', 'agree', 'assert', 'claim', 'complain', 'declare', 'deny', 'explain',
+               'hint', 'insist', 'mention', 'proclaim', 'promise', 'protest', 'remark', 'reply', 'report', 'say',
+               'suggest', 'swear', 'write'],
+
+    'BEFORE': ['earlier', 'initial', 'past', 'previous', 'prior'],
+
+    'BETTER_SOLUTION': ['boost', 'enhance', 'defeat', 'improve', 'perform better', 'outperform', 'outweigh', 'surpass'],
+
+    'CITATION': ['et al'],  # TODO (for Isaac) :: Write a complex regex for finding Citations in the text
+
+}
diff --git a/utils/constants.py b/utils/constants.py
index 7941faf..64bee15 100644
--- a/utils/constants.py
+++ b/utils/constants.py
@@ -1,2 +1,6 @@
 AVG_MICRO = 'MICRO'
-AVG_MACRO = 'MACRO'
\ No newline at end of file
+AVG_MACRO = 'MACRO'
+
+REGEX_CONSTANTS = {
+    'ACRONYM': '\\b[A-Z\\.]{2,}s?\\b'
+}
diff --git a/utils/models.py b/utils/models.py
index f2a6753..dfea3a0 100644
--- a/utils/models.py
+++ b/utils/models.py
@@ -1,3 +1,5 @@
+from feature_extraction.features import extract_features_from_text
+
 
 class DataInstance:
     """
@@ -8,6 +10,8 @@ class DataInstance:
         self.did = r_id
         self.text = text
         self.true_label = true_label
+        self.features = extract_features_from_text(text)
 
     def print(self):
-        print('True Label :: ', self.true_label, ' Text :: ', self.text)
+        print('\nTrue Label :: ', self.true_label, ' Text :: ', self.text)
+        print('Features :: ', self.features)