Feature extraction with lexicons; still need to add more lexicons and improve the feature representation

isaac
Pavan Mandava 6 years ago
parent 3455c34601
commit cc77b3a755

@ -0,0 +1,38 @@
from utils.constants import REGEX_CONSTANTS
import feature_extraction.lexicons as lexicons
import re
""" List of supported features for feature extraction from Input String """
# Order matters: extracted features are reported in this order.
FEATURE_LIST = [
    'COMPARE',
    'CONTRAST',
    'RESULT',
    'USE',
    'IMPORTANT',
    'RESEARCH',
    'APPROACH',
    'PUBLIC',
    'BEFORE',
    'BETTER_SOLUTION',
    'CITATION',
    'ACRONYM',
]

# Features detected with a regular expression from REGEX_CONSTANTS
# instead of a lexicon word list.
REGEX_FEATURES = ['ACRONYM']
def extract_features_from_text(text: str) -> list:
    """
    Extract the features present in ``text``.

    Every feature in ``FEATURE_LIST`` is checked in order. A feature listed in
    ``REGEX_FEATURES`` is detected by searching the text with the pattern stored
    under the same name in ``REGEX_CONSTANTS``. Every other feature is detected
    when any word of its entry in
    :data:`~feature_extraction.lexicons.ALL_LEXICONS` occurs as a substring of
    the text (compared case-insensitively).

    :param text: the input text to inspect
    :return: list of feature names found, in ``FEATURE_LIST`` order and each at
        most once; an empty list when no feature is found
    :raises KeyError: if a non-regex feature has no entry in ``ALL_LEXICONS``
    """
    lexicon_dict = lexicons.ALL_LEXICONS

    # Lexicon entries are lower-case, so they are compared against a lower-cased
    # copy of the text. The regex features keep the original casing because the
    # ACRONYM pattern relies on upper-case letters.
    lowered_text = text.lower()

    text_feature_list = []
    for feature in FEATURE_LIST:
        if feature in REGEX_FEATURES:
            # re.search, not re.match: the pattern may occur anywhere in the
            # text, while re.match only tests the very beginning of the string.
            if re.search(REGEX_CONSTANTS[feature], text):
                text_feature_list.append(feature)
            continue

        # One hit is enough: a feature is reported at most once.
        if any(word in lowered_text for word in lexicon_dict[feature]):
            text_feature_list.append(feature)

    return text_feature_list

@ -0,0 +1,34 @@
# Maps each lexicon-based feature name to the words that signal it.
# Entries are lower-case words or word stems (e.g. 'compar', 'evaluat').
ALL_LEXICONS = {
    'COMPARE': ['compar', 'compet', 'evaluat', 'test', 'superior', 'inferior', 'better', 'best', 'worse', 'worst',
                'greater', 'larger', 'faster', 'measur'],
    'CONTRAST': ['contrast', 'different', 'distinct', 'conflict', 'disagree', 'oppose', 'distinguish', 'contrary'],
    'RESULT': ['evidence', 'experiment', 'find', 'progress', 'observation', 'outcome', 'result'],
    'USE': ['use', 'using', 'apply', 'applied', 'employ', 'make use', 'utilize', 'implement'],
    'IMPORTANT': ['important', 'main', 'key', 'basic', 'central', 'crucial', 'critical', 'essential', 'fundamental',
                  'great', 'largest', 'major', 'overall', 'primary', 'principle', 'serious', 'substantial',
                  'ultimate'],
    'RESEARCH': ['apply', 'analyze', 'characteri', 'formali', 'investigat', 'implement', 'interpret', 'examin',
                 'observ', 'predict', 'verify', 'work on', 'empirical', 'experiment', 'exploratory', 'ongoing',
                 'quantitative', 'qualitative', 'preliminary', 'statistical', 'underway'],
    'APPROACH': ['approach', 'account', 'algorithm', 'analys', 'application', 'architecture', 'characteri',
                 'component', 'design', 'extension', 'formali', 'framework', 'implement', 'investigat', 'machine',
                 'method', 'methodology', 'module', 'process', 'procedure', 'program', 'prototype', 'strategy',
                 'system', 'technique', 'theory', 'tool', 'treatment'],
    'PUBLIC': ['acknowledge', 'admit', 'agree', 'assert', 'claim', 'complain', 'declare', 'deny', 'explain',
               'hint', 'insist', 'mention', 'proclaim', 'promise', 'protest', 'remark', 'reply', 'report', 'say',
               'suggest', 'swear', 'write'],
    'BEFORE': ['earlier', 'initial', 'past', 'previous', 'prior'],
    'BETTER_SOLUTION': ['boost', 'enhance', 'defeat', 'improve', 'perform better', 'outperform', 'outweigh',
                        'surpass'],
    'CITATION': ['et al'],  # TODO (for Isaac) :: Write a complex regex for finding Citations in the text
}

@ -1,2 +1,6 @@
# NOTE(review): presumably selectors for micro/macro averaging; confirm against callers.
AVG_MICRO = 'MICRO'
AVG_MACRO = 'MACRO'

# Regular-expression patterns, keyed by the feature name they detect.
REGEX_CONSTANTS = {
    # A run of two or more upper-case letters and/or dots, with an optional
    # trailing lower-case "s", delimited by word boundaries.
    'ACRONYM': r'\b[A-Z\.]{2,}s?\b',
}

@ -1,3 +1,5 @@
from feature_extraction.features import extract_features_from_text
class DataInstance:
"""
@ -8,6 +10,8 @@ class DataInstance:
self.did = r_id
self.text = text
self.true_label = true_label
self.features = extract_features_from_text(text)
def print(self):
    """Print the instance's true label, text and extracted features to stdout."""
    # The leading newline emits a blank line before each record.
    print('\nTrue Label :: ', self.true_label, ' Text :: ', self.text)
    print('Features :: ', self.features)

Loading…
Cancel
Save