isaac
parent
3455c34601
commit
cc77b3a755
@ -0,0 +1,38 @@
|
||||
from utils.constants import REGEX_CONSTANTS
|
||||
import feature_extraction.lexicons as lexicons
|
||||
import re
|
||||
|
||||
""" List of supported features for feature extraction from Input String """
|
||||
# Names of every feature the extractor can report, in the order they are checked.
FEATURE_LIST = [
    'COMPARE',
    'CONTRAST',
    'RESULT',
    'USE',
    'IMPORTANT',
    'RESEARCH',
    'APPROACH',
    'PUBLIC',
    'BEFORE',
    'BETTER_SOLUTION',
    'CITATION',
    'ACRONYM',
]

# Subset of FEATURE_LIST that is detected with a regular expression rather than a word list.
REGEX_FEATURES = [
    'ACRONYM',
]
|
||||
|
||||
|
||||
def extract_features_from_text(text: str) -> list:
    """
    Extract the features that are present in the input text.

    Features listed in ``REGEX_FEATURES`` are detected by searching the text with the
    pattern stored under the same name in ``REGEX_CONSTANTS``. Every other feature in
    ``FEATURE_LIST`` is detected when at least one of its entries in
    :data:`~feature_extraction.lexicons.ALL_LEXICONS` occurs as a substring of the text.

    :param text: input text to extract features from
    :return: list of feature names found in the text, in ``FEATURE_LIST`` order;
        empty list when no feature is found
    """
    lexicon_dict = lexicons.ALL_LEXICONS
    text_feature_list = []

    for feature in FEATURE_LIST:
        if feature in REGEX_FEATURES:
            # re.search scans the whole text; re.match would only succeed when the
            # pattern matches at the very first character of the text.
            if re.search(REGEX_CONSTANTS[feature], text):
                text_feature_list.append(feature)
            # Regex features have no lexicon entry, so never fall through to the lookup below.
            continue

        # Plain, case-sensitive substring test: one hit is enough to record the feature.
        if any(word in text for word in lexicon_dict[feature]):
            text_feature_list.append(feature)

    return text_feature_list
|
||||
@ -0,0 +1,34 @@
|
||||
# Maps each lexicon-based feature name to the words / word stems that signal it.
# Entries are lowercase and several are deliberately truncated stems (e.g. 'compar', 'evaluat').
ALL_LEXICONS = {

    'COMPARE': ['compar', 'compet', 'evaluat', 'test', 'superior', 'inferior', 'better', 'best', 'worse', 'worst',
                'greater', 'larger', 'faster', 'measur'],

    'CONTRAST': ['contrast', 'different', 'distinct', 'conflict', 'disagree', 'oppose', 'distinguish', 'contrary'],

    'RESULT': ['evidence', 'experiment', 'find', 'progress', 'observation', 'outcome', 'result'],

    'USE': ['use', 'using', 'apply', 'applied', 'employ', 'make use', 'utilize', 'implement'],

    'IMPORTANT': ['important', 'main', 'key', 'basic', 'central', 'crucial', 'critical', 'essential', 'fundamental',
                  'great', 'largest', 'major', 'overall', 'primary', 'principle', 'serious', 'substantial', 'ultimate'],

    'RESEARCH': ['apply', 'analyze', 'characteri', 'formali', 'investigat', 'implement', 'interpret', 'examin',
                 'observ', 'predict', 'verify', 'work on', 'empirical', 'experiment', 'exploratory', 'ongoing',
                 'quantitative', 'qualitative', 'preliminary', 'statistical', 'underway'],

    'APPROACH': ['approach', 'account', 'algorithm', 'analys', 'application', 'architecture', 'characteri',
                 'component', 'design', 'extension', 'formali', 'framework', 'implement', 'investigat', 'machine',
                 'method', 'methodology', 'module', 'process', 'procedure', 'program', 'prototype', 'strategy',
                 'system', 'technique', 'theory', 'tool', 'treatment'],

    'PUBLIC': ['acknowledge', 'admit', 'agree', 'assert', 'claim', 'complain', 'declare', 'deny', 'explain',
               'hint', 'insist', 'mention', 'proclaim', 'promise', 'protest', 'remark', 'reply', 'report', 'say',
               'suggest', 'swear', 'write'],

    'BEFORE': ['earlier', 'initial', 'past', 'previous', 'prior'],

    'BETTER_SOLUTION': ['boost', 'enhance', 'defeat', 'improve', 'perform better', 'outperform', 'outweigh', 'surpass'],

    'CITATION': ['et al'],  # TODO (for Isaac) :: Write a complex regex for finding Citations in the text

}
|
||||
@ -1,2 +1,6 @@
|
||||
# Identifiers for the two averaging modes.
AVG_MICRO = 'MICRO'
AVG_MACRO = 'MACRO'

# Regular expressions keyed by feature name.
REGEX_CONSTANTS = {
    # Two or more capital letters / dots, optionally followed by a lowercase 's',
    # delimited by word boundaries. Raw string: same pattern value, no doubled backslashes.
    'ACRONYM': r'\b[A-Z\.]{2,}s?\b',
}
|
||||
|
||||
Loading…
Reference in new issue