isaac
parent
3455c34601
commit
cc77b3a755
@ -0,0 +1,38 @@
|
|||||||
|
from utils.constants import REGEX_CONSTANTS
|
||||||
|
import feature_extraction.lexicons as lexicons
|
||||||
|
import re
|
||||||
|
|
||||||
|
""" List of supported features for feature extraction from Input String """
|
||||||
|
# Features are checked (and therefore reported) in the order listed here.
FEATURE_LIST = [
    'COMPARE',
    'CONTRAST',
    'RESULT',
    'USE',
    'IMPORTANT',
    'RESEARCH',
    'APPROACH',
    'PUBLIC',
    'BEFORE',
    'BETTER_SOLUTION',
    'CITATION',
    'ACRONYM',
]

# Features detected with a regular expression rather than a word lexicon.
REGEX_FEATURES = ['ACRONYM']
|
||||||
|
|
||||||
|
|
||||||
|
def extract_features_from_text(text: str) -> list:
    """
    Extract the supported features that are present in ``text``.

    Every feature in ``FEATURE_LIST`` is checked in order. A feature listed in
    ``REGEX_FEATURES`` is detected with the pattern stored under the same name
    in ``REGEX_CONSTANTS``; any other feature is detected when at least one
    entry of its word list in :data:`~feature_extraction.lexicons.ALL_LEXICONS`
    occurs as a (case-sensitive) substring of ``text``.

    :param text: input string to scan
    :return: list of the feature names found, in ``FEATURE_LIST`` order and
        each at most once; an empty list when no feature is found
    """
    lexicon_dict = lexicons.ALL_LEXICONS

    text_feature_list = []
    for feature in FEATURE_LIST:
        if feature in REGEX_FEATURES:
            # re.search, not re.match: the pattern may occur anywhere in the
            # text, whereas re.match only succeeds at the very first character.
            if re.search(REGEX_CONSTANTS[feature], text):
                text_feature_list.append(feature)
            continue

        # One hit is enough; any() stops at the first lexicon entry found.
        if any(word in text for word in lexicon_dict[feature]):
            text_feature_list.append(feature)

    return text_feature_list
|
||||||
@ -0,0 +1,34 @@
|
|||||||
|
# Maps each lexicon-based feature name to the words/stems that signal it.
# Entries are looked up with a plain substring test, so a stem such as
# 'compar' also covers 'compare', 'comparison', 'comparing', ...
# Every entry must be followed by a comma: two adjacent string literals are
# silently concatenated by Python into a single (useless) entry.
ALL_LEXICONS = {

    'COMPARE': ['compar', 'compet', 'evaluat', 'test', 'superior', 'inferior', 'better', 'best', 'worse', 'worst',
                'greater', 'larger', 'faster', 'measur'],

    'CONTRAST': ['contrast', 'different', 'distinct', 'conflict', 'disagree', 'oppose', 'distinguish', 'contrary'],

    'RESULT': ['evidence', 'experiment', 'find', 'progress', 'observation', 'outcome', 'result'],

    'USE': ['use', 'using', 'apply', 'applied', 'employ', 'make use', 'utilize', 'implement'],

    'IMPORTANT': ['important', 'main', 'key', 'basic', 'central', 'crucial', 'critical', 'essential', 'fundamental',
                  'great', 'largest', 'major', 'overall', 'primary', 'principle', 'serious', 'substantial',
                  'ultimate'],

    'RESEARCH': ['apply', 'analyze', 'characteri', 'formali', 'investigat', 'implement', 'interpret', 'examin',
                 'observ', 'predict', 'verify', 'work on', 'empirical', 'experiment', 'exploratory', 'ongoing',
                 'quantitative', 'qualitative', 'preliminary', 'statistical', 'underway'],

    'APPROACH': ['approach', 'account', 'algorithm', 'analys', 'application', 'architecture', 'characteri',
                 'component', 'design', 'extension', 'formali', 'framework', 'implement', 'investigat', 'machine',
                 'method', 'methodology', 'module', 'process', 'procedure', 'program', 'prototype', 'strategy',
                 'system', 'technique', 'theory', 'tool', 'treatment'],

    'PUBLIC': ['acknowledge', 'admit', 'agree', 'assert', 'claim', 'complain', 'declare', 'deny', 'explain',
               'hint', 'insist', 'mention', 'proclaim', 'promise', 'protest', 'remark', 'reply', 'report', 'say',
               'suggest', 'swear', 'write'],

    'BEFORE': ['earlier', 'initial', 'past', 'previous', 'prior'],

    'BETTER_SOLUTION': ['boost', 'enhance', 'defeat', 'improve', 'perform better', 'outperform', 'outweigh',
                        'surpass'],

    'CITATION': ['et al'],  # TODO (for Isaac) :: Write a complex regex for finding Citations in the text

}
|
||||||
@ -1,2 +1,6 @@
|
|||||||
# NOTE(review): presumably identifiers for micro/macro averaging modes - confirm against callers.
AVG_MICRO = 'MICRO'
AVG_MACRO = 'MACRO'

# Regular-expression patterns, keyed by the name of the feature they detect.
REGEX_CONSTANTS = {
    # Two or more capital letters and/or dots, optionally followed by a
    # lowercase "s", delimited by word boundaries.
    'ACRONYM': '\\b[A-Z\\.]{2,}s?\\b',
}
|
||||||
|
|||||||
Loading…
Reference in new issue