Added some more Lexicons and Regex for Feature Extraction

isaac
Pavan Mandava 6 years ago
parent cc77b3a755
commit 7cd79a4b21

@ -4,9 +4,10 @@ import re
""" List of supported features for feature extraction from Input String """ """ List of supported features for feature extraction from Input String """
FEATURE_LIST = ['COMPARE', 'CONTRAST', 'RESULT', 'USE', 'IMPORTANT', 'RESEARCH', 'APPROACH', FEATURE_LIST = ['COMPARE', 'CONTRAST', 'RESULT', 'USE', 'IMPORTANT', 'RESEARCH', 'APPROACH',
'PUBLIC', 'BEFORE', 'BETTER_SOLUTION', 'CITATION', 'ACRONYM'] 'PUBLIC', 'BEFORE', 'BETTER_SOLUTION', 'PROFESSIONALS', 'CITATION', 'ACRONYM',
'CONTAINS_YEAR', 'SEQUENCE', 'REFERENCE']
REGEX_FEATURES = ['ACRONYM'] REGEX_FEATURES = ['ACRONYM', 'CONTAINS_YEAR', 'SEQUENCE', 'REFERENCE']
def extract_features_from_text(text: str): def extract_features_from_text(text: str):
@ -25,8 +26,8 @@ def extract_features_from_text(text: str):
# for each word in the word list, check if it appears in input text and add it to the text feature list # for each word in the word list, check if it appears in input text and add it to the text feature list
for feature in FEATURE_LIST: for feature in FEATURE_LIST:
if feature in REGEX_FEATURES: if feature in REGEX_FEATURES:
regex = REGEX_CONSTANTS[feature] pattern = REGEX_CONSTANTS[feature]
if bool(re.match(regex, text)): if bool(pattern.match(text)):
text_feature_list.append(feature) text_feature_list.append(feature)
continue continue
word_list = lexicon_dict[feature] word_list = lexicon_dict[feature]

@ -29,6 +29,10 @@ ALL_LEXICONS = {
'BETTER_SOLUTION': ['boost', 'enhance', 'defeat', 'improve', 'perform better', 'outperform', 'outweigh', 'surpass'], 'BETTER_SOLUTION': ['boost', 'enhance', 'defeat', 'improve', 'perform better', 'outperform', 'outweigh', 'surpass'],
'PROFESSIONALS': ['colleagues', 'community', 'computer scientists', 'computational linguists', 'discourse analysts',
'expert', 'investigators', 'linguists', 'logicians', 'philosophers', 'psycholinguists',
'psychologists', 'researchers', 'scholars', 'semanticists', 'scientists'],
'CITATION': ['et al'], # TODO (for Isaac) :: Write a complex regex for finding Citations in the text 'CITATION': ['et al'], # TODO (for Isaac) :: Write a complex regex for finding Citations in the text
} }

@ -28,5 +28,6 @@ train_file_path = project_root+'/data/tsv/train.tsv'
print(train_file_path) print(train_file_path)
data = read_csv_file(csv_file_path=train_file_path, delimiter='\t') data = read_csv_file(csv_file_path=train_file_path, delimiter='\t')
for inst in data[:5]: for inst in data:
if len(inst.features) <= 0:
inst.print() inst.print()

@ -1,6 +1,12 @@
import re
AVG_MICRO = 'MICRO' AVG_MICRO = 'MICRO'
AVG_MACRO = 'MACRO' AVG_MACRO = 'MACRO'
REGEX_CONSTANTS = { REGEX_CONSTANTS = {
'ACRONYM': '\\b[A-Z\\.]{2,}s?\\b' 'ACRONYM': re.compile(r"\s*\b[A-Z.]{2,}s?\b\s*"),
'CONTAINS_YEAR': re.compile('.*([1-2][0-9]{3})'),
'SEQUENCE': re.compile(r'\s+\((\d+,* *)*\)\s+'),
'REFERENCE': re.compile(r"\s*\[(\d+,* *)*\]\s*")
} }

Loading…
Cancel
Save