Added some more Lexicons and Regex for Feature Extraction

isaac
Pavan Mandava 6 years ago
parent cc77b3a755
commit 7cd79a4b21

@ -4,9 +4,10 @@ import re
""" List of supported features for feature extraction from Input String """
# Every feature name the extractor can emit. The extraction loop shown in this same commit walks
# this list: a name that is also in REGEX_FEATURES is resolved through REGEX_CONSTANTS, any other
# name is resolved through its lexicon word list.
# NOTE(review): this page is a rendered diff without +/- markers, so the pre-change and post-change
# lines of each assignment appear back to back below — confirm against the repository before copying.
FEATURE_LIST = ['COMPARE', 'CONTRAST', 'RESULT', 'USE', 'IMPORTANT', 'RESEARCH', 'APPROACH',
'PUBLIC', 'BEFORE', 'BETTER_SOLUTION', 'CITATION', 'ACRONYM']
'PUBLIC', 'BEFORE', 'BETTER_SOLUTION', 'PROFESSIONALS', 'CITATION', 'ACRONYM',
'CONTAINS_YEAR', 'SEQUENCE', 'REFERENCE']
# Subset of FEATURE_LIST that is detected with a regex from REGEX_CONSTANTS rather than a lexicon lookup.
REGEX_FEATURES = ['ACRONYM']
REGEX_FEATURES = ['ACRONYM', 'CONTAINS_YEAR', 'SEQUENCE', 'REFERENCE']
def extract_features_from_text(text: str):
@ -25,8 +26,8 @@ def extract_features_from_text(text: str):
# for each word in the word list, check if it appears in input text and add it to the text feature list
for feature in FEATURE_LIST:
if feature in REGEX_FEATURES:
regex = REGEX_CONSTANTS[feature]
if bool(re.match(regex, text)):
pattern = REGEX_CONSTANTS[feature]
if bool(pattern.match(text)):
text_feature_list.append(feature)
continue
word_list = lexicon_dict[feature]

@ -29,6 +29,10 @@ ALL_LEXICONS = {
'BETTER_SOLUTION': ['boost', 'enhance', 'defeat', 'improve', 'perform better', 'outperform', 'outweigh', 'surpass'],
'PROFESSIONALS': ['colleagues', 'community', 'computer scientists', 'computational linguists', 'discourse analysts',
'expert', 'investigators', 'linguists', 'logicians', 'philosophers', 'psycholinguists',
'psychologists', 'researchers', 'scholars', 'semanticists', 'scientists'],
'CITATION': ['et al'], # TODO (for Isaac) :: Write a complex regex for finding Citations in the text
}

@ -28,5 +28,6 @@ train_file_path = project_root+'/data/tsv/train.tsv'
print(train_file_path)
data = read_csv_file(csv_file_path=train_file_path, delimiter='\t')
for inst in data[:5]:
inst.print()
for inst in data:
if len(inst.features) <= 0:
inst.print()

@ -1,6 +1,12 @@
import re
AVG_MICRO = 'MICRO'
AVG_MACRO = 'MACRO'
# Maps a regex-based feature name to the pattern used to detect it in the input text.
# NOTE(review): this page is a rendered diff without +/- markers; the first 'ACRONYM' entry (plain
# string) appears to be the removed line and the re.compile(...) entries the added ones — confirm.
# NOTE(review): the extraction loop in this commit calls `pattern.match(text)`, which only matches at
# the START of the text. Only CONTAINS_YEAR carries a leading `.*` to compensate; the other patterns
# would miss occurrences later in the text unless `search` is used — verify intended behaviour.
REGEX_CONSTANTS = {
'ACRONYM': '\\b[A-Z\\.]{2,}s?\\b'
# Two or more characters drawn from uppercase letters and dots, with an optional trailing "s",
# bounded by word boundaries and optionally surrounded by whitespace.
'ACRONYM': re.compile(r"\s*\b[A-Z.]{2,}s?\b\s*"),
# Any prefix, then a four-digit number whose first digit is 1 or 2 (1000-2999).
'CONTAINS_YEAR': re.compile('.*([1-2][0-9]{3})'),
# A parenthesised list of digit runs, each optionally followed by commas/spaces; requires
# whitespace on both sides. The group may repeat zero times, so "()" also matches.
'SEQUENCE': re.compile(r'\s+\((\d+,* *)*\)\s+'),
# Same list shape in square brackets; surrounding whitespace is optional. "[]" also matches.
'REFERENCE': re.compile(r"\s*\[(\d+,* *)*\]\s*")
}

Loading…
Cancel
Save