diff --git a/feature_extraction/features.py b/feature_extraction/features.py index 3710817..99f8767 100644 --- a/feature_extraction/features.py +++ b/feature_extraction/features.py @@ -4,9 +4,10 @@ import re """ List of supported features for feature extraction from Input String """ FEATURE_LIST = ['COMPARE', 'CONTRAST', 'RESULT', 'USE', 'IMPORTANT', 'RESEARCH', 'APPROACH', - 'PUBLIC', 'BEFORE', 'BETTER_SOLUTION', 'CITATION', 'ACRONYM'] + 'PUBLIC', 'BEFORE', 'BETTER_SOLUTION', 'PROFESSIONALS', 'CITATION', 'ACRONYM', + 'CONTAINS_YEAR', 'SEQUENCE', 'REFERENCE'] -REGEX_FEATURES = ['ACRONYM'] +REGEX_FEATURES = ['ACRONYM', 'CONTAINS_YEAR', 'SEQUENCE', 'REFERENCE'] def extract_features_from_text(text: str): @@ -25,8 +26,8 @@ def extract_features_from_text(text: str): # for each word in the word list, check if it appears in input text and add it to the text feature list for feature in FEATURE_LIST: if feature in REGEX_FEATURES: - regex = REGEX_CONSTANTS[feature] - if bool(re.match(regex, text)): + pattern = REGEX_CONSTANTS[feature] + if bool(pattern.match(text)): text_feature_list.append(feature) continue word_list = lexicon_dict[feature] diff --git a/feature_extraction/lexicons.py b/feature_extraction/lexicons.py index 6105b94..5239d3e 100644 --- a/feature_extraction/lexicons.py +++ b/feature_extraction/lexicons.py @@ -29,6 +29,10 @@ ALL_LEXICONS = { 'BETTER_SOLUTION': ['boost', 'enhance', 'defeat', 'improve', 'perform better', 'outperform', 'outweigh', 'surpass'], + 'PROFESSIONALS': ['colleagues', 'community', 'computer scientists', 'computational linguists', 'discourse analysts', + 'expert', 'investigators', 'linguists', 'logicians', 'philosophers', 'psycholinguists', + 'psychologists', 'researchers', 'scholars', 'semanticists', 'scientists'], + 'CITATION': ['et al'], # TODO (for Isaac) :: Write a complex regex for finding Citations in the text } diff --git a/testing/eval_testing.py b/testing/eval_testing.py index b4353ef..9599f81 100644 --- a/testing/eval_testing.py +++ b/testing/eval_testing.py @@ -28,5 +28,6 @@ train_file_path = project_root+'/data/tsv/train.tsv' print(train_file_path) data = read_csv_file(csv_file_path=train_file_path, delimiter='\t') -for inst in data[:5]: - inst.print() +for inst in data: + if len(inst.features) <= 0: + inst.print() diff --git a/utils/constants.py b/utils/constants.py index 64bee15..1f25cfb 100644 --- a/utils/constants.py +++ b/utils/constants.py @@ -1,6 +1,12 @@ +import re + + AVG_MICRO = 'MICRO' AVG_MACRO = 'MACRO' REGEX_CONSTANTS = { - 'ACRONYM': '\\b[A-Z\\.]{2,}s?\\b' + 'ACRONYM': re.compile(r"\s*\b[A-Z.]{2,}s?\b\s*"), + 'CONTAINS_YEAR': re.compile('.*([1-2][0-9]{3})'), + 'SEQUENCE': re.compile(r'\s+\((\d+,* *)*\)\s+'), + 'REFERENCE': re.compile(r"\s*\[(\d+,* *)*\]\s*") }