Added some more Lexicons and Regex for Feature Extraction

6 years ago · 7cd79a4b21
parent cc77b3a755
commit 7cd79a4b21
4 changed files with 19 additions and 7 deletions
--- a/feature_extraction/features.py
+++ b/feature_extraction/features.py
@@ -4,9 +4,10 @@ import re

 """ List of supported features for feature extraction from Input String """
 FEATURE_LIST = ['COMPARE', 'CONTRAST', 'RESULT', 'USE', 'IMPORTANT', 'RESEARCH', 'APPROACH',
-                'PUBLIC', 'BEFORE', 'BETTER_SOLUTION', 'CITATION', 'ACRONYM']
+                'PUBLIC', 'BEFORE', 'BETTER_SOLUTION', 'PROFESSIONALS', 'CITATION', 'ACRONYM',
+                'CONTAINS_YEAR', 'SEQUENCE', 'REFERENCE']

-REGEX_FEATURES = ['ACRONYM']
+REGEX_FEATURES = ['ACRONYM', 'CONTAINS_YEAR', 'SEQUENCE', 'REFERENCE']


 def extract_features_from_text(text: str):
@@ -25,8 +26,8 @@ def extract_features_from_text(text: str):
    # for each word in the word list, check if it appears in input text and add it to the text feature list
    for feature in FEATURE_LIST:
        if feature in REGEX_FEATURES:
-            regex = REGEX_CONSTANTS[feature]
-            if bool(re.match(regex, text)):
+            pattern = REGEX_CONSTANTS[feature]
+            if bool(pattern.match(text)):
                text_feature_list.append(feature)
            continue
        word_list = lexicon_dict[feature]
--- a/feature_extraction/lexicons.py
+++ b/feature_extraction/lexicons.py
@@ -29,6 +29,10 @@ ALL_LEXICONS = {
    
    'BETTER_SOLUTION': ['boost', 'enhance', 'defeat', 'improve', 'perform better', 'outperform', 'outweigh', 'surpass'],

+    'PROFESSIONALS': ['colleagues', 'community', 'computer scientists', 'computational linguists', 'discourse analysts',
+                      'expert', 'investigators', 'linguists', 'logicians', 'philosophers', 'psycholinguists',
+                      'psychologists', 'researchers', 'scholars', 'semanticists', 'scientists'],
+
    'CITATION': ['et al'],  # TODO (for Isaac) :: Write a complex regex for finding Citations in the text

 }
--- a/testing/eval_testing.py
+++ b/testing/eval_testing.py
@@ -28,5 +28,6 @@ train_file_path = project_root+'/data/tsv/train.tsv'
 print(train_file_path)

 data = read_csv_file(csv_file_path=train_file_path, delimiter='\t')
-for inst in data[:5]:
-    inst.print()
+for inst in data:
+    if len(inst.features) <= 0:
+        inst.print()
--- a/utils/constants.py
+++ b/utils/constants.py
@@ -1,6 +1,12 @@
+import re
+
+
 AVG_MICRO = 'MICRO'
 AVG_MACRO = 'MACRO'

 REGEX_CONSTANTS = {
-    'ACRONYM': '\\b[A-Z\\.]{2,}s?\\b'
+    'ACRONYM': re.compile(r"\s*\b[A-Z.]{2,}s?\b\s*"),
+    'CONTAINS_YEAR': re.compile('.*([1-2][0-9]{3})'),
+    'SEQUENCE': re.compile(r'\s+\((\d+,* *)*\)\s+'),
+    'REFERENCE': re.compile(r"\s*\[(\d+,* *)*\]\s*")
 }