Added a few comments and regex patterns

6 years ago · d41c674b49
parent 190f9f35e6
commit d41c674b49
5 changed files with 35 additions and 12 deletions
--- a/feature_extraction/features.py
+++ b/feature_extraction/features.py
@ -1,13 +1,13 @@
 from utils.constants import REGEX_CONSTANTS
 import feature_extraction.lexicons as lexicons
-import re
+from utils.constants import REGEX_CONSTANTS
 """ List of supported features for feature extraction from Input String """
 FEATURE_LIST = ['COMPARE', 'CONTRAST', 'RESULT', 'USE', 'IMPORTANT', 'RESEARCH', 'APPROACH',
                'PUBLIC', 'BEFORE', 'BETTER_SOLUTION', 'PROFESSIONALS', 'CITATION', 'ACRONYM',
-                'CONTAINS_YEAR', 'SEQUENCE', 'REFERENCE']
+                'CONTAINS_YEAR', 'SEQUENCE', 'REFERENCE', 'PERCENTAGE', 'URL']
-REGEX_FEATURES = ['ACRONYM', 'CONTAINS_YEAR', 'SEQUENCE', 'REFERENCE']
+""" Features with Regex Pattern Matching - For these features, get the regex pattern from constants"""
 REGEX_FEATURES = ['ACRONYM', 'CONTAINS_YEAR', 'SEQUENCE', 'REFERENCE', 'PERCENTAGE', 'URL']
 def extract_features_from_text(text: str):
@ -25,11 +25,16 @@ def extract_features_from_text(text: str):
    # Iterate through the feature list and get the list of words for each feature from the lexicon dictionary;
    # for each word in the word list, check if it appears in the input text and add it to the text feature list
    for feature in FEATURE_LIST:
        # If the feature is a Regex Pattern Match, get the compiled pattern from `utils.constants.REGEX_CONSTANTS`
        # and match it against the input text (note: `pattern.match` only matches at the start of the text)
        if feature in REGEX_FEATURES:
            pattern = REGEX_CONSTANTS[feature]
            if bool(pattern.match(text)):
                text_feature_list.append(feature)
            continue
        # If the feature is not a Regex Pattern Match, then get its list of words from the lexicon dictionary
        word_list = lexicon_dict[feature]
        for word in word_list:
            if word in text:
--- a/feature_extraction/lexicons.py
+++ b/feature_extraction/lexicons.py
@ -1,3 +1,6 @@
 """
 Dictionary of Lexicons used for Feature Extraction
 """
 ALL_LEXICONS = {
    'COMPARE': ['compar' 'compet', 'evaluat', 'test', 'superior', 'inferior', 'better', 'best', 'worse', 'worst',
--- a/testing/eval_testing.py
+++ b/testing/eval_testing.py
@ -28,6 +28,5 @@ train_file_path = project_root+'/data/tsv/train.tsv'
 print(train_file_path)
 data = read_csv_file(csv_file_path=train_file_path, delimiter='\t')
-for inst in data:
+for inst in data[:10]:
    if len(inst.features) <= 0:
    inst.print()
--- a/utils/constants.py
+++ b/utils/constants.py
@ -5,8 +5,23 @@ AVG_MICRO = 'MICRO'
 AVG_MACRO = 'MACRO'
 REGEX_CONSTANTS = {
-    'ACRONYM': re.compile(r"\s*\b[A-Z.]{2,}s?\b\s*"),
+
-    'CONTAINS_YEAR': re.compile('.*([1-2][0-9]{3})'),
+    # Regex for matching Acronym Patterns -> COVID-19 / SEKA / SMY2 / EAP1 / SCP16 / ASC1 / DENV-2
-    'SEQUENCE': re.compile(r'\s+\((\d+,* *)*\)\s+'),
+    'ACRONYM': re.compile(r"\s*\b[A-Z.]{2,}s?\b\s*"),  # TODO :: (for Isaac)
-    'REFERENCE': re.compile(r"\s*\[(\d+,* *)*\]\s*")
+
    # Regex for matching Years in the text -> 1995 / 2020 / 2019
    'CONTAINS_YEAR': re.compile('.*([1-2][0-9]{3})'),  # TODO :: (for Isaac)
    # Regex for matching Number Sequences in the text -> (15) / (10, 11, 112, 113) / (1,7,8,10-14)
    'SEQUENCE': re.compile(r'\s+\((\d+,* *)*\)\s+'),  # TODO :: (for Isaac)
    # Regex for matching References in the text -> [4] / [ 10-17, 19, 20] / [123, 500]
    'REFERENCE': re.compile(r"\s*\[(\d+,* *)*\]\s*"),  # TODO :: (for Isaac)
    # Regex for matching percentages in the text -> 99% / 99.99% / 10 % / 23.98% / 10-20% / 25%-30%
    'PERCENTAGE': re.compile(r"\d+(\.\d+)?%"),  # TODO :: (for Isaac)
    # Regex for matching URLs -> http://www.phrap.org/, http://www. , http://carcfordjournals. ,
    # https://www.ims.uni-stuttgart.de/
    'URL': re.compile(r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+")  # TODO :: (for Isaac)
 }
--- a/utils/models.py
+++ b/utils/models.py
@ -3,7 +3,8 @@ from feature_extraction.features import extract_features_from_text
 class DataInstance:
    """
-    Model Class for carrying Training and Testing data from tsc/csv file
+    Model Class for carrying Training and Testing data from tsv/csv file.
    Also carries the extracted features.
    """
    def __init__(self, r_id, text, true_label):