diff --git a/feature_extraction/features.py b/feature_extraction/features.py index 99f8767..1f22ca9 100644 --- a/feature_extraction/features.py +++ b/feature_extraction/features.py @@ -1,13 +1,13 @@ -from utils.constants import REGEX_CONSTANTS import feature_extraction.lexicons as lexicons -import re +from utils.constants import REGEX_CONSTANTS """ List of supported features for feature extraction from Input String """ FEATURE_LIST = ['COMPARE', 'CONTRAST', 'RESULT', 'USE', 'IMPORTANT', 'RESEARCH', 'APPROACH', 'PUBLIC', 'BEFORE', 'BETTER_SOLUTION', 'PROFESSIONALS', 'CITATION', 'ACRONYM', - 'CONTAINS_YEAR', 'SEQUENCE', 'REFERENCE'] + 'CONTAINS_YEAR', 'SEQUENCE', 'REFERENCE', 'PERCENTAGE', 'URL'] -REGEX_FEATURES = ['ACRONYM', 'CONTAINS_YEAR', 'SEQUENCE', 'REFERENCE'] +""" Features with Regex Pattern Matching - For these features, get the regex pattern from constants""" +REGEX_FEATURES = ['ACRONYM', 'CONTAINS_YEAR', 'SEQUENCE', 'REFERENCE', 'PERCENTAGE', 'URL'] def extract_features_from_text(text: str): @@ -25,11 +25,16 @@ def extract_features_from_text(text: str): # Iterate through the list features and get list of words from the lexicon dictionary, # for each word in the word list, check if it appears in input text and add it to the text feature list for feature in FEATURE_LIST: + + # If the feature is a Regex Pattern Match, get the compiled pattern from `utils.constants.REGEX_CONSTANTS` + # and match it against the input text (NOTE(review): `match` only matches at the start of the text - confirm `search` is not intended) if feature in REGEX_FEATURES: pattern = REGEX_CONSTANTS[feature] if bool(pattern.match(text)): text_feature_list.append(feature) continue + + # If the feature is not a Regex Pattern Match, then get the list of dictionary words from lexicon dictionary word_list = lexicon_dict[feature] for word in word_list: if word in text: diff --git a/feature_extraction/lexicons.py b/feature_extraction/lexicons.py index 5239d3e..2b05958 100644 --- a/feature_extraction/lexicons.py +++ b/feature_extraction/lexicons.py @@ -1,3 +1,6 @@ +""" +Dictionary of Lexicons 
used for Feature Extraction +""" ALL_LEXICONS = { 'COMPARE': ['compar' 'compet', 'evaluat', 'test', 'superior', 'inferior', 'better', 'best', 'worse', 'worst', diff --git a/testing/eval_testing.py b/testing/eval_testing.py index 85b8460..e8e7b16 100644 --- a/testing/eval_testing.py +++ b/testing/eval_testing.py @@ -28,6 +28,5 @@ train_file_path = project_root+'/data/tsv/train.tsv' print(train_file_path) data = read_csv_file(csv_file_path=train_file_path, delimiter='\t') -for inst in data: - if len(inst.features) <= 0: - inst.print() +for inst in data[:10]: + inst.print() diff --git a/utils/constants.py b/utils/constants.py index 1f25cfb..e0b3874 100644 --- a/utils/constants.py +++ b/utils/constants.py @@ -5,8 +5,23 @@ AVG_MICRO = 'MICRO' AVG_MACRO = 'MACRO' REGEX_CONSTANTS = { - 'ACRONYM': re.compile(r"\s*\b[A-Z.]{2,}s?\b\s*"), - 'CONTAINS_YEAR': re.compile('.*([1-2][0-9]{3})'), - 'SEQUENCE': re.compile(r'\s+\((\d+,* *)*\)\s+'), - 'REFERENCE': re.compile(r"\s*\[(\d+,* *)*\]\s*") + + # Regex for matching Acronym Patterns -> COVID-19 / SEKA / SMY2 / EAP1 / SCP16 / ASC1 / DENV-2 + 'ACRONYM': re.compile(r"\s*\b[A-Z.]{2,}s?\b\s*"), # TODO :: (for Isaac) + + # Regex for matching Years in the text -> 1995 / 2020 / 2019 + 'CONTAINS_YEAR': re.compile('.*([1-2][0-9]{3})'), # TODO :: (for Isaac) + + # Regex for matching Number Sequences in the text -> (15) / (10, 11, 112, 113) / (1,7,8,10-14) + 'SEQUENCE': re.compile(r'\s+\((\d+,* *)*\)\s+'), # TODO :: (for Isaac) + + # Regex for matching References in the text -> [4] / [ 10-17, 19, 20] / [123, 500] + 'REFERENCE': re.compile(r"\s*\[(\d+,* *)*\]\s*"), # TODO :: (for Isaac) + + # Regex for matching percentages in the text -> 99% / 99.99% / 10 % / 23.98% / 10-20% / 25%-30% (NOTE(review): the pattern requires `%` directly after the digits, so `10 %` is not matched) + 'PERCENTAGE': re.compile(r"\d+(\.\d+)?%"), # TODO :: (for Isaac) + + # Regex for matching URLs -> http://www.phrap.org/, http://www. , http://carcfordjournals. 
, + # https://www.ims.uni-stuttgart.de/ + 'URL': re.compile(r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+") # TODO :: (for Isaac) } diff --git a/utils/models.py b/utils/models.py index dfea3a0..9ac6326 100644 --- a/utils/models.py +++ b/utils/models.py @@ -3,7 +3,8 @@ from feature_extraction.features import extract_features_from_text class DataInstance: """ - Model Class for carrying Training and Testing data from tsc/csv file + Model Class for carrying Training and Testing data from tsv/csv file. + Also carries the extracted features. """ def __init__(self, r_id, text, true_label):