Added a few comments and regex patterns

isaac
Pavan Mandava 6 years ago
parent 190f9f35e6
commit d41c674b49

@ -1,13 +1,13 @@
from utils.constants import REGEX_CONSTANTS
import feature_extraction.lexicons as lexicons import feature_extraction.lexicons as lexicons
import re from utils.constants import REGEX_CONSTANTS
""" List of supported features for feature extraction from Input String """ """ List of supported features for feature extraction from Input String """
FEATURE_LIST = ['COMPARE', 'CONTRAST', 'RESULT', 'USE', 'IMPORTANT', 'RESEARCH', 'APPROACH', FEATURE_LIST = ['COMPARE', 'CONTRAST', 'RESULT', 'USE', 'IMPORTANT', 'RESEARCH', 'APPROACH',
'PUBLIC', 'BEFORE', 'BETTER_SOLUTION', 'PROFESSIONALS', 'CITATION', 'ACRONYM', 'PUBLIC', 'BEFORE', 'BETTER_SOLUTION', 'PROFESSIONALS', 'CITATION', 'ACRONYM',
'CONTAINS_YEAR', 'SEQUENCE', 'REFERENCE'] 'CONTAINS_YEAR', 'SEQUENCE', 'REFERENCE', 'PERCENTAGE', 'URL']
REGEX_FEATURES = ['ACRONYM', 'CONTAINS_YEAR', 'SEQUENCE', 'REFERENCE'] """ Features with Regex Pattern Matching - For these features, get the regex pattern from constants"""
REGEX_FEATURES = ['ACRONYM', 'CONTAINS_YEAR', 'SEQUENCE', 'REFERENCE', 'PERCENTAGE', 'URL']
def extract_features_from_text(text: str): def extract_features_from_text(text: str):
@ -25,11 +25,16 @@ def extract_features_from_text(text: str):
# Iterate through the list features and get list of words from the lexicon dictionary, # Iterate through the list features and get list of words from the lexicon dictionary,
# for each word in the word list, check if it appears in input text and add it to the text feature list # for each word in the word list, check if it appears in input text and add it to the text feature list
for feature in FEATURE_LIST: for feature in FEATURE_LIST:
# If the feature is a Regex Pattern Match, get the compiled pattern from :`~utils.constants.REGEX_CONSTANTS`
# and match it against the input text
if feature in REGEX_FEATURES: if feature in REGEX_FEATURES:
pattern = REGEX_CONSTANTS[feature] pattern = REGEX_CONSTANTS[feature]
if bool(pattern.match(text)): if bool(pattern.match(text)):
text_feature_list.append(feature) text_feature_list.append(feature)
continue continue
# If the feature is not a Regex Pattern Match, then get the list of dictionary words from lexicon dictionary
word_list = lexicon_dict[feature] word_list = lexicon_dict[feature]
for word in word_list: for word in word_list:
if word in text: if word in text:

@ -1,3 +1,6 @@
"""
Dictionary of Lexicons used for Feature Extraction
"""
ALL_LEXICONS = { ALL_LEXICONS = {
'COMPARE': ['compar' 'compet', 'evaluat', 'test', 'superior', 'inferior', 'better', 'best', 'worse', 'worst', 'COMPARE': ['compar' 'compet', 'evaluat', 'test', 'superior', 'inferior', 'better', 'best', 'worse', 'worst',

@ -28,6 +28,5 @@ train_file_path = project_root+'/data/tsv/train.tsv'
print(train_file_path) print(train_file_path)
data = read_csv_file(csv_file_path=train_file_path, delimiter='\t') data = read_csv_file(csv_file_path=train_file_path, delimiter='\t')
for inst in data: for inst in data[:10]:
if len(inst.features) <= 0:
inst.print() inst.print()

@ -5,8 +5,23 @@ AVG_MICRO = 'MICRO'
AVG_MACRO = 'MACRO' AVG_MACRO = 'MACRO'
REGEX_CONSTANTS = { REGEX_CONSTANTS = {
'ACRONYM': re.compile(r"\s*\b[A-Z.]{2,}s?\b\s*"),
'CONTAINS_YEAR': re.compile('.*([1-2][0-9]{3})'), # Regex for matching Acronym Patterns -> COVID-19 / SEKA / SMY2 / EAP1 / SCP16 / ASC1 / DENV-2
'SEQUENCE': re.compile(r'\s+\((\d+,* *)*\)\s+'), 'ACRONYM': re.compile(r"\s*\b[A-Z.]{2,}s?\b\s*"), # TODO :: (for Isaac)
'REFERENCE': re.compile(r"\s*\[(\d+,* *)*\]\s*")
# Regex for matching Years in the text -> 1995 / 2020 / 2019
'CONTAINS_YEAR': re.compile('.*([1-2][0-9]{3})'), # TODO :: (for Isaac)
# Regex for matching Number Sequences in the text -> (15) / (10, 11, 112, 113) / (1,7,8,10-14)
'SEQUENCE': re.compile(r'\s+\((\d+,* *)*\)\s+'), # TODO :: (for Isaac)
# Regex for matching References in the text -> [4] / [ 10-17, 19, 20] / [123, 500]
'REFERENCE': re.compile(r"\s*\[(\d+,* *)*\]\s*"), # TODO :: (for Isaac)
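# Regex for matching percentages in the text -> 99% / 99.99% / 10 % / 23.98% / 10-20% / 25%-30%
# NOTE: the pattern below requires '%' directly after the digits, so '10 %' (with a space) is not matched yet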
'PERCENTAGE': re.compile(r"\d+(\.\d+)?%"), # TODO :: (for Isaac)
# Regex for matching URLs -> http://www.phrap.org/, http://www. , http://carcfordjournals. ,
# https://www.ims.uni-stuttgart.de/
'URL': re.compile(r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+") # TODO :: (for Isaac)
} }

@ -3,7 +3,8 @@ from feature_extraction.features import extract_features_from_text
class DataInstance: class DataInstance:
""" """
Model Class for carrying Training and Testing data from tsc/csv file Model Class for carrying Training and Testing data from tsv/csv file.
Also carries the extracted features.
""" """
def __init__(self, r_id, text, true_label): def __init__(self, r_id, text, true_label):

Loading…
Cancel
Save