From ecdce0116e06ab28c7ee7c392aa91d47323f5ebd Mon Sep 17 00:00:00 2001
From: Isaac Riley
Date: Sun, 17 May 2020 19:36:36 +0200
Subject: [PATCH] fixed regular expressions

---
Review notes (this area is ignored by git am):
* SEQUENCE / REFERENCE: the literal "-" now sits at the end of the character
  class. The previous form "[\d-, ]" is rejected by Python 3's re module
  ("bad character range \d-,"), which would break the import of constants.py.
* CONTAINS_YEAR: uses negative lookarounds (?<!\d) / (?!\d) so that a year at
  the very start or end of the text is matched as well; the matched text
  itself (1800-1999, 2000-2199) is unchanged.
* PERCENTAGE: also accepts a single digit ("5%") and one optional whitespace
  before the sign ("10 %"), as listed in the comment above the pattern.
* ACRONYM and URL are unchanged.
* NOTE(review): line breaks, blank context lines and the 4-space indentation
  of this hunk were reconstructed from a whitespace-collapsed copy of the
  patch - confirm the context lines against utils/constants.py before applying.

 utils/constants.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/utils/constants.py b/utils/constants.py
index e0b3874..4308dc0 100644
--- a/utils/constants.py
+++ b/utils/constants.py
@@ -7,21 +7,21 @@ AVG_MACRO = 'MACRO'
 REGEX_CONSTANTS = {
 
     # Regex for matching Acronym Patterns -> COVID-19 / SEKA / SMY2 / EAP1 / SCP16 / ASC1 / DENV-2
-    'ACRONYM': re.compile(r"\s*\b[A-Z.]{2,}s?\b\s*"),  # TODO :: (for Isaac)
+    'ACRONYM': re.compile(r"[m0-9\W]([A-Z]{2,})[s0-9\W]"),
 
     # Regex for matching Years in the text - > 1995 / 2020 / 2019
-    'CONTAINS_YEAR': re.compile('.*([1-2][0-9]{3})'),  # TODO :: (for Isaac)
+    'CONTAINS_YEAR': re.compile(r"(?<!\d)(?:1[89]|2[01])\d{2}(?!\d)"),
 
     # Regex for matching Number Sequences in the text -> (15) / (10, 11, 112, 113) / (1,7,8,10-14)
-    'SEQUENCE': re.compile(r'\s+\((\d+,* *)*\)\s+'),  # TODO :: (for Isaac)
+    'SEQUENCE': re.compile(r"\([\d, -]+\)"),
 
     # Regex for matching References in the text -> [4] / [ 10-17, 19, 20] / [123, 500]
-    'REFERENCE': re.compile(r"\s*\[(\d+,* *)*\]\s*"),  # TODO :: (for Isaac)
+    'REFERENCE': re.compile(r"\[[\d, -]+\]"),
 
     # Regex for matching percentages in the text -> 99% / 99.99% / 10 % / 23.98% / 10-20% / 25%-30%
-    'PERCENTAGE': re.compile(r"\d+(\.\d+)?%"),  # TODO :: (for Isaac)
+    'PERCENTAGE': re.compile(r"\d[\d.-]*\s?%"),
 
     # Regex for matching URLs -> http://www.phrap.org/, http://www. , http://carcfordjournals. ,
     # https://www.ims.uni-stuttgart.de/
-    'URL': re.compile(r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+")  # TODO :: (for Isaac)
+    'URL': re.compile(r"https?://\S+")#...\S+(?=\.?,?:?[\s\"$])")
 }