From cc77b3a7550cb0bbd90d41c5d94a06319f174034 Mon Sep 17 00:00:00 2001
From: Pavan Mandava <mspavan04@gmail.com>
Date: Mon, 11 May 2020 00:28:07 +0200
Subject: [PATCH] Feature Extraction with LEXICONS, Need to add more Lexicons
 and improve feature representation

---
 feature_extraction/__init__.py |  0
 feature_extraction/features.py | 38 ++++++++++++++++++++++++++++++++++
 feature_extraction/lexicons.py | 34 ++++++++++++++++++++++++++++++
 utils/constants.py             |  6 +++++-
 utils/models.py                |  6 +++++-
 5 files changed, 82 insertions(+), 2 deletions(-)
 create mode 100644 feature_extraction/__init__.py
 create mode 100644 feature_extraction/features.py
 create mode 100644 feature_extraction/lexicons.py

diff --git a/feature_extraction/__init__.py b/feature_extraction/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/feature_extraction/features.py b/feature_extraction/features.py
new file mode 100644
index 0000000..3710817
--- /dev/null
+++ b/feature_extraction/features.py
@@ -0,0 +1,38 @@
+from utils.constants import REGEX_CONSTANTS
+import feature_extraction.lexicons as lexicons
+import re
+
+""" List of supported features for feature extraction from Input String """
+FEATURE_LIST = ['COMPARE', 'CONTRAST', 'RESULT', 'USE', 'IMPORTANT', 'RESEARCH', 'APPROACH', 'PUBLIC', 'BEFORE',
+                'BETTER_SOLUTION', 'CITATION', 'ACRONYM']
+# Features listed here are detected with a regex from REGEX_CONSTANTS instead of a lexicon word list
+REGEX_FEATURES = ['ACRONYM']
+
+
+def extract_features_from_text(text: str):
+    """
+    Extract the features from FEATURE_LIST that are present in the input text.
+    Regex features may match anywhere in the text; lexicon features are case-sensitive substring matches against
+        :`~feature_extraction.lexicons.ALL_LEXICONS`
+    :param text: takes string text as param
+    :return: returns a list of extracted features from the text, empty list for no features
+    """
+
+    # ALL_LEXICONS
+    lexicon_dict = lexicons.ALL_LEXICONS
+
+    text_feature_list = []
+    # Iterate through the list features and get list of words from the lexicon dictionary,
+    # a feature is added (at most once) as soon as any of its lexicon words appears in the input text
+    for feature in FEATURE_LIST:
+        if feature in REGEX_FEATURES:
+            regex = REGEX_CONSTANTS[feature]
+            # re.search, not re.match: the pattern may occur anywhere in the text, not only at its very start
+            if re.search(regex, text):
+                text_feature_list.append(feature)
+            continue
+        word_list = lexicon_dict[feature]
+        if any(word in text for word in word_list):
+            text_feature_list.append(feature)
+
+    return text_feature_list
diff --git a/feature_extraction/lexicons.py b/feature_extraction/lexicons.py
new file mode 100644
index 0000000..6105b94
--- /dev/null
+++ b/feature_extraction/lexicons.py
@@ -0,0 +1,34 @@
+ALL_LEXICONS = {
+    # Maps each feature name to the words / word prefixes whose presence in a text signals that feature.
+    'COMPARE': ['compar', 'compet', 'evaluat', 'test', 'superior', 'inferior', 'better', 'best', 'worse', 'worst',
+                'greater', 'larger', 'faster', 'measur'],
+
+    'CONTRAST': ['contrast', 'different', 'distinct', 'conflict', 'disagree', 'oppose', 'distinguish', 'contrary'],
+
+    'RESULT': ['evidence', 'experiment', 'find', 'progress', 'observation', 'outcome', 'result'],
+
+    'USE': ['use', 'using', 'apply', 'applied', 'employ', 'make use', 'utilize', 'implement'],
+
+    'IMPORTANT': ['important', 'main', 'key', 'basic', 'central', 'crucial', 'critical', 'essential', 'fundamental',
+                  'great', 'largest', 'major', 'overall', 'primary', 'principle', 'serious', 'substantial', 'ultimate'],
+
+    'RESEARCH': ['apply', 'analyze', 'characteri', 'formali', 'investigat', 'implement', 'interpret', 'examin',
+                 'observ', 'predict', 'verify', 'work on', 'empirical', 'experiment', 'exploratory', 'ongoing',
+                 'quantitative', 'qualitative', 'preliminary', 'statistical', 'underway'],
+
+    'APPROACH': ['approach', 'account', 'algorithm', 'analys', 'application', 'architecture', 'characteri',
+                 'component', 'design', 'extension', 'formali', 'framework', 'implement', 'investigat', 'machine',
+                 'method', 'methodology', 'module', 'process', 'procedure', 'program', 'prototype', 'strategy',
+                 'system', 'technique', 'theory', 'tool', 'treatment'],
+
+    'PUBLIC': ['acknowledge', 'admit', 'agree', 'assert', 'claim', 'complain', 'declare', 'deny', 'explain',
+               'hint', 'insist', 'mention', 'proclaim', 'promise', 'protest', 'remark', 'reply', 'report', 'say',
+               'suggest', 'swear', 'write'],
+
+    'BEFORE': ['earlier', 'initial', 'past', 'previous', 'prior'],
+
+    'BETTER_SOLUTION': ['boost', 'enhance', 'defeat', 'improve', 'perform better', 'outperform', 'outweigh', 'surpass'],
+
+    'CITATION': ['et al'],  # TODO (for Isaac) :: Write a complex regex for finding Citations in the text
+
+}
diff --git a/utils/constants.py b/utils/constants.py
index 7941faf..64bee15 100644
--- a/utils/constants.py
+++ b/utils/constants.py
@@ -1,2 +1,6 @@
 AVG_MICRO = 'MICRO'
-AVG_MACRO = 'MACRO'
\ No newline at end of file
+AVG_MACRO = 'MACRO'
+
+REGEX_CONSTANTS = {
+    'ACRONYM': r'\b[A-Z\.]{2,}s?\b',  # run of 2+ capital letters/dots, optional trailing 's', at word boundaries
+}
diff --git a/utils/models.py b/utils/models.py
index f2a6753..dfea3a0 100644
--- a/utils/models.py
+++ b/utils/models.py
@@ -1,3 +1,5 @@
+from feature_extraction.features import extract_features_from_text
+
 
 class DataInstance:
     """
@@ -8,6 +10,8 @@ class DataInstance:
         self.did = r_id
         self.text = text
         self.true_label = true_label
+        self.features = extract_features_from_text(text)
 
     def print(self):
-        print('True Label :: ', self.true_label, ' Text :: ', self.text)
+        print('\nTrue Label :: ', self.true_label, ' Text :: ', self.text)
+        print('Features :: ', self.features)