Added some more Lexicons

6 years ago · c6440b2553
parent ce8b6684f7
commit c6440b2553
3 changed files with 33 additions and 15 deletions
--- a/feature_extraction/features.py
+++ b/feature_extraction/features.py
@@ -2,8 +2,9 @@ import feature_extraction.lexicons as lexicons
 from utils.constants import REGEX_CONSTANTS

 """ List of supported features for feature extraction from Input String """
-FEATURE_LIST = ['COMPARE', 'CONTRAST', 'RESULT', 'INCREASE', 'CHANGE', 'USE', 'PRESENT', 'IMPORTANT', 'RESEARCH',
-                'APPROACH', 'PUBLIC', 'BEFORE', 'BETTER_SOLUTION', 'PROFESSIONALS', 'MEDICINE', 'MATH', 'CITATION',
+FEATURE_LIST = ['COMPARE', 'CONTRAST', 'RESULT', 'INCREASE', 'CHANGE', 'USE', 'PRESENT',
+                'IMPORTANT', 'RESEARCH', 'APPROACH', 'PUBLIC', 'BEFORE', 'BETTER_SOLUTION',
+                'PROFESSIONALS', 'MEDICINE', 'MATH', 'COMPUTER_SCIENCE', 'CITATION',
                'ACRONYM', 'CONTAINS_YEAR', 'SEQUENCE', 'REFERENCE', 'PERCENTAGE', 'URL']

 """ Feature Name for Theta Bias -- need to add it to the list of features for all data instances """
--- a/feature_extraction/lexicons.py
+++ b/feature_extraction/lexicons.py
@@ -26,13 +26,13 @@ ALL_LEXICONS = {
                  'significant', 'remarkable', 'noteworthy', 'crucial', 'emerge'],

    'RESEARCH': ['research', 'paper', 'study', 'studie', 'apply', 'analyze', 'characteri', 'formali', 'investigat',
-                 'implement', 'interpret', 'examin', 'observ', 'predict', 'verify', 'work on', 'empirical',
+                 'implement', 'interpret', 'examin', 'observ', 'predict', 'verify', 'work on', 'empirical', 'determin',
                 'experiment', 'exploratory', 'ongoing', 'quantitative', 'qualitative', 'preliminary', 'statistical',
-                 'knowledge', 'underway', 'discuss', 'reference', 'publish', 'document'],
+                 'knowledge', 'underway', 'discuss', 'reference', 'publish', 'document', 'orientation'],

    'APPROACH': ['approach', 'account', 'algorithm', 'analys', 'approach', 'application', 'architecture', 'characteri',
                 'component', 'design', 'extension', 'formali', 'framework', 'implement', 'investigat', 'machine',
-                 'method', 'methodology', 'module', 'process', 'procedure', 'program', 'prototype', 'strategy',
+                 'method', 'methodology', 'module', 'process', 'procedure', 'program', 'prototype', 'strateg',
                 'system', 'technique', 'theory', 'tool', 'treatment'],

    'PUBLIC': ['acknowledge', 'admit', 'agree', 'assert', 'claim', 'complain', 'declare', 'deny', 'explain',
@@ -44,18 +44,25 @@ ALL_LEXICONS = {
    'BETTER_SOLUTION': ['boost', 'enhance', 'defeat', 'improve', 'perform better', 'outperform', 'outweigh', 'surpass'],

    'PROFESSIONALS': ['colleagues', 'community', 'computer scientists', 'computational linguists', 'discourse analysts',
-                      'expert', 'investigators', 'linguists', 'logicians', 'philosophers', 'psycholinguists',
+                      'expert', 'investigators', 'linguists', 'philosophers', 'psycholinguists',
                      'psychologists', 'researchers', 'scholars', 'semanticists', 'scientists'],

    'MEDICINE': ['medicine', 'tissue', 'gene', 'inflammatory', 'mutant', 'neuro', 'digest', 'ortho', 'kinase', 'pneumonia',
                 'clinical', 'therap', 'kidney', 'receptor', 'cancer', 'synthesis', 'protein', 'syndrom', 'toxin', 'death', 'calcium',
-                 'pharma', 'heart', 'disease', 'vitamin', 'tumor', 'blind', 'symptom', 'medical', 'vaccin', 'molecule',
+                 'pharma', 'heart', 'disease', 'vitamin', 'tumor', 'blind', 'symptom', 'medical', 'vaccin', 'molecule', 'rna',
                 'biotic', 'patient', 'cells', 'immune', 'blood', 'plasma', 'diagnos', 'neura', 'reproductive', 'plasm', 'drug',
-                 'membrane', 'muscle', 'contagious', 'inflam', 'physician', 'dna', 'genome', 'bacteria', 'cavity', 'antibodies'],
-
-    'MATH': ['matrix', 'gaussian', 'variance', 'radius', 'function', 'comput', 'once', 'twice', 'thrice', 'diagram',
-             'vector', 'rectangle', 'logic', 'amount', 'maxim', 'minim', 'linear', 'magnitude', 'theorem', 'gradient',
-             'exponential', 'complex', 'graph', 'mean', 'equation', 'offset', 'calculat', 'coefficient', 'discrete', 'math'],
+                 'membrane', 'muscle', 'contagious', 'inflam', 'physician', 'dna', 'genome', 'bacteria', 'cavity', 'injury',
+                 'antibodies', 'liver', 'treatment', 'pcr', 'acid', 'chronic', 'respirat', 'oxygen', 'stroke', 'antioxidant',
+                 'metabolic', 'transmission', 'endogenous', 'syndrome', 'ultrasound', 'pathogen'],
+
+    'MATH': ['matrix', 'gaussian', 'variance', 'radius', 'function', 'comput', 'once', 'twice', 'thrice', 'diagram', 'mean',
+             'vector', 'rectangle', 'logic', 'amount', 'maxim', 'minim', 'linear', 'magnitude', 'theorem', 'gradient', 'median',
+             'exponential', 'complex', 'graph', 'mean', 'equation', 'offset', 'calculat', 'coefficient', 'discrete', 'equation',
+             'math', 'correlation', 'outcome', 'divergence', 'differentiation', 'statistic', 'parameter', 'probabilit', 'multivariate'],
+
+    'COMPUTER_SCIENCE': ['database', 'software', 'evaluation', 'framework', 'computer', 'network', 'algorithm',
+                         'dataset','data sets', 'technology', 'kernel', 'metrics', 'nlp', 'xml', 'corpus', 'uml', 'system',
+                         'security', 'protocol'],

    'CITATION': ['et al'],  # TODO (for Isaac) :: Write a complex regex for finding Citations in the text

--- a/testing/feature_testing.py
+++ b/testing/feature_testing.py
@@ -11,11 +11,21 @@ print(train_file_path)
 data = read_csv_file(csv_file_path=train_file_path, delimiter='\t')

 i = 0
+feature_dict = {}
 for inst in data:
-    if len(inst.features) <= 0:
-        inst.print()
+    if len(inst.features) >= 0:
+        # inst.print()
        i += 1
-print('Data Points without Features :: ', i)
+        tokens = inst.text.split()
+        for token in tokens:
+            if token not in feature_dict:
+                feature_dict[token] = 1
+                continue
+            feature_dict[token] += 1
+
+for key in sorted(feature_dict, key=feature_dict.get, reverse=True):
+    print(key, ' -> ', feature_dict.get(key))
+# print('Data Points without Features :: ', i)

 #         tokens = inst.text.split()
 #         for token in tokens: