Added some more Lexicons

isaac
Pavan Mandava 6 years ago
parent ce8b6684f7
commit c6440b2553

@ -2,8 +2,9 @@ import feature_extraction.lexicons as lexicons
from utils.constants import REGEX_CONSTANTS from utils.constants import REGEX_CONSTANTS
""" List of supported features for feature extraction from Input String """ """ List of supported features for feature extraction from Input String """
FEATURE_LIST = ['COMPARE', 'CONTRAST', 'RESULT', 'INCREASE', 'CHANGE', 'USE', 'PRESENT', 'IMPORTANT', 'RESEARCH', FEATURE_LIST = ['COMPARE', 'CONTRAST', 'RESULT', 'INCREASE', 'CHANGE', 'USE', 'PRESENT',
'APPROACH', 'PUBLIC', 'BEFORE', 'BETTER_SOLUTION', 'PROFESSIONALS', 'MEDICINE', 'MATH', 'CITATION', 'IMPORTANT', 'RESEARCH', 'APPROACH', 'PUBLIC', 'BEFORE', 'BETTER_SOLUTION',
'PROFESSIONALS', 'MEDICINE', 'MATH', 'COMPUTER_SCIENCE', 'CITATION',
'ACRONYM', 'CONTAINS_YEAR', 'SEQUENCE', 'REFERENCE', 'PERCENTAGE', 'URL'] 'ACRONYM', 'CONTAINS_YEAR', 'SEQUENCE', 'REFERENCE', 'PERCENTAGE', 'URL']
""" Feature Name for Theta Bias -- need to add it to the list of features for all data instances """ """ Feature Name for Theta Bias -- need to add it to the list of features for all data instances """

@ -26,13 +26,13 @@ ALL_LEXICONS = {
'significant', 'remarkable', 'noteworthy', 'crucial', 'emerge'], 'significant', 'remarkable', 'noteworthy', 'crucial', 'emerge'],
'RESEARCH': ['research', 'paper', 'study', 'studie', 'apply', 'analyze', 'characteri', 'formali', 'investigat', 'RESEARCH': ['research', 'paper', 'study', 'studie', 'apply', 'analyze', 'characteri', 'formali', 'investigat',
'implement', 'interpret', 'examin', 'observ', 'predict', 'verify', 'work on', 'empirical', 'implement', 'interpret', 'examin', 'observ', 'predict', 'verify', 'work on', 'empirical', 'determin',
'experiment', 'exploratory', 'ongoing', 'quantitative', 'qualitative', 'preliminary', 'statistical', 'experiment', 'exploratory', 'ongoing', 'quantitative', 'qualitative', 'preliminary', 'statistical',
'knowledge', 'underway', 'discuss', 'reference', 'publish', 'document'], 'knowledge', 'underway', 'discuss', 'reference', 'publish', 'document', 'orientation'],
'APPROACH': ['approach', 'account', 'algorithm', 'analys', 'approach', 'application', 'architecture', 'characteri', 'APPROACH': ['approach', 'account', 'algorithm', 'analys', 'approach', 'application', 'architecture', 'characteri',
'component', 'design', 'extension', 'formali', 'framework', 'implement', 'investigat', 'machine', 'component', 'design', 'extension', 'formali', 'framework', 'implement', 'investigat', 'machine',
'method', 'methodology', 'module', 'process', 'procedure', 'program', 'prototype', 'strategy', 'method', 'methodology', 'module', 'process', 'procedure', 'program', 'prototype', 'strateg',
'system', 'technique', 'theory', 'tool', 'treatment'], 'system', 'technique', 'theory', 'tool', 'treatment'],
'PUBLIC': ['acknowledge', 'admit', 'agree', 'assert', 'claim', 'complain', 'declare', 'deny', 'explain', 'PUBLIC': ['acknowledge', 'admit', 'agree', 'assert', 'claim', 'complain', 'declare', 'deny', 'explain',
@ -44,18 +44,25 @@ ALL_LEXICONS = {
'BETTER_SOLUTION': ['boost', 'enhance', 'defeat', 'improve', 'perform better', 'outperform', 'outweigh', 'surpass'], 'BETTER_SOLUTION': ['boost', 'enhance', 'defeat', 'improve', 'perform better', 'outperform', 'outweigh', 'surpass'],
'PROFESSIONALS': ['colleagues', 'community', 'computer scientists', 'computational linguists', 'discourse analysts', 'PROFESSIONALS': ['colleagues', 'community', 'computer scientists', 'computational linguists', 'discourse analysts',
'expert', 'investigators', 'linguists', 'logicians', 'philosophers', 'psycholinguists', 'expert', 'investigators', 'linguists', 'philosophers', 'psycholinguists',
'psychologists', 'researchers', 'scholars', 'semanticists', 'scientists'], 'psychologists', 'researchers', 'scholars', 'semanticists', 'scientists'],
'MEDICINE': ['medicine', 'tissue', 'gene', 'inflammatory', 'mutant', 'neuro', 'digest', 'ortho', 'kinase', 'pneumonia', 'MEDICINE': ['medicine', 'tissue', 'gene', 'inflammatory', 'mutant', 'neuro', 'digest', 'ortho', 'kinase', 'pneumonia',
'clinical', 'therap', 'kidney', 'receptor', 'cancer', 'synthesis', 'protein', 'syndrom', 'toxin', 'death', 'calcium', 'clinical', 'therap', 'kidney', 'receptor', 'cancer', 'synthesis', 'protein', 'syndrom', 'toxin', 'death', 'calcium',
'pharma', 'heart', 'disease', 'vitamin', 'tumor', 'blind', 'symptom', 'medical', 'vaccin', 'molecule', 'pharma', 'heart', 'disease', 'vitamin', 'tumor', 'blind', 'symptom', 'medical', 'vaccin', 'molecule', 'rna',
'biotic', 'patient', 'cells', 'immune', 'blood', 'plasma', 'diagnos', 'neura', 'reproductive', 'plasm', 'drug', 'biotic', 'patient', 'cells', 'immune', 'blood', 'plasma', 'diagnos', 'neura', 'reproductive', 'plasm', 'drug',
'membrane', 'muscle', 'contagious', 'inflam', 'physician', 'dna', 'genome', 'bacteria', 'cavity', 'antibodies'], 'membrane', 'muscle', 'contagious', 'inflam', 'physician', 'dna', 'genome', 'bacteria', 'cavity', 'injury',
'antibodies', 'liver', 'treatment', 'pcr', 'acid', 'chronic', 'respirat', 'oxygen', 'stroke', 'antioxidant',
'MATH': ['matrix', 'gaussian', 'variance', 'radius', 'function', 'comput', 'once', 'twice', 'thrice', 'diagram', 'metabolic', 'transmission', 'endogenous', 'syndrome', 'ultrasound', 'pathogen'],
'vector', 'rectangle', 'logic', 'amount', 'maxim', 'minim', 'linear', 'magnitude', 'theorem', 'gradient',
'exponential', 'complex', 'graph', 'mean', 'equation', 'offset', 'calculat', 'coefficient', 'discrete', 'math'], 'MATH': ['matrix', 'gaussian', 'variance', 'radius', 'function', 'comput', 'once', 'twice', 'thrice', 'diagram', 'mean',
'vector', 'rectangle', 'logic', 'amount', 'maxim', 'minim', 'linear', 'magnitude', 'theorem', 'gradient', 'median',
'exponential', 'complex', 'graph', 'mean', 'equation', 'offset', 'calculat', 'coefficient', 'discrete', 'equation',
'math', 'correlation', 'outcome', 'divergence', 'differentiation', 'statistic', 'parameter', 'probabilit', 'multivariate'],
'COMPUTER_SCIENCE': ['database', 'software', 'evaluation', 'framework', 'computer', 'network', 'algorithm',
'dataset','data sets', 'technology', 'kernel', 'metrics', 'nlp', 'xml', 'corpus', 'uml', 'system',
'security', 'protocol'],
'CITATION': ['et al'], # TODO (for Isaac) :: Write a complex regex for finding Citations in the text 'CITATION': ['et al'], # TODO (for Isaac) :: Write a complex regex for finding Citations in the text

@ -11,11 +11,21 @@ print(train_file_path)
data = read_csv_file(csv_file_path=train_file_path, delimiter='\t') data = read_csv_file(csv_file_path=train_file_path, delimiter='\t')
i = 0 i = 0
feature_dict = {}
for inst in data: for inst in data:
if len(inst.features) <= 0: if len(inst.features) >= 0:
inst.print() # inst.print()
i += 1 i += 1
print('Data Points without Features :: ', i) tokens = inst.text.split()
for token in tokens:
if token not in feature_dict:
feature_dict[token] = 1
continue
feature_dict[token] += 1
for key in sorted(feature_dict, key=feature_dict.get, reverse=True):
print(key, ' -> ', feature_dict.get(key))
# print('Data Points without Features :: ', i)
# tokens = inst.text.split() # tokens = inst.text.split()
# for token in tokens: # for token in tokens:

Loading…
Cancel
Save