You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
citation-analysis/testing/feature_testing.py

29 lines
800 B

import os
from utils.csv import read_csv_file
# Sanity check for the training set: report every data point that has no
# features and print how many such points were found.

# The project root is two directory levels above this script.
project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# Build dataset paths with os.path.join rather than concatenating a
# hard-coded '/' so the separator is correct on every platform.
train_file_path = os.path.join(project_root, 'data', 'tsv', 'train.tsv')
test_file_path = os.path.join(project_root, 'data', 'tsv', 'test.tsv')
print(train_file_path)
data = read_csv_file(csv_file_path=train_file_path, delimiter='\t')
# i counts the instances whose `features` collection is empty.
i = 0
for inst in data:
    if len(inst.features) == 0:
        # Show the offending instance so it can be inspected by hand.
        inst.print()
        i += 1
print('Data Points without Features :: ', i)
# tokens = inst.text.split()
# for token in tokens:
# if token not in feature_dict:
# feature_dict[token] = 1
# continue
# feature_dict[token] += 1
#
# for key in sorted(feature_dict, key=feature_dict.get, reverse=True):
# print(key, ' -> ', feature_dict.get(key))