From 10c77c51c02ab09071f57f4f3ebd74d87e387ce0 Mon Sep 17 00:00:00 2001 From: Pavan Mandava Date: Thu, 23 Jan 2020 19:03:54 +0100 Subject: [PATCH] CSV Added for Books Data, text files folder changed, Added create xml to run.py --- books_data.csv | 16 ++++++++++++++ {txt => data}/crime_EN.txt | 0 {txt => data}/crime_FR.txt | 0 {txt => data}/crime_RU.txt | 0 {txt => data}/dead_EN.txt | 0 {txt => data}/dead_FR.txt | 0 {txt => data}/dead_RU.txt | 0 {txt => data}/karamazov_EN.txt | 0 {txt => data}/karamazov_FR.txt | 0 {txt => data}/karamazov_RU.txt | 0 {txt => data}/poor_EN.txt | 0 {txt => data}/poor_FR.txt | 0 {txt => data}/poor_RU.txt | 0 {txt => data}/underground_EN.txt | 0 {txt => data}/underground_FR.txt | 0 {txt => data}/underground_RU.txt | 0 run.py | 11 ++++++++-- txt_parser/csv_utils.py | 37 ++++++++++++++++++++++++++++++++ txt_parser/test_txt.py | 10 +++++++++ txt_parser/txt_cleaner.py | 2 +- utils/constants.py | 4 ++++ 21 files changed, 77 insertions(+), 3 deletions(-) create mode 100644 books_data.csv rename {txt => data}/crime_EN.txt (100%) rename {txt => data}/crime_FR.txt (100%) rename {txt => data}/crime_RU.txt (100%) rename {txt => data}/dead_EN.txt (100%) rename {txt => data}/dead_FR.txt (100%) rename {txt => data}/dead_RU.txt (100%) rename {txt => data}/karamazov_EN.txt (100%) rename {txt => data}/karamazov_FR.txt (100%) rename {txt => data}/karamazov_RU.txt (100%) rename {txt => data}/poor_EN.txt (100%) rename {txt => data}/poor_FR.txt (100%) rename {txt => data}/poor_RU.txt (100%) rename {txt => data}/underground_EN.txt (100%) rename {txt => data}/underground_FR.txt (100%) rename {txt => data}/underground_RU.txt (100%) create mode 100644 txt_parser/csv_utils.py create mode 100644 txt_parser/test_txt.py diff --git a/books_data.csv b/books_data.csv new file mode 100644 index 0000000..accd7c1 --- /dev/null +++ b/books_data.csv @@ -0,0 +1,16 @@ +Index;BookCode;Language;BookName;Status +1;dost_cap;en;crime_EN.txt; +2;dost_cap;fr;crime_FR.txt; +3;dost_cap;ru;crime_RU.txt; +4;dost_deadhouse;en;dead_EN.txt; +5;dost_deadhouse;fr;dead_FR.txt; +6;dost_deadhouse;ru;dead_RU.txt; +7;dost_karamazov;en;karamazov_EN.txt; +8;dost_karamazov;fr;karamazov_FR.txt; +9;dost_karamazov;ru;karamazov_RU.txt; +10;dost_poorfolk;en;poor_EN.txt; +11;dost_poorfolk;fr;poor_FR.txt; +12;dost_poorfolk;ru;poor_RU.txt; +13;dost_underground;en;underground_EN.txt; +14;dost_underground;fr;underground_FR.txt; +15;dost_underground;ru;underground_RU.txt; diff --git a/txt/crime_EN.txt b/data/crime_EN.txt similarity index 100% rename from txt/crime_EN.txt rename to data/crime_EN.txt diff --git a/txt/crime_FR.txt b/data/crime_FR.txt similarity index 100% rename from txt/crime_FR.txt rename to data/crime_FR.txt diff --git a/txt/crime_RU.txt b/data/crime_RU.txt similarity index 100% rename from txt/crime_RU.txt rename to data/crime_RU.txt diff --git a/txt/dead_EN.txt b/data/dead_EN.txt similarity index 100% rename from txt/dead_EN.txt rename to data/dead_EN.txt diff --git a/txt/dead_FR.txt b/data/dead_FR.txt similarity index 100% rename from txt/dead_FR.txt rename to data/dead_FR.txt diff --git a/txt/dead_RU.txt b/data/dead_RU.txt similarity index 100% rename from txt/dead_RU.txt rename to data/dead_RU.txt diff --git a/txt/karamazov_EN.txt b/data/karamazov_EN.txt similarity index 100% rename from txt/karamazov_EN.txt rename to data/karamazov_EN.txt diff --git a/txt/karamazov_FR.txt b/data/karamazov_FR.txt similarity index 100% rename from txt/karamazov_FR.txt rename to data/karamazov_FR.txt diff --git a/txt/karamazov_RU.txt b/data/karamazov_RU.txt similarity index 100% rename from txt/karamazov_RU.txt rename to data/karamazov_RU.txt diff --git a/txt/poor_EN.txt b/data/poor_EN.txt similarity index 100% rename from txt/poor_EN.txt rename to data/poor_EN.txt diff --git a/txt/poor_FR.txt b/data/poor_FR.txt similarity index 100% rename from txt/poor_FR.txt rename to data/poor_FR.txt diff --git a/txt/poor_RU.txt b/data/poor_RU.txt similarity index 100% rename from txt/poor_RU.txt rename to data/poor_RU.txt diff --git a/txt/underground_EN.txt b/data/underground_EN.txt similarity index 100% rename from txt/underground_EN.txt rename to data/underground_EN.txt diff --git a/txt/underground_FR.txt b/data/underground_FR.txt similarity index 100% rename from txt/underground_FR.txt rename to data/underground_FR.txt diff --git a/txt/underground_RU.txt b/data/underground_RU.txt similarity index 100% rename from txt/underground_RU.txt rename to data/underground_RU.txt diff --git a/run.py b/run.py index 0a0c5aa..f6d4028 100644 --- a/run.py +++ b/run.py @@ -4,6 +4,8 @@ import xml_parser.validate as validate import utils.json_utils as json_utils import utils.constants as const import utils.env_utils as env +import xml_parser.create_xml as create_xml +from csv2df import get_book_content, get_book_metadata def validate_all_xml_files(): @@ -33,6 +35,11 @@ def save_validated_files_to_db(): json_utils.write_json_file(const.JSON_PATH, json_data) +def create_xml_file(book_content_dict, book_metadata_dict): + create_xml.create_xml_file(book_content_dict, book_metadata_dict) + + if env.check_env_variables(): - validate_all_xml_files() - # save_validated_files_to_db() \ No newline at end of file + create_xml_file(get_book_content(), get_book_metadata()) + # validate_all_xml_files() + # save_validated_files_to_db() diff --git a/txt_parser/csv_utils.py b/txt_parser/csv_utils.py new file mode 100644 index 0000000..acacf4a --- /dev/null +++ b/txt_parser/csv_utils.py @@ -0,0 +1,37 @@ +import csv +import os +import utils.constants as const + + +csv_header_row = ['Index', 'BookCode', 'Language', 'BookName', 'Status'] + + +def read_books_csv_file(csv_file_name): + csv_file_path = os.path.dirname(os.path.dirname(__file__))+'/'+csv_file_name + with open(csv_file_path, 'r') as file: + books_data = csv.reader(file, delimiter=';') + books_list = [] + is_header = True + for book in books_data: + if is_header: + is_header = False + continue + books_list.append(book) + return books_list + + +def write_books_data_to_csv(csv_file_name, books_list): + csv_file_path = os.path.dirname(os.path.dirname(__file__))+'/'+csv_file_name + with open(csv_file_path, 'w') as file: + writer = csv.writer(file, delimiter=';') + writer.writerow(csv_header_row) + for book in books_list: + writer.writerow(book) + + +def read_data_file(file_path): + txt_file_path = os.path.dirname(os.path.dirname(__file__)) + const.DATA_FOLDER + file_path + with open(txt_file_path, 'r') as file: + lines = file.readline() + file.close() + return lines \ No newline at end of file diff --git a/txt_parser/test_txt.py b/txt_parser/test_txt.py new file mode 100644 index 0000000..b4a27e5 --- /dev/null +++ b/txt_parser/test_txt.py @@ -0,0 +1,10 @@ +import txt_parser.csv_utils as read_csv +import utils.constants as const + +books_list = read_csv.read_books_csv_file(const.CSV_FILE) + +for book in books_list: + print(book) + print(type(book)) + +read_csv.write_books_data_to_csv(const.CSV_FILE, books_list) \ No newline at end of file diff --git a/txt_parser/txt_cleaner.py b/txt_parser/txt_cleaner.py index a7d7be1..c8b5b1f 100644 --- a/txt_parser/txt_cleaner.py +++ b/txt_parser/txt_cleaner.py @@ -11,7 +11,7 @@ import pandas as pd def get_text(): - file = open("C:\\Users\\Nerv\\Text-Technology\\Aligner\\txt\\crime_EN.txt", 'r') + file = open("C:\\Users\\Nerv\\Text-Technology\\Aligner\\data\\crime_EN.data", 'r') lines = file.readlines() file.close() count = 0 diff --git a/utils/constants.py b/utils/constants.py index 9d91590..22ce12a 100644 --- a/utils/constants.py +++ b/utils/constants.py @@ -2,6 +2,10 @@ JSON_PATH = 'json/books.json' XSD_PATH = 'xml_parser/book.xsd' +CSV_FILE = 'books_data.csv' + +DATA_FOLDER = '/data/' + TRANSLATE_ENV_VAR = 'GOOGLE_APPLICATION_CREDENTIALS' MYSQL_PASS_ENV_VAR = 'MYSQL_PASSWORD'