diff --git a/csv2df.py b/csv2df.py index d3fcd66..bc924a2 100644 --- a/csv2df.py +++ b/csv2df.py @@ -1,10 +1,12 @@ from collections import OrderedDict - +import os import pandas as pd def get_book_content(): - df = pd.read_csv("test_example.csv", header=None).rename( + csv_path = os.path.dirname(os.path.realpath(__file__)) + '/test_example.csv' + print('Test CSV File :: ', csv_path) + df = pd.read_csv(csv_path, header=None).rename( columns={0: 'chapter', 1: 'sentence', 2: 'text'}) book_dict = OrderedDict() @@ -25,22 +27,24 @@ def get_book_content(): def get_book_metadata(): dict_metadata = { - "book_id": "abcdef", - "title": "Bullshit", + "book_id": "fdcap_book", + "title": "Crime and Punishment", "lang": "en", "isTranslation": "true", "totalChapters": "2", "authors": [ { - "name": "Herr Riley", + "name": "Herr Isaac Riley", "translator": "true" }, { - "name": "Herr Singh" + "name": "Fyodor Dostoevsky" } ], - "description": "Some Random Bullshit description", - "source": "https://www.idontcare.com" + "description": "Crime and Punishment (Russian: Преступление и наказание) is a novel written by Russian author " + "Fyodor Dostoevsky.First published in a journal named The Russian Messenger, it appeared in " + "twelve monthly installments in 1866, and was later published as a novel", + "source": "https://en.wikisource.org/wiki/Crime_and_Punishment" } return dict_metadata diff --git a/db_schema/db_schema.sql b/db_schema/db_schema.sql index d81fc44..a5b97bf 100644 --- a/db_schema/db_schema.sql +++ b/db_schema/db_schema.sql @@ -52,7 +52,7 @@ DROP TABLE IF EXISTS `bitext-aligner`.`dim_book_info` ; CREATE TABLE IF NOT EXISTS `bitext-aligner`.`dim_book_info` ( `id` INT NOT NULL AUTO_INCREMENT, `title` VARCHAR(90) NOT NULL, - `description` VARCHAR(200) NULL, + `description` VARCHAR(450) NULL, `lang` VARCHAR(5) NOT NULL, `source` VARCHAR(90) NOT NULL, `is_translation` TINYINT NOT NULL, @@ -124,7 +124,7 @@ DROP TABLE IF EXISTS `bitext-aligner`.`dim_book_sentence` ; CREATE TABLE IF NOT EXISTS `bitext-aligner`.`dim_book_sentence` ( `id` INT NOT NULL AUTO_INCREMENT, `s_num` INT UNSIGNED NOT NULL, - `text` VARCHAR(500) NOT NULL, + `text` VARCHAR(900) NOT NULL, `chapter` INT NOT NULL, PRIMARY KEY (`id`), CONSTRAINT `sen_chapter_fk` diff --git a/run.py b/run.py index 6f1055a..5be3377 100644 --- a/run.py +++ b/run.py @@ -24,7 +24,10 @@ def save_validated_files_to_db(): book_dict = read_xml.parse_xml_file(book['xml_file_path']) result = adb.add_book_to_db(book_code, book_dict) book['is_saved_to_db'] = result - print(const.BLUE, 'Result :: ', result, const.END, '\n') + w_str = const.WARNING + if result: + w_str = const.BLUE + print(w_str, 'Result :: ', result, const.END, '\n') json_data['books'] = books_json json_utils.write_json_file(const.JSON_PATH, json_data) diff --git a/test_example.csv b/test_example.csv index 958dda3..a753069 100644 --- a/test_example.csv +++ b/test_example.csv @@ -2,4 +2,5 @@ 1,2,"Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt." 1,3,"Neque porro quisquam est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et dolore magnam aliquam quaerat voluptatem." 2,1,"Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur?" -2,2,"Quis autem vel eum iure reprehenderit qui in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla pariatur?" \ No newline at end of file +2,2,"Quis autem vel eum iure reprehenderit qui in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla pariatur?" +2,3,"Neque porro quisquam est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et dolore magnam aliquam quaerat voluptatem." \ No newline at end of file diff --git a/utils/json_utils.py b/utils/json_utils.py index d02296c..976f05d 100644 --- a/utils/json_utils.py +++ b/utils/json_utils.py @@ -1,9 +1,9 @@ import json -from pathlib import Path +import os def read_json_file(file_path): - json_file_path = Path(file_path) + json_file_path = os.path.dirname(os.path.dirname(__file__))+'/'+file_path with open(json_file_path, 'r') as json_file: json_data = json.load(json_file) @@ -12,7 +12,7 @@ def read_json_file(file_path): def write_json_file(file_path, json_data): - json_file_path = Path(file_path) + json_file_path = os.path.dirname(os.path.dirname(__file__))+'/'+file_path with open(json_file_path, 'w') as updated_json: updated_json.write(json.dumps(json_data, indent=4)) diff --git a/xml_files/book_structure.xml b/xml_files/book_structure.xml index 18d93d4..1f19b8d 100644 --- a/xml_files/book_structure.xml +++ b/xml_files/book_structure.xml @@ -2,17 +2,17 @@ Crime and Punishment + en + true + 2 + https://en.wikisource.org/wiki/Crime_and_Punishment Crime and Punishment (Russian: Преступление и наказание) is a novel written by Russian author Fyodor Dostoevsky. First published in a journal named The Russian Messenger, it appeared in twelve monthly installments in 1866, and was later published as a novel. - en + n.a. Fyodor Dostoevsky Constance Garnett - https://en.wikisource.org/wiki/Crime_and_Punishment - true - 2 - n.a. diff --git a/xml_parser/__init__.py b/xml_parser/__init__.py index d3a4463..24662b6 100644 --- a/xml_parser/__init__.py +++ b/xml_parser/__init__.py @@ -1,12 +1,16 @@ from pathlib import Path import json +import utils.constants as const +import os -json_file_path = Path('json/books.json') + +json_path = os.path.dirname(os.path.dirname(__file__))+'/'+const.JSON_PATH +json_file_path = Path(json_path) json_data = {'books': {}} if not json_file_path.is_file(): json_file = open(json_file_path, 'w') json_file.write(json.dumps(json_data, indent=4)) json_file.close() - print('JSON File Created :: '+json_file.name) + print(const.BLUE, 'JSON File Created :: '+json_file.name, const.END) diff --git a/xml_parser/create_xml.py b/xml_parser/create_xml.py index ca90bab..c917bde 100644 --- a/xml_parser/create_xml.py +++ b/xml_parser/create_xml.py @@ -58,9 +58,9 @@ def create_xml_file(book_dict, book_metadata): filename = book_root.get('code') + "_" + lang.text + ".xml" file = open(output_dir + '/' + filename, 'w') file_path = file.name - print('XML File Path :: ', file_path) file.write(prettify(book_root)) file.close() + print(const.BLUE, 'Saved XML File Path :: ', file_path, const.END) json_obj = {} book_code = book_root.get('code') json_obj['xml_file'] = filename diff --git a/xml_parser/test_parser.py b/xml_parser/test_parser.py index 0ac315c..fc00ba8 100644 --- a/xml_parser/test_parser.py +++ b/xml_parser/test_parser.py @@ -1,11 +1,10 @@ from csv2df import get_book_content, get_book_metadata import xml_parser.create_xml as create_xml import xml_parser.read_xml as read_xml -import xmlschema -from pathlib import Path import xml_parser.validate as validate -# file_path = create_xml.create_xml_file(get_book_content(), get_book_metadata()) + +file_path = create_xml.create_xml_file(get_book_content(), get_book_metadata()) # print(file_path) diff --git a/xml_parser/validate.py b/xml_parser/validate.py index d763feb..a908ae0 100644 --- a/xml_parser/validate.py +++ b/xml_parser/validate.py @@ -1,8 +1,8 @@ import xmlschema import json -from pathlib import Path import utils.json_utils as json_utils import utils.constants as const +import os def is_valid(book_schema, xml_path): @@ -10,8 +10,8 @@ def is_valid(book_schema, xml_path): def get_book_schema(book_xsd_path): - xsd_path = Path(book_xsd_path) - book_schema = xmlschema.XMLSchema(str(xsd_path.absolute())) + xsd_full_path = os.path.dirname(os.path.dirname(__file__))+'/'+book_xsd_path + book_schema = xmlschema.XMLSchema(xsd_full_path) return book_schema