def read_db_config(filename, section):
    """ Read one section of a database configuration file as a dictionary.

    The ``password`` option in the file does not hold the password itself but
    the *name* of an environment variable; when that variable is set, its
    value replaces the name in the returned dictionary.

    :param filename: name of the configuration file
    :param section: section of database configuration
    :return: a dictionary of database parameters
    :raises FileNotFoundError: if the file is missing or cannot be opened
    :raises Exception: if the file does not contain the requested section
    """

    parser = ConfigParser()
    # ConfigParser.read() skips files it cannot open and returns the list of
    # files it actually parsed, so an empty result means the path is wrong.
    # Without this check a bad path is reported as a missing section.
    if not parser.read(filename):
        raise FileNotFoundError(
            'Configuration file {0} could not be read'.format(filename))

    if not parser.has_section(section):
        raise Exception('{0} not found in the {1} file'.format(section, filename))

    db = dict(parser.items(section))

    # A section without a password option has nothing to resolve; look the
    # option up with get() so its absence does not raise.
    env_var = db.get('password')
    if env_var is not None:
        if env_var in os.environ:
            db['password'] = os.environ[env_var]
        else:
            # Best effort: warn and leave the variable name in place.
            print('Please set the Environment Variable ', env_var)

    return db
a/db_config.ini b/db_config.ini new file mode 100644 index 0000000..85719a5 --- /dev/null +++ b/db_config.ini @@ -0,0 +1,6 @@ +[mysql] +host = 127.0.0.1 +port = 3306 +database = bitext-aligner +user = root +password = MYSQL_PASSWORD \ No newline at end of file diff --git a/db_schema/db_schema.pdf b/db_schema/db_schema.pdf index c158807..e93ad6a 100644 Binary files a/db_schema/db_schema.pdf and b/db_schema/db_schema.pdf differ diff --git a/db_schema/db_schema.png b/db_schema/db_schema.png index b6befc6..62e448f 100644 Binary files a/db_schema/db_schema.png and b/db_schema/db_schema.png differ diff --git a/db_schema/db_schema.sql b/db_schema/db_schema.sql index 78ef29b..1172375 100644 --- a/db_schema/db_schema.sql +++ b/db_schema/db_schema.sql @@ -1,5 +1,5 @@ -- MySQL Script generated by MySQL Workbench --- Thu Jan 16 23:41:59 2020 +-- Sat Jan 18 00:33:33 2020 -- Model: New Model Version: 1.0 -- MySQL Workbench Forward Engineering @@ -35,6 +35,7 @@ ENGINE = InnoDB; CREATE TABLE IF NOT EXISTS `bitext-aligner`.`dim_book` ( `id` INT NOT NULL, `code` VARCHAR(90) NOT NULL, + `added_at` BIGINT UNSIGNED NOT NULL, PRIMARY KEY (`id`)) ENGINE = InnoDB; diff --git a/xml_parser/__init__.py b/xml_parser/__init__.py index 4677764..d3a4463 100644 --- a/xml_parser/__init__.py +++ b/xml_parser/__init__.py @@ -3,7 +3,7 @@ import json json_file_path = Path('json/books.json') -json_data = {'books': []} +json_data = {'books': {}} if not json_file_path.is_file(): json_file = open(json_file_path, 'w') json_file.write(json.dumps(json_data, indent=4)) diff --git a/xml_parser/create_xml.py b/xml_parser/create_xml.py index 4109a5d..83b29a7 100644 --- a/xml_parser/create_xml.py +++ b/xml_parser/create_xml.py @@ -48,38 +48,45 @@ def create_xml_file(book_dict, book_metadata): chapter.set('num', str(key)) for idx, val in enumerate(book_dict[key]): sentence = ET.SubElement(chapter, 'sentence') - sentence.set('id', str(idx + 1)) + sentence.set('num', str(idx + 1)) sentence.text = val # tree = 
def parse_xml_file(full_path):
    """ Parse a book XML file into a nested dictionary.

    The returned dictionary has three keys:

    * ``code`` - the ``code`` attribute of the root element
    * ``bookInfo`` - an ``authors`` list (one dictionary per ``author``
      element with its text as ``name`` and, if present, its ``translator``
      attribute) plus one entry per other child, keyed by tag name
    * ``content`` - a dictionary whose ``chapters`` list holds one dictionary
      per child of ``content`` with ``num``, an optional ``name`` and a
      ``sentences`` dictionary mapping each sentence ``num`` to its text

    A document without a ``bookInfo`` or ``content`` element yields an empty
    ``authors`` or ``chapters`` list.

    :param full_path: path of the XML file to read
    :return: the dictionary described above
    :raises KeyError: if the root lacks ``code`` or a chapter/sentence lacks ``num``
    """
    book_root = ET.parse(full_path).getroot()

    return {
        'code': book_root.attrib['code'],
        'bookInfo': _parse_book_info(book_root.find('bookInfo')),
        'content': {'chapters': _parse_chapters(book_root.find('content'))},
    }


def _parse_book_info(info_element):
    """ Collect the authors and remaining metadata fields of ``bookInfo``. """
    book_info = {'authors': []}
    # find() returns None for an absent element. Compare with None explicitly,
    # because an Element that has no children tests as false.
    if info_element is None:
        return book_info

    for child in info_element:
        if child.tag == 'author':
            author = {'name': child.text}
            if 'translator' in child.attrib:
                author['translator'] = child.attrib['translator']
            book_info['authors'].append(author)
        else:
            book_info[child.tag] = child.text
    return book_info


def _parse_chapters(content_element):
    """ Build one chapter dictionary for every child of ``content``. """
    if content_element is None:
        return []

    chapters = []
    for chapter in content_element:
        chapter_dict = {'num': chapter.attrib['num']}
        if 'name' in chapter.attrib:
            chapter_dict['name'] = chapter.attrib['name']
        chapter_dict['sentences'] = {
            sentence.attrib['num']: sentence.text
            for sentence in chapter.findall('sentence')
        }
        chapters.append(chapter_dict)
    return chapters