diff --git a/.gitignore b/.gitignore index 872c981..5d2709c 100644 --- a/.gitignore +++ b/.gitignore @@ -129,3 +129,4 @@ dmypy.json .pyre/ .idea +*.mwb \ No newline at end of file diff --git a/bitext_align.py b/bitext_align.py new file mode 100644 index 0000000..5068a02 --- /dev/null +++ b/bitext_align.py @@ -0,0 +1,170 @@ +# -*- coding: utf-8 -*- + + +import os,sys +import re +import pandas as pd +import numpy as np +from numpy import cumsum +from pandas import DataFrame +from nltk import word_tokenize, sent_tokenize +#import xml.etree.ElementTree as ET +from jellyfish import levenshtein_distance as lev +#import six +from google.cloud import translate_v2 as translate +from itertools import product as cp + +translate_client = translate.Client() + +''' + +''' + + + +def master_align(text0, text1, lang0, lang1): + """ Takes two equivalent texts (original and translation) and returns + aligned texts. """ + df0 = frame_from_text(text0, lang0, lang1) + print('A') + df1 = frame_from_text(text1, lang1, lang0, is1=True) + print('B') + # returns dfs with ['sent', 'trans', 'rellen', 'relpos'] + anchors = anchors_from_frames(df0, df1, window=2) + print('C') + alignments = intermediate_align(df0, df1, anchors, lookahead=4) + print('D') + textdict0, textdict1 = textdicts_from_alignments(df0, df1, alignments) + print('E') + return textdict0, textdict1 + + +def frame_from_text(text, source, target, is1=False): # + """ """ # + #print(source, '-->', target) + cols = [c+str(int(is1)) for c in ['sent','trans','rellen','relpos']] + #print(cols) + frame = pd.DataFrame(columns=cols) + frame[cols[0]] = sent_tokenize(text) + frame[cols[1]] = frame[cols[0]].apply(lambda x: translate_client.translate(x, source_language=source, target_language=target, model='nmt')['translatedText']) + frame[cols[2]] = frame[cols[0]].apply(lambda x: len(x)) + frame[cols[2]] = frame[cols[2]]/frame[cols[2]].max() + cumul_b = list(np.cumsum(frame[cols[2]])) + cumul_a = [0]+cumul_b[:-1] + frame[cols[3]] = 
pd.Series(list(zip(cumul_a, cumul_b))) + #print(frame[[cols[0], cols[1]]]) + return frame + + +def anchors_from_frames(frame0, frame1, window): # + """ Return the [index0, index1] sentence pairs that qualify as alignment anchors. Candidate pairs come from generate_pairdf; each is scored with trdist between a sentence and the other side's translation, in both directions. A pair is an anchor when the smaller distance is < 0.45, the larger distance is < 0.6 and the gr1-adjusted relative-length ratio is < 1.3. Side effect: adds an 'index0' / 'index1' column to frame0 / frame1. """ + pairdf = generate_pairdf(frame0, frame1, window) + frame0['index0'] = frame0.index + frame1['index1'] = frame1.index + pairdf = pairdf.merge(frame0, on='index0').merge(frame1, on='index1') + pairdf['lev0'] = pairdf.apply(lambda x: trdist(x.sent0, x.trans1), axis=1) + pairdf['lev1'] = pairdf.apply(lambda x: trdist(x.sent1, x.trans0), axis=1) + pairdf['rellen_ratio'] = (pairdf.rellen0/pairdf.rellen1).apply(gr1) + pairdf['minlev'] = pairdf[['lev0', 'lev1']].min(axis=1) + pairdf['maxlev'] = pairdf[['lev0', 'lev1']].max(axis=1) + pairdf['isanchor'] = (pairdf.minlev<0.45) & (pairdf.maxlev<0.6) & (pairdf.rellen_ratio<1.3) + return list(pairdf[pairdf.isanchor][['index0','index1']].values) + + +def intermediate_align(frame0, frame1, anchs, lookahead): # + """ """ + aligns = [] + end0, end1 = frame0.shape[0], frame1.shape[0] + anchor_ranges = list(zip([(-1,-1)]+anchs, anchs+[(end0, end1)])) + for rang in anchor_ranges: + interaligns = get_interalign(frame0, frame1, *rang, lookahead) + a,b = rang[0] + aligns.append(((a,b),(a,b))) + aligns.extend(interaligns) + return aligns[1:] # format [((i_start, i_end),(j_start, j_end))] + + +def get_interalign(df0, df1, anchors_init, anchors_next, lookahead): # + """ """ + print(anchors_init, anchors_next) + interaligns = [] + i,j = anchors_init + i+=1 + j+=1 + end0, end1 = anchors_next + while i0] + len0 = frame0.shape[0] + len1 = frame1.shape[0] + allpairs = [] + for i,j in overlap: + for k in range(-window, window+1): + for l in range(-window, window+1): + allpairs.append((i+k,j+l)) + allpairs = [(a,b) for a,b in allpairs if min(a,b)>-1 and ac and b<=d: + return b-max(a,c) + elif a>=c and a=a and c1 else x # +trdist = lambda x,y: lev(x,y)/max(len(x),len(y)) # + + diff --git a/csv2df.py b/csv2df.py index d3fcd66..bc924a2 100644 --- a/csv2df.py +++ b/csv2df.py @@ -1,10 +1,12 @@ from 
collections import OrderedDict - +import os import pandas as pd def get_book_content(): - df = pd.read_csv("test_example.csv", header=None).rename( + csv_path = os.path.dirname(os.path.realpath(__file__)) + '/test_example.csv' + print('Test CSV File :: ', csv_path) + df = pd.read_csv(csv_path, header=None).rename( columns={0: 'chapter', 1: 'sentence', 2: 'text'}) book_dict = OrderedDict() @@ -25,22 +27,24 @@ def get_book_content(): def get_book_metadata(): dict_metadata = { - "book_id": "abcdef", - "title": "Bullshit", + "book_id": "fdcap_book", + "title": "Crime and Punishment", "lang": "en", "isTranslation": "true", "totalChapters": "2", "authors": [ { - "name": "Herr Riley", + "name": "Herr Isaac Riley", "translator": "true" }, { - "name": "Herr Singh" + "name": "Fyodor Dostoevsky" } ], - "description": "Some Random Bullshit description", - "source": "https://www.idontcare.com" + "description": "Crime and Punishment (Russian: Преступление и наказание) is a novel written by Russian author " + "Fyodor Dostoevsky.First published in a journal named The Russian Messenger, it appeared in " + "twelve monthly installments in 1866, and was later published as a novel", + "source": "https://en.wikisource.org/wiki/Crime_and_Punishment" } return dict_metadata diff --git a/db/__init__.py b/db/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/db/add_book.py b/db/add_book.py new file mode 100644 index 0000000..c3acb86 --- /dev/null +++ b/db/add_book.py @@ -0,0 +1,271 @@ +import time +import db.mysql_connection as mysql +import db.constants as const + + +def add_book_to_db(book_code, book_dict): + # print('Adding Book Code :: ', book_code, ' Dict :: ', book_dict) + + conn = mysql.get_new_mysql_connection(const.DB_CONFIG_FILE) + if conn is None: + return False + + db_cursor = conn.cursor(buffered=True) + + # add book data to the Table First + book_row = { + 'code': book_code, + 'added_at': int(time.time()) + } + + # returns the last row id, if row added to the 
table successfully + last_rowid = add_book_row_to_table(db_cursor, const.BOOK_INSERT_QUERY, book_row) + book_id = last_rowid + print('Book Row Id :: ', last_rowid) + + book_info_dict = book_dict['bookInfo'] + if last_rowid > 0: + book_info_row = { + 'title': book_info_dict['title'], + 'description': book_info_dict['description'] if 'description' in book_info_dict else None, + 'lang': book_info_dict['lang'], + 'source': book_info_dict['source'], + 'is_translation': 'true' == book_info_dict['isTranslation'].lower(), + 'total_chapters': book_info_dict['totalChapters'], + 'isbn': book_info_dict['isbn'] if 'isbn' in book_info_dict else None, + 'book': book_id + } + + # returns the last row id, if row added to the table successfully + last_rowid = add_book_info_row_to_table(db_cursor, const.BOOK_INFO_INSERT_QUERY, book_info_row) + print('Book Info Row Id :: ', last_rowid) + + if last_rowid > 0: + book_info_id = last_rowid + authors_list = book_info_dict['authors'] + for author in authors_list: + author_row = { + 'id': -1, + 'name': author['name'].strip().lower(), + 'total_books': 1 + } + author_row = search_author(db_cursor, const.AUTHOR_SEARCH_QUERY, author_row) + print('Author Search Result :: ', author_row) + if author_row['id'] > 0: + author_row['total_books'] = author_row['total_books'] + 1 + last_rowid = update_author_book_count(db_cursor, const.AUTHOR_UPDATE_QUERY, author_row) + print('Author Update Row count :: ', last_rowid) + if last_rowid <= 0: + break + else: + author_row['name'] = author['name'] + author_row['total_books'] = 1 + last_rowid = add_author_to_table(db_cursor, const.AUTHOR_INSERT_QUERY, author_row) + print('Add Author Row Id :: ', last_rowid) + if last_rowid > 0: + author_row['id'] = last_rowid + + if author_row['id'] > 0: + author_is_translator = False + if 'translator' in author: + author_is_translator = 'true' == author['translator'].lower() + map_author_book = { + 'author': author_row['id'], + 'book': book_info_id, + 'translator': 
author_is_translator + } + + last_rowid = add_author_book_mapping(db_cursor, const.BOOK_AUTHOR_INSERT_QUERY, map_author_book) + print('Author Book Mapping Row ID :: ', last_rowid) + if last_rowid < 0: + break + + if last_rowid > 0: + book_content_row = { + 'book': book_id + } + + # returns the last row id, if row added to the table successfully + last_rowid = add_book_content_row_to_table(db_cursor, const.CONTENT_INSERT_QUERY, book_content_row) + print('Book Content Row Id :: ', last_rowid) + + if last_rowid > 0: + content_id = last_rowid + book_chapters_list = book_dict['content']['chapters'] + for chapter in book_chapters_list: + book_chapter_row = { + 'c_num': chapter['num'], + 'name': chapter['name'] if 'name' in chapter else None, + 'book_content': content_id + } + chapter_id = add_book_chapter_to_table(db_cursor, const.CHAPTER_INSERT_QUERY, book_chapter_row) + print('Book Chapter Row Id :: ', chapter_id) + if chapter_id > 0: + sentences_dict = chapter['sentences'] + for s_num in sentences_dict.keys(): + sentence_row = { + 's_num': s_num, + 'text': sentences_dict[s_num], + 'chapter': chapter_id + } + sen_id = add_book_sentence_to_table(db_cursor, const.SENTENCE_INSERT_QUERY, sentence_row) + print('Book Sentence Id :: ', sen_id) + if sen_id <= 0: + break + else: + last_rowid = sen_id + else: + break + + db_cursor.close() + + is_success = False + if last_rowid > 0: + conn.commit() + is_success = True + else: + conn.rollback() + is_success = False + + conn.close() + + return is_success + + +def add_book_row_to_table(db_cursor, book_insert_query, book_row): + try: + # Insert this Book row to Table + db_cursor.execute(book_insert_query, book_row) + book_id = db_cursor.lastrowid + if book_id is not None: + return book_id + else: + return -1 + + except Exception as e: + print(str(e)) + return -1 + + +def add_book_info_row_to_table(db_cursor, book_info_insert_query, book_info_row): + try: + # Insert this BookInfo row + db_cursor.execute(book_info_insert_query, 
book_info_row) + book_info_id = db_cursor.lastrowid + if book_info_id is not None: + return book_info_id + else: + return -1 + + except Exception as e: + print(str(e)) + return -1 + + +def add_book_content_row_to_table(db_cursor, book_content_insert_query, book_content_row): + try: + # Insert Book Content row + db_cursor.execute(book_content_insert_query, book_content_row) + book_content_id = db_cursor.lastrowid + if book_content_id is not None: + return book_content_id + else: + return -1 + + except Exception as e: + print(str(e)) + return -1 + + +def add_book_chapter_to_table(db_cursor, book_chapter_insert_query, book_chapter_row): + try: + # Insert Book chapter row + db_cursor.execute(book_chapter_insert_query, book_chapter_row) + book_chapter_id = db_cursor.lastrowid + if book_chapter_id is not None: + return book_chapter_id + else: + return -1 + + except Exception as e: + print(str(e)) + return -1 + + +def add_book_sentence_to_table(db_cursor, book_sentence_insert_query, book_sentence): + try: + # Insert sentence + db_cursor.execute(book_sentence_insert_query, book_sentence) + book_sen_id = db_cursor.lastrowid + if book_sen_id is not None: + return book_sen_id + else: + return -1 + + except Exception as e: + print(str(e)) + return -1 + + +def add_author_to_table(db_cursor, author_insert_query, author_data): + try: + # Insert Author + db_cursor.execute(author_insert_query, author_data) + author_id = db_cursor.lastrowid + if author_id is not None: + return author_id + else: + return -1 + + except Exception as e: + print(str(e)) + return -1 + + +def add_author_book_mapping(db_cursor, book_author_insert_query, book_author_data): + try: + # Insert Book Author Mapping + db_cursor.execute(book_author_insert_query, book_author_data) + map_id = db_cursor.rowcount + if map_id > 0: + return map_id + else: + return -1 + + except Exception as e: + print(str(e)) + return -1 + + +def search_author(db_cursor, author_search_query, author_data): + try: + # Search Author + 
db_cursor.execute(author_search_query, author_data) + row = db_cursor.fetchone() + if row is not None: + author_data['id'] = int(row[0]) + author_data['total_books'] = int(row[2]) + return author_data + else: + author_data['id'] = -1 + return author_data + + except Exception as e: + print(str(e)) + author_data['id'] = -1 + return author_data + + +def update_author_book_count(db_cursor, author_update_query, author_data): + try: + # Update Author Book Count + db_cursor.execute(author_update_query, author_data) + row_cnt = db_cursor.rowcount + if row_cnt > 0: + return row_cnt + else: + return -1 + + except Exception as e: + print(str(e)) + return -1 diff --git a/db/constants.py b/db/constants.py new file mode 100644 index 0000000..fdd5fdf --- /dev/null +++ b/db/constants.py @@ -0,0 +1,26 @@ +DB_CONFIG_FILE = 'db_config.ini' + +BOOK_INSERT_QUERY = "INSERT INTO dim_book (code, added_at) " \ + "VALUES (%(code)s, %(added_at)s)" + +AUTHOR_INSERT_QUERY = "INSERT INTO dim_author (name, total_books) " \ + "VALUES (%(name)s, %(total_books)s)" + +BOOK_INFO_INSERT_QUERY = "INSERT INTO dim_book_info (title, description, lang, source, is_translation, " \ + "total_chapters, isbn, book) " \ + "VALUES (%(title)s, %(description)s, %(lang)s, %(source)s, %(is_translation)s, " \ + "%(total_chapters)s, %(isbn)s, %(book)s) " + +BOOK_AUTHOR_INSERT_QUERY = "INSERT INTO map_book_author (author, book, translator) " \ + "VALUES (%(author)s, %(book)s, %(translator)s)" + +CONTENT_INSERT_QUERY = "INSERT INTO dim_book_content (book) VALUES(%(book)s)" + +CHAPTER_INSERT_QUERY = "INSERT INTO dim_book_chapter (c_num, name, book_content) " \ + "VALUES (%(c_num)s, %(name)s, %(book_content)s)" + +SENTENCE_INSERT_QUERY = "INSERT INTO dim_book_sentence (s_num, text, chapter) VALUES (%(s_num)s, %(text)s, %(chapter)s)" + +AUTHOR_SEARCH_QUERY = "SELECT * FROM dim_author WHERE dim_author.name = %(name)s" + +AUTHOR_UPDATE_QUERY = "UPDATE dim_author SET dim_author.total_books = %(total_books)s WHERE id = %(id)s" 
\ No newline at end of file diff --git a/db/mysql_connection.py b/db/mysql_connection.py new file mode 100644 index 0000000..8c0d81f --- /dev/null +++ b/db/mysql_connection.py @@ -0,0 +1,32 @@ +import mysql.connector +from mysql.connector import errorcode +import db.read_config as config +import utils.constants as const +import os + + +def get_new_mysql_connection(config_file_name): + + config_file_path = os.path.dirname(os.path.dirname(__file__))+'/'+config_file_name + db_config = config.read_db_config(config_file_path, 'mysql') + + connection = None + + try: + connection = mysql.connector.connect(**db_config) + except mysql.connector.Error as err: + if err.errno == errorcode.ER_ACCESS_DENIED_ERROR: + print(const.WARNING, 'Invalid Database User and Password', const.END) + elif err.errno == errorcode.ER_BAD_DB_ERROR: + print(const.WARNING, 'Database doesn\'t exist ', const.END) + else: + print(err) + + if connection is not None: + if connection.is_connected(): + connection.autocommit = False + print(const.GREEN, 'MySQL Connection Successful => Connection ID :: ', connection.connection_id, const.END) + else: + connection = None + + return connection diff --git a/db/read_config.py b/db/read_config.py new file mode 100644 index 0000000..decf3e6 --- /dev/null +++ b/db/read_config.py @@ -0,0 +1,33 @@ +from configparser import ConfigParser +import os + + +def read_db_config(filename, section): + """ Read database configuration file and return a dictionary object + :param filename: name of the configuration file + :param section: section of database configuration + :return: a dictionary of database parameters + """ + + parser = ConfigParser() + parser.read(filename) + + db = {} + if parser.has_section(section): + items = parser.items(section) + for item in items: + db[item[0]] = item[1] + else: + raise Exception('{0} not found in the {1} file'.format(section, filename)) + + try: + db['password'] = os.environ[db['password']] + except KeyError: + print('Please set the 
Environment Variable ', db['password']) + + try: + db['host'] = os.environ[db['host']] + except KeyError: + print('Please set the Environment Variable ', db['host']) + + return db diff --git a/db/test_db.py b/db/test_db.py new file mode 100644 index 0000000..1c048e9 --- /dev/null +++ b/db/test_db.py @@ -0,0 +1,10 @@ +import db.mysql_connection as connection +import db.constants as const + +conn = connection.get_new_mysql_connection(const.DB_CONFIG_FILE) + +print('MySQL Server version :: ', conn.get_server_info()) +print('isConnected :: ', conn.is_connected()) + +conn.close() + diff --git a/db_config.ini b/db_config.ini new file mode 100644 index 0000000..402e8c0 --- /dev/null +++ b/db_config.ini @@ -0,0 +1,6 @@ +[mysql] +host = MYSQL_HOST +port = 3306 +database = bitext-aligner +user = root +password = MYSQL_PASSWORD \ No newline at end of file diff --git a/db_schema/db_schema.pdf b/db_schema/db_schema.pdf index c158807..e93ad6a 100644 Binary files a/db_schema/db_schema.pdf and b/db_schema/db_schema.pdf differ diff --git a/db_schema/db_schema.png b/db_schema/db_schema.png index b6befc6..62e448f 100644 Binary files a/db_schema/db_schema.png and b/db_schema/db_schema.png differ diff --git a/db_schema/db_schema.sql b/db_schema/db_schema.sql index 78ef29b..cdc446b 100644 --- a/db_schema/db_schema.sql +++ b/db_schema/db_schema.sql @@ -1,5 +1,5 @@ -- MySQL Script generated by MySQL Workbench --- Thu Jan 16 23:41:59 2020 +-- Wed Jan 22 11:08:37 2020 -- Model: New Model Version: 1.0 -- MySQL Workbench Forward Engineering @@ -21,6 +21,8 @@ USE `bitext-aligner` ; -- ----------------------------------------------------- -- Table `bitext-aligner`.`dim_author` -- ----------------------------------------------------- +DROP TABLE IF EXISTS `bitext-aligner`.`dim_author` ; + CREATE TABLE IF NOT EXISTS `bitext-aligner`.`dim_author` ( `id` INT NOT NULL AUTO_INCREMENT, `name` VARCHAR(90) NOT NULL, @@ -32,9 +34,12 @@ ENGINE = InnoDB; -- 
----------------------------------------------------- -- Table `bitext-aligner`.`dim_book` -- ----------------------------------------------------- +DROP TABLE IF EXISTS `bitext-aligner`.`dim_book` ; + CREATE TABLE IF NOT EXISTS `bitext-aligner`.`dim_book` ( - `id` INT NOT NULL, + `id` INT NOT NULL AUTO_INCREMENT, `code` VARCHAR(90) NOT NULL, + `added_at` BIGINT UNSIGNED NOT NULL, PRIMARY KEY (`id`)) ENGINE = InnoDB; @@ -42,10 +47,12 @@ ENGINE = InnoDB; -- ----------------------------------------------------- -- Table `bitext-aligner`.`dim_book_info` -- ----------------------------------------------------- +DROP TABLE IF EXISTS `bitext-aligner`.`dim_book_info` ; + CREATE TABLE IF NOT EXISTS `bitext-aligner`.`dim_book_info` ( `id` INT NOT NULL AUTO_INCREMENT, `title` VARCHAR(90) NOT NULL, - `description` VARCHAR(200) NULL, + `description` VARCHAR(500) NULL, `lang` VARCHAR(5) NOT NULL, `source` VARCHAR(90) NOT NULL, `is_translation` TINYINT NOT NULL, @@ -53,6 +60,9 @@ CREATE TABLE IF NOT EXISTS `bitext-aligner`.`dim_book_info` ( `isbn` VARCHAR(80) NULL, `book` INT NOT NULL, PRIMARY KEY (`id`), + INDEX `book_fk_idx` (`book` ASC), + UNIQUE INDEX `book_UNIQUE` (`book` ASC), + UNIQUE INDEX `id_UNIQUE` (`id` ASC), CONSTRAINT `info_book_fk` FOREIGN KEY (`book`) REFERENCES `bitext-aligner`.`dim_book` (`id`) @@ -60,20 +70,18 @@ CREATE TABLE IF NOT EXISTS `bitext-aligner`.`dim_book_info` ( ON UPDATE NO ACTION) ENGINE = InnoDB; -CREATE INDEX `book_fk_idx` ON `bitext-aligner`.`dim_book_info` (`book` ASC) VISIBLE; - -CREATE UNIQUE INDEX `book_UNIQUE` ON `bitext-aligner`.`dim_book_info` (`book` ASC) VISIBLE; - -CREATE UNIQUE INDEX `id_UNIQUE` ON `bitext-aligner`.`dim_book_info` (`id` ASC) VISIBLE; - -- ----------------------------------------------------- -- Table `bitext-aligner`.`dim_book_content` -- ----------------------------------------------------- +DROP TABLE IF EXISTS `bitext-aligner`.`dim_book_content` ; + CREATE TABLE IF NOT EXISTS `bitext-aligner`.`dim_book_content` ( 
- `id` INT NOT NULL, + `id` INT NOT NULL AUTO_INCREMENT, `book` INT NOT NULL, PRIMARY KEY (`id`), + INDEX `book_fk_idx` (`book` ASC), + UNIQUE INDEX `book_UNIQUE` (`book` ASC), CONSTRAINT `content_book_fk` FOREIGN KEY (`book`) REFERENCES `bitext-aligner`.`dim_book` (`id`) @@ -81,20 +89,19 @@ CREATE TABLE IF NOT EXISTS `bitext-aligner`.`dim_book_content` ( ON UPDATE CASCADE) ENGINE = InnoDB; -CREATE INDEX `book_fk_idx` ON `bitext-aligner`.`dim_book_content` (`book` ASC) VISIBLE; - -CREATE UNIQUE INDEX `book_UNIQUE` ON `bitext-aligner`.`dim_book_content` (`book` ASC) VISIBLE; - -- ----------------------------------------------------- -- Table `bitext-aligner`.`dim_book_chapter` -- ----------------------------------------------------- +DROP TABLE IF EXISTS `bitext-aligner`.`dim_book_chapter` ; + CREATE TABLE IF NOT EXISTS `bitext-aligner`.`dim_book_chapter` ( `id` INT NOT NULL AUTO_INCREMENT, `c_num` INT UNSIGNED NOT NULL, `name` VARCHAR(90) NULL, `book_content` INT NOT NULL, PRIMARY KEY (`id`), + INDEX `content_fk_idx` (`book_content` ASC), CONSTRAINT `ch_content_fk` FOREIGN KEY (`book_content`) REFERENCES `bitext-aligner`.`dim_book_content` (`id`) @@ -102,18 +109,19 @@ CREATE TABLE IF NOT EXISTS `bitext-aligner`.`dim_book_chapter` ( ON UPDATE CASCADE) ENGINE = InnoDB; -CREATE INDEX `content_fk_idx` ON `bitext-aligner`.`dim_book_chapter` (`book_content` ASC) VISIBLE; - -- ----------------------------------------------------- -- Table `bitext-aligner`.`dim_book_sentence` -- ----------------------------------------------------- +DROP TABLE IF EXISTS `bitext-aligner`.`dim_book_sentence` ; + CREATE TABLE IF NOT EXISTS `bitext-aligner`.`dim_book_sentence` ( `id` INT NOT NULL AUTO_INCREMENT, `s_num` INT UNSIGNED NOT NULL, - `text` VARCHAR(500) NOT NULL, + `text` VARCHAR(900) NOT NULL, `chapter` INT NOT NULL, PRIMARY KEY (`id`), + INDEX `chapter_fk_idx` (`chapter` ASC), CONSTRAINT `sen_chapter_fk` FOREIGN KEY (`chapter`) REFERENCES `bitext-aligner`.`dim_book_chapter` (`id`) 
@@ -121,16 +129,18 @@ CREATE TABLE IF NOT EXISTS `bitext-aligner`.`dim_book_sentence` ( ON UPDATE CASCADE) ENGINE = InnoDB; -CREATE INDEX `chapter_fk_idx` ON `bitext-aligner`.`dim_book_sentence` (`chapter` ASC) VISIBLE; - -- ----------------------------------------------------- -- Table `bitext-aligner`.`map_book_author` -- ----------------------------------------------------- +DROP TABLE IF EXISTS `bitext-aligner`.`map_book_author` ; + CREATE TABLE IF NOT EXISTS `bitext-aligner`.`map_book_author` ( `author` INT NOT NULL, `book` INT NOT NULL, `translator` TINYINT NOT NULL, + INDEX `book_fk_idx` (`book` ASC) , + INDEX `author_fk_idx` (`author` ASC), CONSTRAINT `map_book_fk` FOREIGN KEY (`book`) REFERENCES `bitext-aligner`.`dim_book_info` (`id`) @@ -143,10 +153,6 @@ CREATE TABLE IF NOT EXISTS `bitext-aligner`.`map_book_author` ( ON UPDATE CASCADE) ENGINE = InnoDB; -CREATE INDEX `book_fk_idx` ON `bitext-aligner`.`map_book_author` (`book` ASC) VISIBLE; - -CREATE INDEX `author_fk_idx` ON `bitext-aligner`.`map_book_author` (`author` ASC) VISIBLE; - SET SQL_MODE=@OLD_SQL_MODE; SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS; diff --git a/requirements.txt b/requirements.txt index 76fe572..05a4055 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,8 @@ google-cloud-translate==2.0.0 google-cloud-storage==1.19.1 mysql-connector-python==8.0.19 -pandas \ No newline at end of file +pandas +xmlschema +numpy +jellyfish +nltk \ No newline at end of file diff --git a/run.py b/run.py new file mode 100644 index 0000000..5be3377 --- /dev/null +++ b/run.py @@ -0,0 +1,38 @@ +import xml_parser.read_xml as read_xml +import db.add_book as adb +import xml_parser.validate as validate +import utils.json_utils as json_utils +import utils.constants as const +import utils.env_utils as env + + +def validate_all_xml_files(): + validate.validate_all_xml_files() + + +def save_validated_files_to_db(): + json_data = json_utils.read_json_file(const.JSON_PATH) + books_json = 
json_data['books'] + for book_code in books_json.keys(): + books_list = books_json[book_code] + for book in books_list: + if not book['is_validated']: + print(const.WARNING, 'Book : ', book['xml_file'], ' is not validated against XSD', const.END) + continue + if not book['is_saved_to_db']: + print(const.BLUE, 'Adding Book : ', book['xml_file'], ' to the DB', const.END) + book_dict = read_xml.parse_xml_file(book['xml_file_path']) + result = adb.add_book_to_db(book_code, book_dict) + book['is_saved_to_db'] = result + w_str = const.WARNING + if result: + w_str = const.BLUE + print(w_str, 'Result :: ', result, const.END, '\n') + + json_data['books'] = books_json + json_utils.write_json_file(const.JSON_PATH, json_data) + + +if env.check_env_variables(): + validate_all_xml_files() + save_validated_files_to_db() \ No newline at end of file diff --git a/test_example.csv b/test_example.csv index 958dda3..a753069 100644 --- a/test_example.csv +++ b/test_example.csv @@ -2,4 +2,5 @@ 1,2,"Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt." 1,3,"Neque porro quisquam est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et dolore magnam aliquam quaerat voluptatem." 2,1,"Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur?" -2,2,"Quis autem vel eum iure reprehenderit qui in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla pariatur?" \ No newline at end of file +2,2,"Quis autem vel eum iure reprehenderit qui in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla pariatur?" 
+2,3,"Neque porro quisquam est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et dolore magnam aliquam quaerat voluptatem." \ No newline at end of file diff --git a/utils/constants.py b/utils/constants.py new file mode 100644 index 0000000..f8c4860 --- /dev/null +++ b/utils/constants.py @@ -0,0 +1,15 @@ +JSON_PATH = 'json/books.json' + +XSD_PATH = 'xml_files/book.xsd' + +TRANSLATE_ENV_VAR = 'GOOGLE_APPLICATION_CREDENTIALS' + +MYSQL_PASS_ENV_VAR = 'MYSQL_PASSWORD' + +MYSQL_HOST_ENV_VAR = 'MYSQL_HOST' + +WARNING = '\033[91m' +END = '\033[0m' + +BLUE = '\033[94m' +GREEN = '\033[92m' \ No newline at end of file diff --git a/utils/env_utils.py b/utils/env_utils.py new file mode 100644 index 0000000..ea65302 --- /dev/null +++ b/utils/env_utils.py @@ -0,0 +1,15 @@ +import os +import utils.constants as const + + +def check_env_variables(): + if const.TRANSLATE_ENV_VAR not in os.environ: + print(const.WARNING, 'Please set the ', const.TRANSLATE_ENV_VAR, ' Environment Variable to continue....', const.END) + return False + if const.MYSQL_PASS_ENV_VAR not in os.environ: + print(const.WARNING, 'Please set the ', const.MYSQL_PASS_ENV_VAR, ' Environment Variable to continue....', const.END) + return False + if const.MYSQL_HOST_ENV_VAR not in os.environ: + print(const.WARNING, 'Please set the ', const.MYSQL_HOST_ENV_VAR, ' Environment Variable to continue....', const.END) + return False + return True diff --git a/utils/json_utils.py b/utils/json_utils.py new file mode 100644 index 0000000..976f05d --- /dev/null +++ b/utils/json_utils.py @@ -0,0 +1,19 @@ +import json +import os + + +def read_json_file(file_path): + json_file_path = os.path.dirname(os.path.dirname(__file__))+'/'+file_path + + with open(json_file_path, 'r') as json_file: + json_data = json.load(json_file) + json_file.close() + return json_data + + +def write_json_file(file_path, json_data): + json_file_path = 
os.path.dirname(os.path.dirname(__file__))+'/'+file_path + + with open(json_file_path, 'w') as updated_json: + updated_json.write(json.dumps(json_data, indent=4)) + updated_json.close() \ No newline at end of file diff --git a/xml_files/book.xsd b/xml_files/book.xsd new file mode 100644 index 0000000..f64d1ac --- /dev/null +++ b/xml_files/book.xsd @@ -0,0 +1,67 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/xml_files/book_structure.xml b/xml_files/book_structure.xml index 18d93d4..1f19b8d 100644 --- a/xml_files/book_structure.xml +++ b/xml_files/book_structure.xml @@ -2,17 +2,17 @@ Crime and Punishment + en + true + 2 + https://en.wikisource.org/wiki/Crime_and_Punishment Crime and Punishment (Russian: Преступление и наказание) is a novel written by Russian author Fyodor Dostoevsky. First published in a journal named The Russian Messenger, it appeared in twelve monthly installments in 1866, and was later published as a novel. - en + n.a. Fyodor Dostoevsky Constance Garnett - https://en.wikisource.org/wiki/Crime_and_Punishment - true - 2 - n.a. 
diff --git a/xml_parser/__init__.py b/xml_parser/__init__.py index 4677764..24662b6 100644 --- a/xml_parser/__init__.py +++ b/xml_parser/__init__.py @@ -1,12 +1,16 @@ from pathlib import Path import json +import utils.constants as const +import os -json_file_path = Path('json/books.json') -json_data = {'books': []} +json_path = os.path.dirname(os.path.dirname(__file__))+'/'+const.JSON_PATH +json_file_path = Path(json_path) + +json_data = {'books': {}} if not json_file_path.is_file(): json_file = open(json_file_path, 'w') json_file.write(json.dumps(json_data, indent=4)) json_file.close() - print('JSON File Created :: '+json_file.name) + print(const.BLUE, 'JSON File Created :: '+json_file.name, const.END) diff --git a/xml_parser/create_xml.py b/xml_parser/create_xml.py index 4109a5d..c917bde 100644 --- a/xml_parser/create_xml.py +++ b/xml_parser/create_xml.py @@ -2,7 +2,8 @@ from xml.etree import ElementTree as ET from xml.dom import minidom import os import json -from pathlib import Path +import utils.json_utils as json_utils +import utils.constants as const def create_xml_file(book_dict, book_metadata): @@ -24,14 +25,13 @@ def create_xml_file(book_dict, book_metadata): total_chapters = ET.SubElement(book_info, 'totalChapters') total_chapters.text = book_metadata['totalChapters'] + source = ET.SubElement(book_info, 'source') + source.text = book_metadata['source'] + if 'description' in book_metadata: description = ET.SubElement(book_info, 'description') description.text = book_metadata['description'] - if 'source' in book_metadata: - source = ET.SubElement(book_info, 'source') - source.text = book_metadata['source'] - if 'isbn' in book_metadata: isbn = ET.SubElement(book_info, 'isbn') isbn.text = book_metadata['isbn'] @@ -48,40 +48,44 @@ def create_xml_file(book_dict, book_metadata): chapter.set('num', str(key)) for idx, val in enumerate(book_dict[key]): sentence = ET.SubElement(chapter, 'sentence') - sentence.set('id', str(idx + 1)) + sentence.set('num', str(idx + 
1)) sentence.text = val # tree = ET.ElementTree(book_root) # tree.write(filename) root_dir = os.path.dirname(os.path.dirname(__file__)) output_dir = os.path.join(root_dir, "xml_files") - filename = book_root.get('id') + "_" + lang.text + ".xml" + filename = book_root.get('code') + "_" + lang.text + ".xml" file = open(output_dir + '/' + filename, 'w') file_path = file.name - print('XML File Path :: ', file_path) file.write(prettify(book_root)) file.close() + print(const.BLUE, 'Saved XML File Path :: ', file_path, const.END) json_obj = {} - json_obj['book_id'] = book_root.get('id') + book_code = book_root.get('code') json_obj['xml_file'] = filename json_obj['lang'] = lang.text json_obj['xml_file_path'] = file_path json_obj['is_validated'] = False json_obj['is_saved_to_db'] = False - add_xml_book_data_to_json(json_obj) + add_xml_book_data_to_json(book_code, json_obj) + + return file_path + + +def add_xml_book_data_to_json(book_code, json_obj): + json_data = json_utils.read_json_file(const.JSON_PATH) -def add_xml_book_data_to_json(json_obj): - json_file_path = Path('json/books.json') + books = json_data['books'] + if book_code in books.keys(): + books[book_code].append(json_obj) + else: + books[book_code] = [json_obj] - json_file = open(json_file_path, 'r') - json_data = json.load(json_file) - json_file.close() + json_data['books'] = books - json_file = open(json_file_path, 'w') - json_data['books'].append(json_obj) - json_file.write(json.dumps(json_data, indent=4)) - json_file.close() + json_utils.write_json_file(const.JSON_PATH, json_data) def prettify(root): diff --git a/xml_parser/read_xml.py b/xml_parser/read_xml.py new file mode 100644 index 0000000..c53706f --- /dev/null +++ b/xml_parser/read_xml.py @@ -0,0 +1,42 @@ +import xml.etree.ElementTree as ET + + +def parse_xml_file(full_path): + + book_dict = {} + + tree = ET.parse(full_path) + book_root = tree.getroot() + # print('Root Element :: ', book_root.tag, ' | Attributes :: ', book_root.attrib) + 
book_dict['code'] = book_root.attrib['code'] + + book_info_dict = {} + book_content_dict = {} + book_info_element = book_root.find('bookInfo') + book_content_element = book_root.find('content') + + book_info_dict['authors'] = [] + for child in book_info_element: + if 'author' == child.tag: + author = {'name': child.text} + if 'translator' in child.attrib: + author['translator'] = child.attrib['translator'] + book_info_dict['authors'].append(author) + else: + book_info_dict[child.tag] = child.text + + book_dict['bookInfo'] = book_info_dict + + book_content_dict['chapters'] = [] + for chapter in book_content_element: + chapter_dict = {'num': chapter.attrib['num']} + if 'name' in chapter.attrib: + chapter_dict['name'] = chapter.attrib['name'] + chapter_dict['sentences'] = {} + for sentence in chapter.findall('sentence'): + chapter_dict['sentences'][sentence.attrib['num']] = sentence.text + book_content_dict['chapters'].append(chapter_dict) + + book_dict['content'] = book_content_dict + + return book_dict diff --git a/xml_parser/test.py b/xml_parser/test.py deleted file mode 100644 index 40a0f6f..0000000 --- a/xml_parser/test.py +++ /dev/null @@ -1,6 +0,0 @@ -from csv2df import get_book_content, get_book_metadata - -from xml_parser.create_xml import create_xml_file - -create_xml_file(get_book_content(), get_book_metadata()) - diff --git a/xml_parser/test_parser.py b/xml_parser/test_parser.py new file mode 100644 index 0000000..fc00ba8 --- /dev/null +++ b/xml_parser/test_parser.py @@ -0,0 +1,14 @@ +from csv2df import get_book_content, get_book_metadata +import xml_parser.create_xml as create_xml +import xml_parser.read_xml as read_xml +import xml_parser.validate as validate + + +file_path = create_xml.create_xml_file(get_book_content(), get_book_metadata()) + +# print(file_path) + +validate.validate_all_xml_files() + +# book_dict = read_xml.parse_xml_file('/Users/pavanmandava/PythonWorkspace/bitext-aligner/xml_files/abcdef_en.xml') + diff --git a/xml_parser/validate.py 
"""xml_parser/validate.py -- validate generated book XML files against the book XSD."""
import xmlschema
import json
import utils.json_utils as json_utils
import utils.constants as const
import os


def is_valid(book_schema, xml_path):
    """Return the result of ``book_schema.is_valid(xml_path)``.

    :param book_schema: schema object exposing ``is_valid`` (as returned by
        :func:`get_book_schema`).
    :param xml_path: path of the XML file to check.
    """
    return book_schema.is_valid(xml_path)


def get_book_schema(book_xsd_path):
    """Build an ``xmlschema.XMLSchema`` from an XSD file.

    :param book_xsd_path: path of the XSD, relative to the directory two
        levels above this module's file (``dirname(dirname(__file__))``).
    :return: the loaded ``xmlschema.XMLSchema`` instance.
    """
    root_dir = os.path.dirname(os.path.dirname(__file__))
    # os.path.join instead of '+ "/" +': an empty root_dir must yield a
    # relative path, not one anchored at the filesystem root.
    xsd_full_path = os.path.join(root_dir, book_xsd_path)
    return xmlschema.XMLSchema(xsd_full_path)


def validate_all_xml_files():
    """Validate every not-yet-validated book entry listed in the JSON file.

    Reads the JSON document at ``const.JSON_PATH``, walks each list stored
    under its ``'books'`` mapping, and for every entry whose
    ``'is_validated'`` flag is falsy and that carries an ``'xml_file_path'``,
    stores the schema check result back into ``'is_validated'``.  The updated
    document is then written back to ``const.JSON_PATH``.
    """
    json_data = json_utils.read_json_file(const.JSON_PATH)
    book_schema = get_book_schema(const.XSD_PATH)

    for books_list in json_data['books'].values():
        for book in books_list:
            if book['is_validated']:
                print(const.BLUE, 'Book : ', book['xml_file'], ' is valid', const.END)
                continue
            if 'xml_file_path' not in book:
                # No file recorded for this entry; leave its flag untouched.
                continue
            result = is_valid(book_schema, book['xml_file_path'])
            print('Validating Book : ', book['xml_file'], ' -> ', result)
            book['is_validated'] = result

    # Entries were updated in place inside json_data, so it can be written
    # back directly.
    json_utils.write_json_file(const.JSON_PATH, json_data)