commit 1540b701a4
@@ -0,0 +1,170 @@
# -*- coding: utf-8 -*-
import os, sys
import re
import pandas as pd
import numpy as np
from numpy import cumsum
from pandas import DataFrame
from nltk import word_tokenize, sent_tokenize
#import xml.etree.ElementTree as ET
from jellyfish import levenshtein_distance as lev
#import six
from google.cloud import translate_v2 as translate
from itertools import product as cp

translate_client = translate.Client()

''' Sentence-level alignment of a text and its translation, using round-trip
machine translation and normalized Levenshtein distance. '''


def master_align(text0, text1, lang0, lang1):
    """ Takes two equivalent texts (original and translation) and returns
    aligned texts. """
    df0 = frame_from_text(text0, lang0, lang1)
    print('A')
    df1 = frame_from_text(text1, lang1, lang0, is1=True)
    print('B')
    # returns dfs with ['sent', 'trans', 'rellen', 'relpos']
    anchors = anchors_from_frames(df0, df1, window=2)
    print('C')
    alignments = intermediate_align(df0, df1, anchors, lookahead=4)
    print('D')
    textdict0, textdict1 = textdicts_from_alignments(df0, df1, alignments)
    print('E')
    return textdict0, textdict1


def frame_from_text(text, source, target, is1=False):
    """ Builds a frame with one row per sentence: the sentence itself, its
    machine translation, its relative length and its relative position. """
    #print(source, '-->', target)
    cols = [c+str(int(is1)) for c in ['sent', 'trans', 'rellen', 'relpos']]
    frame = pd.DataFrame(columns=cols)
    frame[cols[0]] = sent_tokenize(text)
    frame[cols[1]] = frame[cols[0]].apply(lambda x: translate_client.translate(x, source_language=source, target_language=target, model='nmt')['translatedText'])
    frame[cols[2]] = frame[cols[0]].apply(len)
    frame[cols[2]] = frame[cols[2]]/frame[cols[2]].max()
    # relative position of each sentence = (cumulative length before it, after it)
    cumul_b = list(np.cumsum(frame[cols[2]]))
    cumul_a = [0]+cumul_b[:-1]
    frame[cols[3]] = pd.Series(list(zip(cumul_a, cumul_b)))
    return frame


def anchors_from_frames(frame0, frame1, window):
    """ Returns (index0, index1) pairs of sentences that match so well that
    they can serve as anchors for the alignment. """
    pairdf = generate_pairdf(frame0, frame1, window)
    frame0['index0'] = frame0.index
    frame1['index1'] = frame1.index
    pairdf = pairdf.merge(frame0, on='index0').merge(frame1, on='index1')
    pairdf['lev0'] = pairdf.apply(lambda x: trdist(x.sent0, x.trans1), axis=1)
    pairdf['lev1'] = pairdf.apply(lambda x: trdist(x.sent1, x.trans0), axis=1)
    pairdf['rellen_ratio'] = (pairdf.rellen0/pairdf.rellen1).apply(gr1)
    pairdf['minlev'] = pairdf[['lev0', 'lev1']].min(axis=1)
    pairdf['maxlev'] = pairdf[['lev0', 'lev1']].max(axis=1)
    pairdf['isanchor'] = (pairdf.minlev<0.45) & (pairdf.maxlev<0.6) & (pairdf.rellen_ratio<1.3)
    return list(pairdf[pairdf.isanchor][['index0', 'index1']].values)


def intermediate_align(frame0, frame1, anchs, lookahead):
    """ Aligns the sentence ranges lying between consecutive anchors. """
    aligns = []
    end0, end1 = frame0.shape[0], frame1.shape[0]
    anchor_ranges = list(zip([(-1, -1)]+anchs, anchs+[(end0, end1)]))
    for rang in anchor_ranges:
        interaligns = get_interalign(frame0, frame1, *rang, lookahead)
        a, b = rang[0]
        aligns.append(((a, b), (a, b)))
        aligns.extend(interaligns)
    return aligns[1:]  # format: [((start0, start1), (end0, end1))]


def get_interalign(df0, df1, anchors_init, anchors_next, lookahead):
    """ Greedily aligns sentence groups between two anchors, trying up to
    `lookahead` sentences on either side at each step. """
    print(anchors_init, anchors_next)
    interaligns = []
    i, j = anchors_init
    i += 1
    j += 1
    end0, end1 = anchors_next
    while i < end0 and j < end1:
        room0, room1 = min(end0-i, lookahead), min(end1-j, lookahead)
        lambdascore = lambda p, q: score(df0, df1, i, j, p, q)
        # candidate group ends: the group may grow in one text at a time only
        i_, j_ = min([(x, y) for x, y in cp(range(i, i+room0), range(j, j+room1)) if x == i or y == j], key=lambda a: lambdascore(*a))
        print((i, j), (i_, j_))
        interaligns.append(((i, j), (i_, j_)))
        i, j = i_+1, j_+1
    return interaligns


def score(frame0, frame1, start0, start1, end0, end1):
    """ Alignment cost of the sentence groups [start0:end0] and [start1:end1]:
    mean cross-translation distance, penalized by the length ratio. """
    s0 = ' '.join(frame0.loc[start0:end0, 'sent0'])
    s1 = ' '.join(frame1.loc[start1:end1, 'sent1'])
    t0 = ' '.join(frame0.loc[start0:end0, 'trans0'])
    t1 = ' '.join(frame1.loc[start1:end1, 'trans1'])
    l0 = sum(frame0.loc[start0:end0, 'rellen0'])
    l1 = sum(frame1.loc[start1:end1, 'rellen1'])
    return (trdist(s0, t1)+trdist(s1, t0))*gr1(l0/l1)/2


def textdicts_from_alignments(frame0, frame1, aligns):
    """ Joins the aligned sentence groups back into two parallel text dicts. """
    textdict0, textdict1 = {}, {}
    for i, ((a0, a1), (b0, b1)) in enumerate(aligns):
        t0 = ' '.join(frame0.loc[a0:b0, 'sent0'])
        t1 = ' '.join(frame1.loc[a1:b1, 'sent1'])
        print('***************************')
        print(aligns[i])
        print(t0)
        print(t1)
        textdict0.update({i: t0})
        textdict1.update({i: t1})
    return textdict0, textdict1


def generate_pairdf(frame0, frame1, window):
    """ Generates candidate sentence pairs: pairs whose relative positions
    overlap, padded by `window` sentences in every direction. """
    ranges0 = frame0.relpos0
    ranges1 = frame1.relpos1
    overlap = [(i, j) for (i, (a, b)), (j, (c, d)) in cp(enumerate(ranges0), enumerate(ranges1)) if get_overlap(a, b, c, d) > 0]
    len0 = frame0.shape[0]
    len1 = frame1.shape[0]
    allpairs = []
    for i, j in overlap:
        for k in range(-window, window+1):
            for l in range(-window, window+1):
                allpairs.append((i+k, j+l))
    allpairs = [(a, b) for a, b in allpairs if min(a, b) > -1 and a < len0 and b < len1]
    allpairs = sorted(set(allpairs))
    pairdf = pd.DataFrame(allpairs).rename(columns={0: 'index0', 1: 'index1'})
    return pairdf


def get_overlap(a, b, c, d):
    """ Length of the overlap between the intervals (a, b) and (c, d). """
    if b > c and b <= d:
        return b-max(a, c)
    elif a >= c and a < d:
        return min(b, d)-a
    elif c >= a and c < b:
        return d-max(a, c)
    else:
        return 0


gr1 = lambda x: 1/less1(x)                           # gr1(x) = max(x, 1/x)
less1 = lambda x: 1/x if abs(x) > 1 else x
trdist = lambda x, y: lev(x, y)/max(len(x), len(y))  # normalized Levenshtein
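A minimal usage sketch for the aligner above, assuming GOOGLE_APPLICATION_CREDENTIALS is set so that translate.Client() can authenticate; the texts and language codes below are invented:

# Hypothetical example (not part of the commit): align a short English/German pair.
text_en = "The cat sat on the mat. It was warm. The dog slept."
text_de = "Die Katze sass auf der Matte. Es war warm. Der Hund schlief."
d0, d1 = master_align(text_en, text_de, 'en', 'de')
for k in sorted(d0):
    print(d0[k], '<->', d1[k])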
@@ -0,0 +1,271 @@
import time
import db.mysql_connection as mysql
import db.constants as const


def add_book_to_db(book_code, book_dict):
    # print('Adding Book Code :: ', book_code, ' Dict :: ', book_dict)

    conn = mysql.get_new_mysql_connection(const.DB_CONFIG_FILE)
    if conn is None:
        return False

    db_cursor = conn.cursor(buffered=True)

    # add book data to the table first
    book_row = {
        'code': book_code,
        'added_at': int(time.time())
    }

    # returns the last row id, if row added to the table successfully
    last_rowid = add_book_row_to_table(db_cursor, const.BOOK_INSERT_QUERY, book_row)
    book_id = last_rowid
    print('Book Row Id :: ', last_rowid)

    book_info_dict = book_dict['bookInfo']
    if last_rowid > 0:
        book_info_row = {
            'title': book_info_dict['title'],
            'description': book_info_dict['description'] if 'description' in book_info_dict else None,
            'lang': book_info_dict['lang'],
            'source': book_info_dict['source'],
            'is_translation': 'true' == book_info_dict['isTranslation'].lower(),
            'total_chapters': book_info_dict['totalChapters'],
            'isbn': book_info_dict['isbn'] if 'isbn' in book_info_dict else None,
            'book': book_id
        }

        # returns the last row id, if row added to the table successfully
        last_rowid = add_book_info_row_to_table(db_cursor, const.BOOK_INFO_INSERT_QUERY, book_info_row)
        print('Book Info Row Id :: ', last_rowid)

    if last_rowid > 0:
        book_info_id = last_rowid
        authors_list = book_info_dict['authors']
        for author in authors_list:
            author_row = {
                'id': -1,
                'name': author['name'].strip().lower(),
                'total_books': 1
            }
            author_row = search_author(db_cursor, const.AUTHOR_SEARCH_QUERY, author_row)
            print('Author Search Result :: ', author_row)
            if author_row['id'] > 0:
                author_row['total_books'] = author_row['total_books'] + 1
                last_rowid = update_author_book_count(db_cursor, const.AUTHOR_UPDATE_QUERY, author_row)
                print('Author Update Row count :: ', last_rowid)
                if last_rowid <= 0:
                    break
            else:
                author_row['name'] = author['name']
                author_row['total_books'] = 1
                last_rowid = add_author_to_table(db_cursor, const.AUTHOR_INSERT_QUERY, author_row)
                print('Add Author Row Id :: ', last_rowid)
                if last_rowid > 0:
                    author_row['id'] = last_rowid

            if author_row['id'] > 0:
                author_is_translator = False
                if 'translator' in author:
                    author_is_translator = 'true' == author['translator'].lower()
                map_author_book = {
                    'author': author_row['id'],
                    'book': book_info_id,
                    'translator': author_is_translator
                }

                last_rowid = add_author_book_mapping(db_cursor, const.BOOK_AUTHOR_INSERT_QUERY, map_author_book)
                print('Author Book Mapping Row ID :: ', last_rowid)
                if last_rowid < 0:
                    break

    if last_rowid > 0:
        book_content_row = {
            'book': book_id
        }

        # returns the last row id, if row added to the table successfully
        last_rowid = add_book_content_row_to_table(db_cursor, const.CONTENT_INSERT_QUERY, book_content_row)
        print('Book Content Row Id :: ', last_rowid)

    if last_rowid > 0:
        content_id = last_rowid
        book_chapters_list = book_dict['content']['chapters']
        for chapter in book_chapters_list:
            book_chapter_row = {
                'c_num': chapter['num'],
                'name': chapter['name'] if 'name' in chapter else None,
                'book_content': content_id
            }
            chapter_id = add_book_chapter_to_table(db_cursor, const.CHAPTER_INSERT_QUERY, book_chapter_row)
            print('Book Chapter Row Id :: ', chapter_id)
            if chapter_id > 0:
                sentences_dict = chapter['sentences']
                for s_num in sentences_dict.keys():
                    sentence_row = {
                        's_num': s_num,
                        'text': sentences_dict[s_num],
                        'chapter': chapter_id
                    }
                    sen_id = add_book_sentence_to_table(db_cursor, const.SENTENCE_INSERT_QUERY, sentence_row)
                    print('Book Sentence Id :: ', sen_id)
                    if sen_id <= 0:
                        break
                    else:
                        last_rowid = sen_id
            else:
                break

    db_cursor.close()

    is_success = False
    if last_rowid > 0:
        conn.commit()
        is_success = True
    else:
        conn.rollback()
        is_success = False

    conn.close()

    return is_success


def add_book_row_to_table(db_cursor, book_insert_query, book_row):
    try:
        # Insert this Book row to Table
        db_cursor.execute(book_insert_query, book_row)
        book_id = db_cursor.lastrowid
        if book_id is not None:
            return book_id
        else:
            return -1

    except Exception as e:
        print(str(e))
        return -1


def add_book_info_row_to_table(db_cursor, book_info_insert_query, book_info_row):
    try:
        # Insert this BookInfo row
        db_cursor.execute(book_info_insert_query, book_info_row)
        book_info_id = db_cursor.lastrowid
        if book_info_id is not None:
            return book_info_id
        else:
            return -1

    except Exception as e:
        print(str(e))
        return -1


def add_book_content_row_to_table(db_cursor, book_content_insert_query, book_content_row):
    try:
        # Insert Book Content row
        db_cursor.execute(book_content_insert_query, book_content_row)
        book_content_id = db_cursor.lastrowid
        if book_content_id is not None:
            return book_content_id
        else:
            return -1

    except Exception as e:
        print(str(e))
        return -1


def add_book_chapter_to_table(db_cursor, book_chapter_insert_query, book_chapter_row):
    try:
        # Insert Book chapter row
        db_cursor.execute(book_chapter_insert_query, book_chapter_row)
        book_chapter_id = db_cursor.lastrowid
        if book_chapter_id is not None:
            return book_chapter_id
        else:
            return -1

    except Exception as e:
        print(str(e))
        return -1


def add_book_sentence_to_table(db_cursor, book_sentence_insert_query, book_sentence):
    try:
        # Insert sentence
        db_cursor.execute(book_sentence_insert_query, book_sentence)
        book_sen_id = db_cursor.lastrowid
        if book_sen_id is not None:
            return book_sen_id
        else:
            return -1

    except Exception as e:
        print(str(e))
        return -1


def add_author_to_table(db_cursor, author_insert_query, author_data):
    try:
        # Insert Author
        db_cursor.execute(author_insert_query, author_data)
        author_id = db_cursor.lastrowid
        if author_id is not None:
            return author_id
        else:
            return -1

    except Exception as e:
        print(str(e))
        return -1


def add_author_book_mapping(db_cursor, book_author_insert_query, book_author_data):
    try:
        # Insert Book Author Mapping
        db_cursor.execute(book_author_insert_query, book_author_data)
        map_id = db_cursor.rowcount
        if map_id > 0:
            return map_id
        else:
            return -1

    except Exception as e:
        print(str(e))
        return -1


def search_author(db_cursor, author_search_query, author_data):
    try:
        # Search Author
        db_cursor.execute(author_search_query, author_data)
        row = db_cursor.fetchone()
        if row is not None:
            author_data['id'] = int(row[0])
            author_data['total_books'] = int(row[2])
            return author_data
        else:
            author_data['id'] = -1
            return author_data

    except Exception as e:
        print(str(e))
        author_data['id'] = -1
        return author_data


def update_author_book_count(db_cursor, author_update_query, author_data):
    try:
        # Update Author Book Count
        db_cursor.execute(author_update_query, author_data)
        row_cnt = db_cursor.rowcount
        if row_cnt > 0:
            return row_cnt
        else:
            return -1

    except Exception as e:
        print(str(e))
        return -1
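For reference, the book_dict shape that add_book_to_db consumes, reconstructed from the key accesses above; every key appears in the code, the concrete values are invented:

# Hypothetical book_dict accepted by add_book_to_db.
example_book_dict = {
    'bookInfo': {
        'title': 'Example Title',
        'lang': 'en',
        'source': 'example.org',
        'isTranslation': 'false',
        'totalChapters': 1,
        'authors': [{'name': 'Jane Doe', 'translator': 'false'}],
        # 'description' and 'isbn' are optional
    },
    'content': {
        'chapters': [
            {'num': 1, 'name': 'Chapter One',
             'sentences': {'1': 'First sentence.', '2': 'Second sentence.'}}
        ]
    }
}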
@@ -0,0 +1,26 @@
DB_CONFIG_FILE = 'db_config.ini'

BOOK_INSERT_QUERY = "INSERT INTO dim_book (code, added_at) " \
                    "VALUES (%(code)s, %(added_at)s)"

AUTHOR_INSERT_QUERY = "INSERT INTO dim_author (name, total_books) " \
                      "VALUES (%(name)s, %(total_books)s)"

BOOK_INFO_INSERT_QUERY = "INSERT INTO dim_book_info (title, description, lang, source, is_translation, " \
                         "total_chapters, isbn, book) " \
                         "VALUES (%(title)s, %(description)s, %(lang)s, %(source)s, %(is_translation)s, " \
                         "%(total_chapters)s, %(isbn)s, %(book)s) "

BOOK_AUTHOR_INSERT_QUERY = "INSERT INTO map_book_author (author, book, translator) " \
                           "VALUES (%(author)s, %(book)s, %(translator)s)"

CONTENT_INSERT_QUERY = "INSERT INTO dim_book_content (book) VALUES(%(book)s)"

CHAPTER_INSERT_QUERY = "INSERT INTO dim_book_chapter (c_num, name, book_content) " \
                       "VALUES (%(c_num)s, %(name)s, %(book_content)s)"

SENTENCE_INSERT_QUERY = "INSERT INTO dim_book_sentence (s_num, text, chapter) VALUES (%(s_num)s, %(text)s, %(chapter)s)"

AUTHOR_SEARCH_QUERY = "SELECT * FROM dim_author WHERE dim_author.name = %(name)s"

AUTHOR_UPDATE_QUERY = "UPDATE dim_author SET dim_author.total_books = %(total_books)s WHERE id = %(id)s"
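These queries use mysql.connector's pyformat placeholders, so each execute() call binds a plain dict whose keys match the %(name)s markers; a sketch with an invented row:

# Hypothetical binding example for BOOK_INSERT_QUERY (db_cursor as in add_book.py).
book_row = {'code': 'abcdef', 'added_at': 1234567890}
db_cursor.execute(BOOK_INSERT_QUERY, book_row)
print(db_cursor.lastrowid)  # auto-increment id of the new dim_book row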
@@ -0,0 +1,32 @@
import mysql.connector
from mysql.connector import errorcode
import db.read_config as config
import utils.constants as const
import os


def get_new_mysql_connection(config_file_name):

    config_file_path = os.path.dirname(os.path.dirname(__file__))+'/'+config_file_name
    db_config = config.read_db_config(config_file_path, 'mysql')

    connection = None

    try:
        connection = mysql.connector.connect(**db_config)
    except mysql.connector.Error as err:
        if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
            print(const.WARNING, 'Invalid Database User and Password', const.END)
        elif err.errno == errorcode.ER_BAD_DB_ERROR:
            print(const.WARNING, 'Database doesn\'t exist ', const.END)
        else:
            print(err)

    if connection is not None:
        if connection.is_connected():
            connection.autocommit = False
            print(const.GREEN, 'MySQL Connection Successful => Connection ID :: ', connection.connection_id, const.END)
        else:
            connection = None

    return connection
@@ -0,0 +1,33 @@
from configparser import ConfigParser
import os


def read_db_config(filename, section):
    """ Read database configuration file and return a dictionary object
    :param filename: name of the configuration file
    :param section: section of database configuration
    :return: a dictionary of database parameters
    """

    parser = ConfigParser()
    parser.read(filename)

    db = {}
    if parser.has_section(section):
        items = parser.items(section)
        for item in items:
            db[item[0]] = item[1]
    else:
        raise Exception('{0} not found in the {1} file'.format(section, filename))

    # the config file stores environment-variable names, not the secrets themselves
    try:
        db['password'] = os.environ[db['password']]
    except KeyError:
        print('Please set the Environment Variable ', db['password'])

    try:
        db['host'] = os.environ[db['host']]
    except KeyError:
        print('Please set the Environment Variable ', db['host'])

    return db
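With the db_config.ini added later in this commit, read_db_config returns roughly the dict sketched below; host and password are looked up in the environment, and ConfigParser keeps every value as a string (a sketch, assuming the file is reachable from the working directory):

from db.read_config import read_db_config

cfg = read_db_config('db_config.ini', 'mysql')
# cfg is now roughly:
# {'host': '<value of $MYSQL_HOST>', 'port': '3306',
#  'database': 'bitext-aligner', 'user': 'root',
#  'password': '<value of $MYSQL_PASSWORD>'}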
@@ -0,0 +1,10 @@
import db.mysql_connection as connection
import db.constants as const

conn = connection.get_new_mysql_connection(const.DB_CONFIG_FILE)

print('MySQL Server version :: ', conn.get_server_info())
print('isConnected :: ', conn.is_connected())

conn.close()
@@ -0,0 +1,6 @@
[mysql]
host = MYSQL_HOST
port = 3306
database = bitext-aligner
user = root
password = MYSQL_PASSWORD
Binary file not shown.
@@ -0,0 +1,38 @@
import xml_parser.read_xml as read_xml
import db.add_book as adb
import xml_parser.validate as validate
import utils.json_utils as json_utils
import utils.constants as const
import utils.env_utils as env


def validate_all_xml_files():
    validate.validate_all_xml_files()


def save_validated_files_to_db():
    json_data = json_utils.read_json_file(const.JSON_PATH)
    books_json = json_data['books']
    for book_code in books_json.keys():
        books_list = books_json[book_code]
        for book in books_list:
            if not book['is_validated']:
                print(const.WARNING, 'Book : ', book['xml_file'], ' is not validated against XSD', const.END)
                continue
            if not book['is_saved_to_db']:
                print(const.BLUE, 'Adding Book : ', book['xml_file'], ' to the DB', const.END)
                book_dict = read_xml.parse_xml_file(book['xml_file_path'])
                result = adb.add_book_to_db(book_code, book_dict)
                book['is_saved_to_db'] = result
                w_str = const.WARNING
                if result:
                    w_str = const.BLUE
                print(w_str, 'Result :: ', result, const.END, '\n')

    json_data['books'] = books_json
    json_utils.write_json_file(const.JSON_PATH, json_data)


if env.check_env_variables():
    validate_all_xml_files()
    save_validated_files_to_db()
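Both this script and xml_parser.validate iterate over json/books.json; its structure, inferred from the key accesses in this commit (the book code and file names below are illustrative):

# Hypothetical json/books.json contents:
# {
#     "books": {
#         "abcdef": [
#             {
#                 "xml_file": "abcdef_en.xml",
#                 "xml_file_path": "xml_files/abcdef_en.xml",
#                 "is_validated": true,
#                 "is_saved_to_db": false
#             }
#         ]
#     }
# }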
@@ -0,0 +1,15 @@
JSON_PATH = 'json/books.json'

XSD_PATH = 'xml_files/book.xsd'

TRANSLATE_ENV_VAR = 'GOOGLE_APPLICATION_CREDENTIALS'

MYSQL_PASS_ENV_VAR = 'MYSQL_PASSWORD'

MYSQL_HOST_ENV_VAR = 'MYSQL_HOST'

# ANSI terminal color codes
WARNING = '\033[91m'
END = '\033[0m'

BLUE = '\033[94m'
GREEN = '\033[92m'
@@ -0,0 +1,15 @@
import os
import utils.constants as const


def check_env_variables():
    if const.TRANSLATE_ENV_VAR not in os.environ:
        print(const.WARNING, 'Please set the ', const.TRANSLATE_ENV_VAR, ' Environment Variable to continue....', const.END)
        return False
    if const.MYSQL_PASS_ENV_VAR not in os.environ:
        print(const.WARNING, 'Please set the ', const.MYSQL_PASS_ENV_VAR, ' Environment Variable to continue....', const.END)
        return False
    if const.MYSQL_HOST_ENV_VAR not in os.environ:
        print(const.WARNING, 'Please set the ', const.MYSQL_HOST_ENV_VAR, ' Environment Variable to continue....', const.END)
        return False
    return True
@@ -0,0 +1,19 @@
import json
import os


def read_json_file(file_path):
    json_file_path = os.path.dirname(os.path.dirname(__file__))+'/'+file_path

    # the with-block closes the file on exit
    with open(json_file_path, 'r') as json_file:
        json_data = json.load(json_file)
    return json_data


def write_json_file(file_path, json_data):
    json_file_path = os.path.dirname(os.path.dirname(__file__))+'/'+file_path

    with open(json_file_path, 'w') as updated_json:
        updated_json.write(json.dumps(json_data, indent=4))
@@ -0,0 +1,67 @@
<?xml version="1.0" encoding="UTF-8" ?>
<xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema">

    <xsd:element name="book">
        <xsd:complexType>
            <xsd:sequence>
                <xsd:element ref="bookInfo" minOccurs="1" maxOccurs="1" />
                <xsd:element ref="content" minOccurs="1" maxOccurs="1" />
            </xsd:sequence>
            <xsd:attribute name="code" type="xsd:string" use="required" />
        </xsd:complexType>
    </xsd:element>

    <xsd:element name="bookInfo">
        <xsd:complexType>
            <xsd:sequence>
                <xsd:element name="title" type="xsd:string" minOccurs="1" maxOccurs="1" />
                <xsd:element name="lang" type="xsd:string" minOccurs="1" maxOccurs="1" />
                <xsd:element name="isTranslation" type="xsd:boolean" minOccurs="1" maxOccurs="1" />
                <xsd:element name="totalChapters" type="xsd:nonNegativeInteger" minOccurs="1" maxOccurs="1" />
                <xsd:element name="source" type="xsd:string" minOccurs="1" maxOccurs="1" />
                <xsd:element name="description" type="xsd:string" minOccurs="0" maxOccurs="1" />
                <xsd:element name="isbn" type="xsd:string" minOccurs="0" maxOccurs="1" />
                <xsd:element ref="author" minOccurs="1" maxOccurs="unbounded" />
            </xsd:sequence>
        </xsd:complexType>
    </xsd:element>

    <xsd:element name="author">
        <xsd:complexType>
            <xsd:simpleContent>
                <xsd:extension base="xsd:string">
                    <xsd:attribute name="translator" type="xsd:boolean" use="optional" />
                </xsd:extension>
            </xsd:simpleContent>
        </xsd:complexType>
    </xsd:element>

    <xsd:element name="content">
        <xsd:complexType>
            <xsd:sequence>
                <xsd:element ref="chapter" minOccurs="1" maxOccurs="unbounded" />
            </xsd:sequence>
        </xsd:complexType>
    </xsd:element>

    <xsd:element name="chapter">
        <xsd:complexType>
            <xsd:sequence>
                <xsd:element ref="sentence" minOccurs="1" maxOccurs="unbounded" />
            </xsd:sequence>
            <xsd:attribute name="num" type="xsd:nonNegativeInteger" use="required" />
            <xsd:attribute name="name" type="xsd:string" use="optional" />
        </xsd:complexType>
    </xsd:element>

    <xsd:element name="sentence">
        <xsd:complexType>
            <xsd:simpleContent>
                <xsd:extension base="xsd:string">
                    <xsd:attribute name="num" type="xsd:nonNegativeInteger" use="required" />
                </xsd:extension>
            </xsd:simpleContent>
        </xsd:complexType>
    </xsd:element>

</xsd:schema>
@@ -1,12 +1,16 @@
 from pathlib import Path
 import json
+import utils.constants as const
+import os


-json_file_path = Path('json/books.json')
+json_path = os.path.dirname(os.path.dirname(__file__))+'/'+const.JSON_PATH
+json_file_path = Path(json_path)
+

-json_data = {'books': []}
+json_data = {'books': {}}
 if not json_file_path.is_file():
     json_file = open(json_file_path, 'w')
     json_file.write(json.dumps(json_data, indent=4))
     json_file.close()
-    print('JSON File Created :: '+json_file.name)
+    print(const.BLUE, 'JSON File Created :: '+json_file.name, const.END)
@@ -0,0 +1,42 @@
import xml.etree.ElementTree as ET


def parse_xml_file(full_path):

    book_dict = {}

    tree = ET.parse(full_path)
    book_root = tree.getroot()
    # print('Root Element :: ', book_root.tag, ' | Attributes :: ', book_root.attrib)
    book_dict['code'] = book_root.attrib['code']

    book_info_dict = {}
    book_content_dict = {}
    book_info_element = book_root.find('bookInfo')
    book_content_element = book_root.find('content')

    book_info_dict['authors'] = []
    for child in book_info_element:
        if 'author' == child.tag:
            author = {'name': child.text}
            if 'translator' in child.attrib:
                author['translator'] = child.attrib['translator']
            book_info_dict['authors'].append(author)
        else:
            book_info_dict[child.tag] = child.text

    book_dict['bookInfo'] = book_info_dict

    book_content_dict['chapters'] = []
    for chapter in book_content_element:
        chapter_dict = {'num': chapter.attrib['num']}
        if 'name' in chapter.attrib:
            chapter_dict['name'] = chapter.attrib['name']
        chapter_dict['sentences'] = {}
        for sentence in chapter.findall('sentence'):
            chapter_dict['sentences'][sentence.attrib['num']] = sentence.text
        book_content_dict['chapters'].append(chapter_dict)

    book_dict['content'] = book_content_dict

    return book_dict
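A self-contained sketch of what this parser sees, using an in-memory document that follows book.xsd; the book content is invented, and ET.fromstring stands in for the ET.parse call that parse_xml_file makes on a file path:

import xml.etree.ElementTree as ET

sample = '''<book code="abcdef">
  <bookInfo>
    <title>Example Title</title>
    <lang>en</lang>
    <isTranslation>false</isTranslation>
    <totalChapters>1</totalChapters>
    <source>example.org</source>
    <author translator="false">Jane Doe</author>
  </bookInfo>
  <content>
    <chapter num="1" name="Chapter One">
      <sentence num="1">First sentence.</sentence>
    </chapter>
  </content>
</book>'''

book_root = ET.fromstring(sample)
print(book_root.attrib['code'])                           # 'abcdef'
print(book_root.find('bookInfo').find('title').text)      # 'Example Title'
print(book_root.find('content').find('chapter').attrib)   # {'num': '1', 'name': 'Chapter One'}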
@@ -1,6 +0,0 @@
-from csv2df import get_book_content, get_book_metadata
-
-from xml_parser.create_xml import create_xml_file
-
-create_xml_file(get_book_content(), get_book_metadata())
-
@@ -0,0 +1,14 @@
from csv2df import get_book_content, get_book_metadata
import xml_parser.create_xml as create_xml
import xml_parser.read_xml as read_xml
import xml_parser.validate as validate


file_path = create_xml.create_xml_file(get_book_content(), get_book_metadata())

# print(file_path)

validate.validate_all_xml_files()

# book_dict = read_xml.parse_xml_file('/Users/pavanmandava/PythonWorkspace/bitext-aligner/xml_files/abcdef_en.xml')
@@ -0,0 +1,38 @@
import xmlschema
import json
import utils.json_utils as json_utils
import utils.constants as const
import os


def is_valid(book_schema, xml_path):
    return book_schema.is_valid(xml_path)


def get_book_schema(book_xsd_path):
    xsd_full_path = os.path.dirname(os.path.dirname(__file__))+'/'+book_xsd_path
    book_schema = xmlschema.XMLSchema(xsd_full_path)
    return book_schema


def validate_all_xml_files():

    json_data = json_utils.read_json_file(const.JSON_PATH)

    book_schema = get_book_schema(const.XSD_PATH)

    books_json = json_data['books']
    for book_code in books_json.keys():
        books_list = books_json[book_code]
        for book in books_list:
            if book['is_validated']:
                print(const.BLUE, 'Book : ', book['xml_file'], ' is valid', const.END)
                continue
            else:
                if 'xml_file_path' in book:
                    result = book_schema.is_valid(book['xml_file_path'])
                    print('Validating Book : ', book['xml_file'], ' -> ', result)
                    book['is_validated'] = result

    json_data['books'] = books_json
    json_utils.write_json_file(const.JSON_PATH, json_data)