Merge remote-tracking branch 'origin/master'

master
Nerv 6 years ago
commit 1540b701a4

1
.gitignore vendored

@ -129,3 +129,4 @@ dmypy.json
.pyre/
.idea
*.mwb

@ -0,0 +1,170 @@
# -*- coding: utf-8 -*-
import os,sys
import re
import pandas as pd
import numpy as np
from numpy import cumsum
from pandas import DataFrame
from nltk import word_tokenize, sent_tokenize
#import xml.etree.ElementTree as ET
from jellyfish import levenshtein_distance as lev
#import six
from google.cloud import translate_v2 as translate
from itertools import product as cp
translate_client = translate.Client()
'''
'''
def master_align(text0, text1, lang0, lang1):
    """Align two equivalent texts (an original and its translation).

    :param text0: first text (in language ``lang0``)
    :param text1: second, equivalent text (in language ``lang1``)
    :param lang0: language code of text0 (e.g. 'en')
    :param lang1: language code of text1
    :return: pair of dicts ``(textdict0, textdict1)`` mapping segment
             number -> joined sentence text, with matching keys.
    """
    # Each frame carries ['sent', 'trans', 'rellen', 'relpos'] columns
    # (suffixed '0' or '1' depending on which text it came from).
    df0 = frame_from_text(text0, lang0, lang1)
    df1 = frame_from_text(text1, lang1, lang0, is1=True)
    # High-confidence sentence pairs that pin the two texts together.
    anchors = anchors_from_frames(df0, df1, window=2)
    # Greedy alignment of the stretches between consecutive anchors.
    alignments = intermediate_align(df0, df1, anchors, lookahead=4)
    textdict0, textdict1 = textdicts_from_alignments(df0, df1, alignments)
    return textdict0, textdict1
def frame_from_text(text, source, target, is1=False):
    """Sentence-tokenize ``text`` and build its alignment DataFrame.

    Columns (suffixed '0' or '1' depending on ``is1``):
      sent   - original sentence
      trans  - machine translation of the sentence into ``target``
      rellen - sentence length relative to the longest sentence
      relpos - (start, end) cumulative relative position of the sentence

    :param text: full text to split into sentences
    :param source: source language code for translation
    :param target: target language code for translation
    :param is1: True when this is the second text (columns suffixed '1')
    :return: pandas DataFrame with the four columns above
    """
    suffix = str(int(is1))
    cols = [c + suffix for c in ['sent', 'trans', 'rellen', 'relpos']]
    frame = pd.DataFrame({cols[0]: sent_tokenize(text)})
    # Network call per sentence; 'nmt' selects Google's neural model.
    frame[cols[1]] = frame[cols[0]].apply(
        lambda s: translate_client.translate(
            s, source_language=source, target_language=target,
            model='nmt')['translatedText'])
    # Vectorized length, normalized by the longest sentence.
    lengths = frame[cols[0]].str.len()
    frame[cols[2]] = lengths / lengths.max()
    # relpos = (cumulative start, cumulative end) of each sentence.
    cumul_b = list(np.cumsum(frame[cols[2]]))
    cumul_a = [0] + cumul_b[:-1]
    frame[cols[3]] = pd.Series(list(zip(cumul_a, cumul_b)))
    return frame
def anchors_from_frames(frame0, frame1, window):
    """Find anchor sentence pairs: pairs whose cross-translations are
    close enough (Levenshtein) and whose relative lengths agree.

    :param frame0: frame from the first text (columns suffixed '0')
    :param frame1: frame from the second text (columns suffixed '1')
    :param window: index padding around positional-overlap candidates
    :return: list of [index0, index1] arrays for the anchor pairs
    """
    pairdf = generate_pairdf(frame0, frame1, window)
    # Expose the index as a column so candidate pairs can be joined
    # with the sentence data. NOTE: mutates the caller's frames.
    frame0['index0'] = frame0.index
    frame1['index1'] = frame1.index
    pairdf = pairdf.merge(frame0, on='index0').merge(frame1, on='index1')
    # Distance of each original against the other side's translation.
    pairdf['lev0'] = pairdf.apply(lambda r: trdist(r.sent0, r.trans1), axis=1)
    pairdf['lev1'] = pairdf.apply(lambda r: trdist(r.sent1, r.trans0), axis=1)
    pairdf['rellen_ratio'] = (pairdf.rellen0 / pairdf.rellen1).apply(gr1)
    pairdf['minlev'] = pairdf[['lev0', 'lev1']].min(axis=1)
    # BUG FIX: maxlev was computed with .min(axis=1), which made it a
    # duplicate of minlev and let one-sided matches pass the 0.6 cutoff.
    pairdf['maxlev'] = pairdf[['lev0', 'lev1']].max(axis=1)
    pairdf['isanchor'] = ((pairdf.minlev < 0.45)
                          & (pairdf.maxlev < 0.6)
                          & (pairdf.rellen_ratio < 1.3))
    return list(pairdf[pairdf.isanchor][['index0', 'index1']].values)
def intermediate_align(frame0, frame1, anchs, lookahead):
    """Stitch together the full alignment: every anchor pair aligned
    one-to-one, and the stretches between consecutive anchors aligned
    greedily by get_interalign.

    Returns a list of ((i_start, i_end), (j_start, j_end)) range pairs.
    """
    total0, total1 = frame0.shape[0], frame1.shape[0]
    # Bracket the anchors with sentinels: a virtual (-1,-1) anchor in
    # front and the frame ends behind, then walk consecutive pairs.
    starts = [(-1, -1)] + anchs
    stops = anchs + [(total0, total1)]
    aligns = []
    for lo, hi in zip(starts, stops):
        a, b = lo
        aligns.append(((a, b), (a, b)))
        aligns.extend(get_interalign(frame0, frame1, lo, hi, lookahead))
    # Drop the leading sentinel entry for (-1, -1).
    return aligns[1:]
def get_interalign(df0, df1, anchors_init, anchors_next, lookahead):
    """Greedily align the sentences strictly between two anchors.

    Starting just after ``anchors_init``, repeatedly picks the
    lowest-scoring grouping that extends either side by at most
    ``lookahead`` sentences (one side must stay fixed), until either
    side reaches ``anchors_next``.

    :return: list of ((i, j), (i_, j_)) inclusive range pairs
    """
    interaligns = []
    i, j = anchors_init
    i += 1
    j += 1
    end0, end1 = anchors_next
    while i < end0 and j < end1:
        room0 = min(end0 - i, lookahead)
        room1 = min(end1 - j, lookahead)
        # Candidate end points: grow one side while the other stays put.
        candidates = [(x, y)
                      for x, y in cp(range(i, i + room0), range(j, j + room1))
                      if x == i or y == j]
        i_, j_ = min(candidates, key=lambda c: score(df0, df1, i, j, *c))
        interaligns.append(((i, j), (i_, j_)))
        i, j = i_ + 1, j_ + 1
    return interaligns
def score(frame0, frame1, start0, start1, end0, end1):
    """Alignment cost of grouping sentences start0..end0 of frame0 with
    start1..end1 of frame1 (both ranges inclusive: pandas .loc slicing).

    Lower is better: the mean of the two cross-translation distances
    (each side's original text against the other side's machine
    translation), scaled by the >=1 ratio of the groups' relative lengths
    so length-mismatched groupings are penalized.
    """
    s0 = ' '.join(frame0.loc[start0:end0, 'sent0'])
    s1 = ' '.join(frame1.loc[start1:end1, 'sent1'])
    t0 = ' '.join(frame0.loc[start0:end0, 'trans0'])
    t1 = ' '.join(frame1.loc[start1:end1, 'trans1'])
    l0 = sum(frame0.loc[start0:end0, 'rellen0'])
    l1 = sum(frame1.loc[start1:end1, 'rellen1'])
    return (trdist(s0,t1)+trdist(s1,t0))*gr1(l0/l1)/2
def textdicts_from_alignments(frame0, frame1, aligns):
    """Materialize aligned text segments from index-range alignments.

    :param frame0: frame with a 'sent0' column
    :param frame1: frame with a 'sent1' column
    :param aligns: list of ((a0, a1), (b0, b1)) pairs; each range
                   a..b is inclusive (pandas .loc slicing)
    :return: two dicts with matching keys, segment number -> joined text
    """
    textdict0, textdict1 = {}, {}
    for i, ((a0, a1), (b0, b1)) in enumerate(aligns):
        textdict0[i] = ' '.join(frame0.loc[a0:b0, 'sent0'])
        textdict1[i] = ' '.join(frame1.loc[a1:b1, 'sent1'])
    return textdict0, textdict1
def generate_pairdf(frame0, frame1, window):
    """Candidate sentence-index pairs for anchor search.

    A pair (i, j) is a candidate when the relative-position ranges of
    sentence i in frame0 and sentence j in frame1 overlap; each such
    pair is then padded by every offset within +/- ``window`` on both
    axes (clipped to valid indices).

    :return: DataFrame with columns ['index0', 'index1'], sorted.
    """
    overlapping = [(i, j)
                   for (i, (a, b)), (j, (c, d)) in cp(enumerate(frame0.relpos0),
                                                      enumerate(frame1.relpos1))
                   if get_overlap(a, b, c, d) > 0]
    len0, len1 = frame0.shape[0], frame1.shape[0]
    # Pad each overlapping pair by the window on both axes; a set
    # deduplicates the padded neighborhoods directly.
    padded = {(i + k, j + l)
              for i, j in overlapping
              for k in range(-window, window + 1)
              for l in range(-window, window + 1)}
    allpairs = sorted(p for p in padded
                      if min(p) > -1 and p[0] < len0 and p[1] < len1)
    return pd.DataFrame(allpairs).rename(columns={0: 'index0', 1: 'index1'})
def get_overlap(a, b, c, d):
    """Length of the overlap between intervals [a, b) and [c, d).

    Assumes well-formed intervals (a <= b, c <= d); returns 0 when they
    are disjoint or merely touch. Replaces the original four-branch
    case analysis with the equivalent closed form.
    """
    return max(0, min(b, d) - max(a, c))
def less1(x):
    """Fold x into magnitude <= 1: invert it when |x| > 1, else keep it."""
    return 1 / x if abs(x) > 1 else x


def gr1(x):
    """Fold a ratio into its magnitude >= 1 form: x when |x| > 1, else 1/x.

    Used to make length ratios symmetric (gr1(r) == gr1(1/r) for r > 0).
    Raises ZeroDivisionError for x == 0, like the original lambda.
    """
    return 1 / less1(x)


def trdist(x, y):
    """Levenshtein distance between x and y, normalized to [0, 1] by the
    longer string's length."""
    return lev(x, y) / max(len(x), len(y))

@ -1,10 +1,12 @@
from collections import OrderedDict
import os
import pandas as pd
def get_book_content():
df = pd.read_csv("test_example.csv", header=None).rename(
csv_path = os.path.dirname(os.path.realpath(__file__)) + '/test_example.csv'
print('Test CSV File :: ', csv_path)
df = pd.read_csv(csv_path, header=None).rename(
columns={0: 'chapter', 1: 'sentence', 2: 'text'})
book_dict = OrderedDict()
@ -25,22 +27,24 @@ def get_book_content():
def get_book_metadata():
dict_metadata = {
"book_id": "abcdef",
"title": "Bullshit",
"book_id": "fdcap_book",
"title": "Crime and Punishment",
"lang": "en",
"isTranslation": "true",
"totalChapters": "2",
"authors": [
{
"name": "Herr Riley",
"name": "Herr Isaac Riley",
"translator": "true"
},
{
"name": "Herr Singh"
"name": "Fyodor Dostoevsky"
}
],
"description": "Some Random Bullshit description",
"source": "https://www.idontcare.com"
"description": "Crime and Punishment (Russian: Преступление и наказание) is a novel written by Russian author "
"Fyodor Dostoevsky.First published in a journal named The Russian Messenger, it appeared in "
"twelve monthly installments in 1866, and was later published as a novel",
"source": "https://en.wikisource.org/wiki/Crime_and_Punishment"
}
return dict_metadata

@ -0,0 +1,271 @@
import time
import db.mysql_connection as mysql
import db.constants as const
def add_book_to_db(book_code, book_dict):
    """Persist a parsed book (see xml_parser.read_xml.parse_xml_file for the
    dict shape) into MySQL inside a single transaction.

    Inserts, in order: the book row, its info row, the authors (created or
    updated) plus the book-author mappings, the content row, and finally
    every chapter with its sentences. Any step that returns a non-positive
    row id stops further work; the transaction is committed only when the
    final ``last_rowid`` is positive, otherwise rolled back.

    :param book_code: unique code of the book (dim_book.code)
    :param book_dict: nested dict with 'bookInfo' and 'content' keys
    :return: True when the transaction was committed, False otherwise
    """
    conn = mysql.get_new_mysql_connection(const.DB_CONFIG_FILE)
    if conn is None:
        return False
    db_cursor = conn.cursor(buffered=True)
    # Add the book row to dim_book first; everything else references it.
    book_row = {
        'code': book_code,
        'added_at': int(time.time())
    }
    # Each helper returns the new row id, or -1 on failure.
    last_rowid = add_book_row_to_table(db_cursor, const.BOOK_INSERT_QUERY, book_row)
    book_id = last_rowid
    print('Book Row Id :: ', last_rowid)
    book_info_dict = book_dict['bookInfo']
    if last_rowid > 0:
        book_info_row = {
            'title': book_info_dict['title'],
            'description': book_info_dict['description'] if 'description' in book_info_dict else None,
            'lang': book_info_dict['lang'],
            'source': book_info_dict['source'],
            'is_translation': 'true' == book_info_dict['isTranslation'].lower(),
            'total_chapters': book_info_dict['totalChapters'],
            'isbn': book_info_dict['isbn'] if 'isbn' in book_info_dict else None,
            'book': book_id
        }
        last_rowid = add_book_info_row_to_table(db_cursor, const.BOOK_INFO_INSERT_QUERY, book_info_row)
        print('Book Info Row Id :: ', last_rowid)
    if last_rowid > 0:
        book_info_id = last_rowid
        authors_list = book_info_dict['authors']
        for author in authors_list:
            # Search by normalized name; an existing author gets its book
            # count bumped, a new one is inserted as-is.
            author_row = {
                'id': -1,
                'name': author['name'].strip().lower(),
                'total_books': 1
            }
            author_row = search_author(db_cursor, const.AUTHOR_SEARCH_QUERY, author_row)
            print('Author Search Result :: ', author_row)
            if author_row['id'] > 0:
                author_row['total_books'] = author_row['total_books'] + 1
                last_rowid = update_author_book_count(db_cursor, const.AUTHOR_UPDATE_QUERY, author_row)
                print('Author Update Row count :: ', last_rowid)
                if last_rowid <= 0:
                    break
            else:
                author_row['name'] = author['name']
                author_row['total_books'] = 1
                last_rowid = add_author_to_table(db_cursor, const.AUTHOR_INSERT_QUERY, author_row)
                print('Add Author Row Id :: ', last_rowid)
                if last_rowid > 0:
                    author_row['id'] = last_rowid
            if author_row['id'] > 0:
                author_is_translator = False
                if 'translator' in author:
                    author_is_translator = 'true' == author['translator'].lower()
                # NOTE(review): the mapping stores book_info_id in its
                # 'book' field — confirm map_book_author really references
                # dim_book_info rather than dim_book.
                map_author_book = {
                    'author': author_row['id'],
                    'book': book_info_id,
                    'translator': author_is_translator
                }
                last_rowid = add_author_book_mapping(db_cursor, const.BOOK_AUTHOR_INSERT_QUERY, map_author_book)
                print('Author Book Mapping Row ID :: ', last_rowid)
                if last_rowid < 0:
                    break
    if last_rowid > 0:
        book_content_row = {
            'book': book_id
        }
        last_rowid = add_book_content_row_to_table(db_cursor, const.CONTENT_INSERT_QUERY, book_content_row)
        print('Book Content Row Id :: ', last_rowid)
    if last_rowid > 0:
        content_id = last_rowid
        book_chapters_list = book_dict['content']['chapters']
        for chapter in book_chapters_list:
            book_chapter_row = {
                'c_num': chapter['num'],
                'name': chapter['name'] if 'name' in chapter else None,
                'book_content': content_id
            }
            chapter_id = add_book_chapter_to_table(db_cursor, const.CHAPTER_INSERT_QUERY, book_chapter_row)
            print('Book Chapter Row Id :: ', chapter_id)
            if chapter_id > 0:
                sentences_dict = chapter['sentences']
                for s_num in sentences_dict.keys():
                    sentence_row = {
                        's_num': s_num,
                        'text': sentences_dict[s_num],
                        'chapter': chapter_id
                    }
                    sen_id = add_book_sentence_to_table(db_cursor, const.SENTENCE_INSERT_QUERY, sentence_row)
                    print('Book Sentence Id :: ', sen_id)
                    # NOTE(review): a failed sentence insert breaks out but
                    # leaves last_rowid positive from earlier steps, so the
                    # transaction may still commit — verify this is intended.
                    if sen_id <= 0:
                        break
                    else:
                        last_rowid = sen_id
            else:
                break
    db_cursor.close()
    # Commit only when the last tracked insert succeeded.
    is_success = False
    if last_rowid > 0:
        conn.commit()
        is_success = True
    else:
        conn.rollback()
        is_success = False
    conn.close()
    return is_success
def add_book_row_to_table(db_cursor, book_insert_query, book_row):
    """Insert one row into dim_book; return its generated id, or -1 on
    any failure (including a missing lastrowid)."""
    try:
        db_cursor.execute(book_insert_query, book_row)
        new_id = db_cursor.lastrowid
        return new_id if new_id is not None else -1
    except Exception as e:
        print(str(e))
        return -1
def add_book_info_row_to_table(db_cursor, book_info_insert_query, book_info_row):
    """Insert one row into dim_book_info; return its generated id, or -1
    on any failure (including a missing lastrowid)."""
    try:
        db_cursor.execute(book_info_insert_query, book_info_row)
        new_id = db_cursor.lastrowid
        return new_id if new_id is not None else -1
    except Exception as e:
        print(str(e))
        return -1
def add_book_content_row_to_table(db_cursor, book_content_insert_query, book_content_row):
    """Insert one row into dim_book_content; return its generated id, or
    -1 on any failure (including a missing lastrowid)."""
    try:
        db_cursor.execute(book_content_insert_query, book_content_row)
        new_id = db_cursor.lastrowid
        return new_id if new_id is not None else -1
    except Exception as e:
        print(str(e))
        return -1
def add_book_chapter_to_table(db_cursor, book_chapter_insert_query, book_chapter_row):
    """Insert one row into dim_book_chapter; return its generated id, or
    -1 on any failure (including a missing lastrowid)."""
    try:
        db_cursor.execute(book_chapter_insert_query, book_chapter_row)
        new_id = db_cursor.lastrowid
        return new_id if new_id is not None else -1
    except Exception as e:
        print(str(e))
        return -1
def add_book_sentence_to_table(db_cursor, book_sentence_insert_query, book_sentence):
    """Insert one row into dim_book_sentence; return its generated id, or
    -1 on any failure (including a missing lastrowid)."""
    try:
        db_cursor.execute(book_sentence_insert_query, book_sentence)
        new_id = db_cursor.lastrowid
        return new_id if new_id is not None else -1
    except Exception as e:
        print(str(e))
        return -1
def add_author_to_table(db_cursor, author_insert_query, author_data):
    """Insert one row into dim_author; return its generated id, or -1 on
    any failure (including a missing lastrowid)."""
    try:
        db_cursor.execute(author_insert_query, author_data)
        new_id = db_cursor.lastrowid
        return new_id if new_id is not None else -1
    except Exception as e:
        print(str(e))
        return -1
def add_author_book_mapping(db_cursor, book_author_insert_query, book_author_data):
    """Insert a book-author mapping row; return the affected-row count
    (positive on success), or -1 on failure. The mapping table has no
    auto-increment id, hence rowcount instead of lastrowid."""
    try:
        db_cursor.execute(book_author_insert_query, book_author_data)
        affected = db_cursor.rowcount
        return affected if affected > 0 else -1
    except Exception as e:
        print(str(e))
        return -1
def search_author(db_cursor, author_search_query, author_data):
    """Look up an author by name and fill ``author_data`` in place.

    On a hit, sets 'id' and 'total_books' from the fetched row (columns:
    id, name, total_books). On a miss or any DB error, sets 'id' to -1.
    Always returns the (mutated) author_data dict.
    """
    try:
        db_cursor.execute(author_search_query, author_data)
        row = db_cursor.fetchone()
        if row is None:
            author_data['id'] = -1
        else:
            author_data['id'] = int(row[0])
            author_data['total_books'] = int(row[2])
    except Exception as e:
        print(str(e))
        author_data['id'] = -1
    return author_data
def update_author_book_count(db_cursor, author_update_query, author_data):
    """Run the author book-count UPDATE; return the number of affected
    rows (positive on success), or -1 on failure / no rows updated."""
    try:
        db_cursor.execute(author_update_query, author_data)
        affected = db_cursor.rowcount
        return affected if affected > 0 else -1
    except Exception as e:
        print(str(e))
        return -1

@ -0,0 +1,26 @@
# SQL statements and settings for the db package.
# All queries use named pyformat placeholders (%(name)s) that
# mysql-connector-python fills from the dict passed to cursor.execute().

# Name of the MySQL settings file (resolved relative to the project root
# by mysql_connection.get_new_mysql_connection).
DB_CONFIG_FILE = 'db_config.ini'

BOOK_INSERT_QUERY = "INSERT INTO dim_book (code, added_at) " \
                    "VALUES (%(code)s, %(added_at)s)"

AUTHOR_INSERT_QUERY = "INSERT INTO dim_author (name, total_books) " \
                      "VALUES (%(name)s, %(total_books)s)"

BOOK_INFO_INSERT_QUERY = "INSERT INTO dim_book_info (title, description, lang, source, is_translation, " \
                         "total_chapters, isbn, book) " \
                         "VALUES (%(title)s, %(description)s, %(lang)s, %(source)s, %(is_translation)s, " \
                         "%(total_chapters)s, %(isbn)s, %(book)s) "

BOOK_AUTHOR_INSERT_QUERY = "INSERT INTO map_book_author (author, book, translator) " \
                           "VALUES (%(author)s, %(book)s, %(translator)s)"

CONTENT_INSERT_QUERY = "INSERT INTO dim_book_content (book) VALUES(%(book)s)"

CHAPTER_INSERT_QUERY = "INSERT INTO dim_book_chapter (c_num, name, book_content) " \
                       "VALUES (%(c_num)s, %(name)s, %(book_content)s)"

SENTENCE_INSERT_QUERY = "INSERT INTO dim_book_sentence (s_num, text, chapter) VALUES (%(s_num)s, %(text)s, %(chapter)s)"

# Exact-name lookup; db.add_book.search_author reads columns by position
# (id, name, total_books).
AUTHOR_SEARCH_QUERY = "SELECT * FROM dim_author WHERE dim_author.name = %(name)s"

# Bump an existing author's book counter.
AUTHOR_UPDATE_QUERY = "UPDATE dim_author SET dim_author.total_books = %(total_books)s WHERE id = %(id)s"

@ -0,0 +1,32 @@
import mysql.connector
from mysql.connector import errorcode
import db.read_config as config
import utils.constants as const
import os
def get_new_mysql_connection(config_file_name):
    """Open and return a new MySQL connection, or None when it cannot be
    established.

    Reads the [mysql] section of ``config_file_name`` (resolved relative
    to the parent directory of this package), connects, disables
    autocommit so callers control transactions, and prints a colored
    status line. Access/credential errors are reported but not raised.

    :param config_file_name: name of the ini file with connection settings
    :return: an open mysql.connector connection, or None on failure
    """
    config_file_path = os.path.dirname(os.path.dirname(__file__))+'/'+config_file_name
    db_config = config.read_db_config(config_file_path, 'mysql')
    connection = None
    try:
        connection = mysql.connector.connect(**db_config)
    except mysql.connector.Error as err:
        # Translate the two most common failure codes into friendly output.
        if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
            print(const.WARNING, 'Invalid Database User and Password', const.END)
        elif err.errno == errorcode.ER_BAD_DB_ERROR:
            print(const.WARNING, 'Database doesn\'t exist ', const.END)
        else:
            print(err)
    if connection is not None:
        if connection.is_connected():
            # Callers commit/rollback explicitly (see db.add_book).
            connection.autocommit = False
            print(const.GREEN, 'MySQL Connection Successful => Connection ID :: ', connection.connection_id, const.END)
        else:
            connection = None
    return connection

@ -0,0 +1,33 @@
from configparser import ConfigParser
import os
def read_db_config(filename, section):
    """Read a database configuration file and return its parameters.

    The 'password' and 'host' entries in the file hold environment
    variable *names*; they are resolved to the variables' values here.

    :param filename: path of the configuration (ini) file
    :param section: section of the file holding the database settings
    :return: dict of database parameters
    :raises Exception: when ``section`` is missing from the file
    """
    parser = ConfigParser()
    parser.read(filename)
    if not parser.has_section(section):
        raise Exception('{0} not found in the {1} file'.format(section, filename))
    db = dict(parser.items(section))
    # Secrets are indirected through the environment; warn when unset.
    for key in ('password', 'host'):
        try:
            db[key] = os.environ[db[key]]
        except KeyError:
            print('Please set the Environment Variable ', db[key])
    return db

@ -0,0 +1,10 @@
import db.mysql_connection as connection
import db.constants as const
# Smoke test: open a connection, report server details, close it.
conn = connection.get_new_mysql_connection(const.DB_CONFIG_FILE)
# BUG FIX: get_new_mysql_connection returns None on failure; the original
# called methods on it unconditionally and crashed with AttributeError.
if conn is not None:
    print('MySQL Server version :: ', conn.get_server_info())
    print('isConnected :: ', conn.is_connected())
    conn.close()
else:
    print('Could not establish a MySQL connection')

@ -0,0 +1,6 @@
[mysql]
host = MYSQL_HOST
port = 3306
database = bitext-aligner
user = root
password = MYSQL_PASSWORD

Binary file not shown.

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.4 MiB

After

Width:  |  Height:  |  Size: 1.4 MiB

@ -1,5 +1,5 @@
-- MySQL Script generated by MySQL Workbench
-- Thu Jan 16 23:41:59 2020
-- Wed Jan 22 11:08:37 2020
-- Model: New Model Version: 1.0
-- MySQL Workbench Forward Engineering
@ -21,6 +21,8 @@ USE `bitext-aligner` ;
-- -----------------------------------------------------
-- Table `bitext-aligner`.`dim_author`
-- -----------------------------------------------------
DROP TABLE IF EXISTS `bitext-aligner`.`dim_author` ;
CREATE TABLE IF NOT EXISTS `bitext-aligner`.`dim_author` (
`id` INT NOT NULL AUTO_INCREMENT,
`name` VARCHAR(90) NOT NULL,
@ -32,9 +34,12 @@ ENGINE = InnoDB;
-- -----------------------------------------------------
-- Table `bitext-aligner`.`dim_book`
-- -----------------------------------------------------
DROP TABLE IF EXISTS `bitext-aligner`.`dim_book` ;
CREATE TABLE IF NOT EXISTS `bitext-aligner`.`dim_book` (
`id` INT NOT NULL,
`id` INT NOT NULL AUTO_INCREMENT,
`code` VARCHAR(90) NOT NULL,
`added_at` BIGINT UNSIGNED NOT NULL,
PRIMARY KEY (`id`))
ENGINE = InnoDB;
@ -42,10 +47,12 @@ ENGINE = InnoDB;
-- -----------------------------------------------------
-- Table `bitext-aligner`.`dim_book_info`
-- -----------------------------------------------------
DROP TABLE IF EXISTS `bitext-aligner`.`dim_book_info` ;
CREATE TABLE IF NOT EXISTS `bitext-aligner`.`dim_book_info` (
`id` INT NOT NULL AUTO_INCREMENT,
`title` VARCHAR(90) NOT NULL,
`description` VARCHAR(200) NULL,
`description` VARCHAR(500) NULL,
`lang` VARCHAR(5) NOT NULL,
`source` VARCHAR(90) NOT NULL,
`is_translation` TINYINT NOT NULL,
@ -53,6 +60,9 @@ CREATE TABLE IF NOT EXISTS `bitext-aligner`.`dim_book_info` (
`isbn` VARCHAR(80) NULL,
`book` INT NOT NULL,
PRIMARY KEY (`id`),
INDEX `book_fk_idx` (`book` ASC),
UNIQUE INDEX `book_UNIQUE` (`book` ASC),
UNIQUE INDEX `id_UNIQUE` (`id` ASC),
CONSTRAINT `info_book_fk`
FOREIGN KEY (`book`)
REFERENCES `bitext-aligner`.`dim_book` (`id`)
@ -60,20 +70,18 @@ CREATE TABLE IF NOT EXISTS `bitext-aligner`.`dim_book_info` (
ON UPDATE NO ACTION)
ENGINE = InnoDB;
CREATE INDEX `book_fk_idx` ON `bitext-aligner`.`dim_book_info` (`book` ASC) VISIBLE;
CREATE UNIQUE INDEX `book_UNIQUE` ON `bitext-aligner`.`dim_book_info` (`book` ASC) VISIBLE;
CREATE UNIQUE INDEX `id_UNIQUE` ON `bitext-aligner`.`dim_book_info` (`id` ASC) VISIBLE;
-- -----------------------------------------------------
-- Table `bitext-aligner`.`dim_book_content`
-- -----------------------------------------------------
DROP TABLE IF EXISTS `bitext-aligner`.`dim_book_content` ;
CREATE TABLE IF NOT EXISTS `bitext-aligner`.`dim_book_content` (
`id` INT NOT NULL,
`id` INT NOT NULL AUTO_INCREMENT,
`book` INT NOT NULL,
PRIMARY KEY (`id`),
INDEX `book_fk_idx` (`book` ASC),
UNIQUE INDEX `book_UNIQUE` (`book` ASC),
CONSTRAINT `content_book_fk`
FOREIGN KEY (`book`)
REFERENCES `bitext-aligner`.`dim_book` (`id`)
@ -81,20 +89,19 @@ CREATE TABLE IF NOT EXISTS `bitext-aligner`.`dim_book_content` (
ON UPDATE CASCADE)
ENGINE = InnoDB;
CREATE INDEX `book_fk_idx` ON `bitext-aligner`.`dim_book_content` (`book` ASC) VISIBLE;
CREATE UNIQUE INDEX `book_UNIQUE` ON `bitext-aligner`.`dim_book_content` (`book` ASC) VISIBLE;
-- -----------------------------------------------------
-- Table `bitext-aligner`.`dim_book_chapter`
-- -----------------------------------------------------
DROP TABLE IF EXISTS `bitext-aligner`.`dim_book_chapter` ;
CREATE TABLE IF NOT EXISTS `bitext-aligner`.`dim_book_chapter` (
`id` INT NOT NULL AUTO_INCREMENT,
`c_num` INT UNSIGNED NOT NULL,
`name` VARCHAR(90) NULL,
`book_content` INT NOT NULL,
PRIMARY KEY (`id`),
INDEX `content_fk_idx` (`book_content` ASC),
CONSTRAINT `ch_content_fk`
FOREIGN KEY (`book_content`)
REFERENCES `bitext-aligner`.`dim_book_content` (`id`)
@ -102,18 +109,19 @@ CREATE TABLE IF NOT EXISTS `bitext-aligner`.`dim_book_chapter` (
ON UPDATE CASCADE)
ENGINE = InnoDB;
CREATE INDEX `content_fk_idx` ON `bitext-aligner`.`dim_book_chapter` (`book_content` ASC) VISIBLE;
-- -----------------------------------------------------
-- Table `bitext-aligner`.`dim_book_sentence`
-- -----------------------------------------------------
DROP TABLE IF EXISTS `bitext-aligner`.`dim_book_sentence` ;
CREATE TABLE IF NOT EXISTS `bitext-aligner`.`dim_book_sentence` (
`id` INT NOT NULL AUTO_INCREMENT,
`s_num` INT UNSIGNED NOT NULL,
`text` VARCHAR(500) NOT NULL,
`text` VARCHAR(900) NOT NULL,
`chapter` INT NOT NULL,
PRIMARY KEY (`id`),
INDEX `chapter_fk_idx` (`chapter` ASC),
CONSTRAINT `sen_chapter_fk`
FOREIGN KEY (`chapter`)
REFERENCES `bitext-aligner`.`dim_book_chapter` (`id`)
@ -121,16 +129,18 @@ CREATE TABLE IF NOT EXISTS `bitext-aligner`.`dim_book_sentence` (
ON UPDATE CASCADE)
ENGINE = InnoDB;
CREATE INDEX `chapter_fk_idx` ON `bitext-aligner`.`dim_book_sentence` (`chapter` ASC) VISIBLE;
-- -----------------------------------------------------
-- Table `bitext-aligner`.`map_book_author`
-- -----------------------------------------------------
DROP TABLE IF EXISTS `bitext-aligner`.`map_book_author` ;
CREATE TABLE IF NOT EXISTS `bitext-aligner`.`map_book_author` (
`author` INT NOT NULL,
`book` INT NOT NULL,
`translator` TINYINT NOT NULL,
INDEX `book_fk_idx` (`book` ASC) ,
INDEX `author_fk_idx` (`author` ASC),
CONSTRAINT `map_book_fk`
FOREIGN KEY (`book`)
REFERENCES `bitext-aligner`.`dim_book_info` (`id`)
@ -143,10 +153,6 @@ CREATE TABLE IF NOT EXISTS `bitext-aligner`.`map_book_author` (
ON UPDATE CASCADE)
ENGINE = InnoDB;
CREATE INDEX `book_fk_idx` ON `bitext-aligner`.`map_book_author` (`book` ASC) VISIBLE;
CREATE INDEX `author_fk_idx` ON `bitext-aligner`.`map_book_author` (`author` ASC) VISIBLE;
SET SQL_MODE=@OLD_SQL_MODE;
SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS;

@ -2,3 +2,7 @@ google-cloud-translate==2.0.0
google-cloud-storage==1.19.1
mysql-connector-python==8.0.19
pandas
xmlschema
numpy
jellyfish
nltk

@ -0,0 +1,38 @@
import xml_parser.read_xml as read_xml
import db.add_book as adb
import xml_parser.validate as validate
import utils.json_utils as json_utils
import utils.constants as const
import utils.env_utils as env
def validate_all_xml_files():
    """Validate every registered XML file against the book XSD
    (thin wrapper delegating to xml_parser.validate)."""
    validate.validate_all_xml_files()
def save_validated_files_to_db():
    """Persist every validated-but-unsaved book to the database.

    Walks the books JSON registry; skips entries that failed XSD
    validation, parses and inserts the rest, records the per-book
    result in 'is_saved_to_db', and writes the registry back.
    """
    json_data = json_utils.read_json_file(const.JSON_PATH)
    books_json = json_data['books']
    for book_code, books_list in books_json.items():
        for book in books_list:
            if not book['is_validated']:
                print(const.WARNING, 'Book : ', book['xml_file'], ' is not validated against XSD', const.END)
                continue
            if not book['is_saved_to_db']:
                print(const.BLUE, 'Adding Book : ', book['xml_file'], ' to the DB', const.END)
                book_dict = read_xml.parse_xml_file(book['xml_file_path'])
                result = adb.add_book_to_db(book_code, book_dict)
                book['is_saved_to_db'] = result
                w_str = const.BLUE if result else const.WARNING
                print(w_str, 'Result :: ', result, const.END, '\n')
    json_data['books'] = books_json
    json_utils.write_json_file(const.JSON_PATH, json_data)
# Script entry point: validate all registered XML files, then persist the
# validated ones. Guarded so importing this module for its functions does
# not trigger the full pipeline as a side effect.
if __name__ == '__main__':
    if env.check_env_variables():
        validate_all_xml_files()
        save_validated_files_to_db()

@ -3,3 +3,4 @@
1,3,"Neque porro quisquam est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et dolore magnam aliquam quaerat voluptatem."
2,1,"Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur?"
2,2,"Quis autem vel eum iure reprehenderit qui in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla pariatur?"
2,3,"Neque porro quisquam est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et dolore magnam aliquam quaerat voluptatem."
1 1 1 Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo.
3 1 3 Neque porro quisquam est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et dolore magnam aliquam quaerat voluptatem.
4 2 1 Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur?
5 2 2 Quis autem vel eum iure reprehenderit qui in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla pariatur?
6 2 3 Neque porro quisquam est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et dolore magnam aliquam quaerat voluptatem.

@ -0,0 +1,15 @@
# Shared constants for the bitext-aligner utilities.

# Paths resolved against the project root by the callers
# (utils.json_utils, xml_parser.validate).
JSON_PATH = 'json/books.json'
XSD_PATH = 'xml_files/book.xsd'

# Names of the environment variables required by the application
# (checked in utils.env_utils.check_env_variables).
TRANSLATE_ENV_VAR = 'GOOGLE_APPLICATION_CREDENTIALS'
MYSQL_PASS_ENV_VAR = 'MYSQL_PASSWORD'
MYSQL_HOST_ENV_VAR = 'MYSQL_HOST'

# ANSI terminal escape codes used to color console output.
WARNING = '\033[91m'  # bright red
END = '\033[0m'  # reset attributes
BLUE = '\033[94m'
GREEN = '\033[92m'

@ -0,0 +1,15 @@
import os
import utils.constants as const
def check_env_variables():
    """Return True when every required environment variable is set.

    Checks the Google credentials, MySQL password and MySQL host
    variables; prints a warning naming the first missing one and
    returns False. Replaces three copy-pasted if-blocks with a loop.
    """
    required = (const.TRANSLATE_ENV_VAR,
                const.MYSQL_PASS_ENV_VAR,
                const.MYSQL_HOST_ENV_VAR)
    for var in required:
        if var not in os.environ:
            print(const.WARNING, 'Please set the ', var,
                  ' Environment Variable to continue....', const.END)
            return False
    return True

@ -0,0 +1,19 @@
import json
import os
def read_json_file(file_path):
    """Read and return JSON data from ``file_path``, resolved relative to
    the parent directory of this module's package (the project root).

    :param file_path: project-root-relative path of the JSON file
    :return: the deserialized JSON data
    """
    json_file_path = os.path.dirname(os.path.dirname(__file__)) + '/' + file_path
    # The with-statement closes the file; the original also called
    # close() redundantly inside the block.
    with open(json_file_path, 'r') as json_file:
        return json.load(json_file)
def write_json_file(file_path, json_data):
    """Serialize ``json_data`` (pretty-printed, indent=4) to ``file_path``,
    resolved relative to the parent directory of this module's package.

    :param file_path: project-root-relative path of the JSON file
    :param json_data: any JSON-serializable object
    """
    json_file_path = os.path.dirname(os.path.dirname(__file__)) + '/' + file_path
    # The with-statement closes the file; the original also called
    # close() redundantly inside the block.
    with open(json_file_path, 'w') as updated_json:
        json.dump(json_data, updated_json, indent=4)

@ -0,0 +1,67 @@
<?xml version="1.0" encoding="UTF-8" ?>
<xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema">
<xsd:element name="book">
<xsd:complexType>
<xsd:sequence>
<xsd:element ref="bookInfo" minOccurs="1" maxOccurs="1" />
<xsd:element ref="content" minOccurs="1" maxOccurs="1" />
</xsd:sequence>
<xsd:attribute name="code" type="xsd:string" use="required" />
</xsd:complexType>
</xsd:element>
<xsd:element name="bookInfo">
<xsd:complexType>
<xsd:sequence>
<xsd:element name="title" type="xsd:string" minOccurs="1" maxOccurs="1" />
<xsd:element name="lang" type="xsd:string" minOccurs="1" maxOccurs="1" />
<xsd:element name="isTranslation" type="xsd:boolean" minOccurs="1" maxOccurs="1" />
<xsd:element name="totalChapters" type="xsd:nonNegativeInteger" minOccurs="1" maxOccurs="1" />
<xsd:element name="source" type="xsd:string" minOccurs="1" maxOccurs="1" />
<xsd:element name="description" type="xsd:string" minOccurs="0" maxOccurs="1" />
<xsd:element name="isbn" type="xsd:string" minOccurs="0" maxOccurs="1" />
<xsd:element ref="author" minOccurs="1" maxOccurs="unbounded" />
</xsd:sequence>
</xsd:complexType>
</xsd:element>
<xsd:element name="author">
<xsd:complexType>
<xsd:simpleContent>
<xsd:extension base="xsd:string">
<xsd:attribute name="translator" type="xsd:boolean" use="optional" />
</xsd:extension>
</xsd:simpleContent>
</xsd:complexType>
</xsd:element>
<xsd:element name="content">
<xsd:complexType>
<xsd:sequence>
<xsd:element ref="chapter" minOccurs="1" maxOccurs="unbounded" />
</xsd:sequence>
</xsd:complexType>
</xsd:element>
<xsd:element name="chapter">
<xsd:complexType>
<xsd:sequence>
<xsd:element ref="sentence" minOccurs="1" maxOccurs="unbounded" />
</xsd:sequence>
<xsd:attribute name="num" type="xsd:nonNegativeInteger" use="required" />
<xsd:attribute name="name" type="xsd:string" use="optional" />
</xsd:complexType>
</xsd:element>
<xsd:element name="sentence">
<xsd:complexType>
<xsd:simpleContent>
<xsd:extension base="xsd:string">
<xsd:attribute name="num" type="xsd:nonNegativeInteger" use="required" />
</xsd:extension>
</xsd:simpleContent>
</xsd:complexType>
</xsd:element>
</xsd:schema>

@ -2,17 +2,17 @@
<book code="abc_book">
<bookInfo>
<title>Crime and Punishment</title>
<lang>en</lang>
<isTranslation>true</isTranslation>
<totalChapters>2</totalChapters>
<source>https://en.wikisource.org/wiki/Crime_and_Punishment</source>
<description> <!--Optional-->
Crime and Punishment (Russian: Преступление и наказание) is a novel written by Russian author Fyodor Dostoevsky.
First published in a journal named The Russian Messenger, it appeared in twelve monthly installments in 1866, and was later published as a novel.
</description>
<lang>en</lang>
<isbn>n.a.</isbn> <!--Optional-->
<author>Fyodor Dostoevsky</author>
<author translator="true">Constance Garnett</author>
<source>https://en.wikisource.org/wiki/Crime_and_Punishment</source>
<isTranslation>true</isTranslation>
<totalChapters>2</totalChapters>
<isbn>n.a.</isbn> <!--Optional-->
</bookInfo>
<content>
<chapter num="1" name="Erstes Kapitel">

@ -1,12 +1,16 @@
from pathlib import Path
import json
import utils.constants as const
import os
json_file_path = Path('json/books.json')
json_data = {'books': []}
json_path = os.path.dirname(os.path.dirname(__file__))+'/'+const.JSON_PATH
json_file_path = Path(json_path)
json_data = {'books': {}}
if not json_file_path.is_file():
json_file = open(json_file_path, 'w')
json_file.write(json.dumps(json_data, indent=4))
json_file.close()
print('JSON File Created :: '+json_file.name)
print(const.BLUE, 'JSON File Created :: '+json_file.name, const.END)

@ -2,7 +2,8 @@ from xml.etree import ElementTree as ET
from xml.dom import minidom
import os
import json
from pathlib import Path
import utils.json_utils as json_utils
import utils.constants as const
def create_xml_file(book_dict, book_metadata):
@ -24,14 +25,13 @@ def create_xml_file(book_dict, book_metadata):
total_chapters = ET.SubElement(book_info, 'totalChapters')
total_chapters.text = book_metadata['totalChapters']
source = ET.SubElement(book_info, 'source')
source.text = book_metadata['source']
if 'description' in book_metadata:
description = ET.SubElement(book_info, 'description')
description.text = book_metadata['description']
if 'source' in book_metadata:
source = ET.SubElement(book_info, 'source')
source.text = book_metadata['source']
if 'isbn' in book_metadata:
isbn = ET.SubElement(book_info, 'isbn')
isbn.text = book_metadata['isbn']
@ -48,40 +48,44 @@ def create_xml_file(book_dict, book_metadata):
chapter.set('num', str(key))
for idx, val in enumerate(book_dict[key]):
sentence = ET.SubElement(chapter, 'sentence')
sentence.set('id', str(idx + 1))
sentence.set('num', str(idx + 1))
sentence.text = val
# tree = ET.ElementTree(book_root)
# tree.write(filename)
root_dir = os.path.dirname(os.path.dirname(__file__))
output_dir = os.path.join(root_dir, "xml_files")
filename = book_root.get('id') + "_" + lang.text + ".xml"
filename = book_root.get('code') + "_" + lang.text + ".xml"
file = open(output_dir + '/' + filename, 'w')
file_path = file.name
print('XML File Path :: ', file_path)
file.write(prettify(book_root))
file.close()
print(const.BLUE, 'Saved XML File Path :: ', file_path, const.END)
json_obj = {}
json_obj['book_id'] = book_root.get('id')
book_code = book_root.get('code')
json_obj['xml_file'] = filename
json_obj['lang'] = lang.text
json_obj['xml_file_path'] = file_path
json_obj['is_validated'] = False
json_obj['is_saved_to_db'] = False
add_xml_book_data_to_json(json_obj)
add_xml_book_data_to_json(book_code, json_obj)
return file_path
def add_xml_book_data_to_json(book_code, json_obj):
json_data = json_utils.read_json_file(const.JSON_PATH)
def add_xml_book_data_to_json(json_obj):
json_file_path = Path('json/books.json')
books = json_data['books']
if book_code in books.keys():
books[book_code].append(json_obj)
else:
books[book_code] = [json_obj]
json_file = open(json_file_path, 'r')
json_data = json.load(json_file)
json_file.close()
json_data['books'] = books
json_file = open(json_file_path, 'w')
json_data['books'].append(json_obj)
json_file.write(json.dumps(json_data, indent=4))
json_file.close()
json_utils.write_json_file(const.JSON_PATH, json_data)
def prettify(root):

@ -0,0 +1,42 @@
import xml.etree.ElementTree as ET
def parse_xml_file(full_path):
    """Parse a book XML file into a nested dict.

    Result shape:
      {'code': str,
       'bookInfo': {..., 'authors': [{'name', 'translator'?}, ...]},
       'content': {'chapters': [{'num', 'name'?, 'sentences': {num: text}}]}}
    """
    root = ET.parse(full_path).getroot()
    book_dict = {'code': root.attrib['code']}

    # bookInfo: authors are collected into a list, every other child
    # becomes a plain tag -> text entry.
    info = {'authors': []}
    for node in root.find('bookInfo'):
        if node.tag == 'author':
            entry = {'name': node.text}
            if 'translator' in node.attrib:
                entry['translator'] = node.attrib['translator']
            info['authors'].append(entry)
        else:
            info[node.tag] = node.text
    book_dict['bookInfo'] = info

    # content: one dict per chapter, sentences keyed by their 'num'.
    chapters = []
    for chapter in root.find('content'):
        chapter_entry = {'num': chapter.attrib['num']}
        if 'name' in chapter.attrib:
            chapter_entry['name'] = chapter.attrib['name']
        chapter_entry['sentences'] = {s.attrib['num']: s.text
                                      for s in chapter.findall('sentence')}
        chapters.append(chapter_entry)
    book_dict['content'] = {'chapters': chapters}
    return book_dict

@ -1,6 +0,0 @@
from csv2df import get_book_content, get_book_metadata
from xml_parser.create_xml import create_xml_file
create_xml_file(get_book_content(), get_book_metadata())

@ -0,0 +1,14 @@
from csv2df import get_book_content, get_book_metadata
import xml_parser.create_xml as create_xml
import xml_parser.read_xml as read_xml
import xml_parser.validate as validate
# Build the XML file from the CSV source data, then validate everything
# registered in the books JSON against the XSD.
file_path = create_xml.create_xml_file(get_book_content(), get_book_metadata())
validate.validate_all_xml_files()
# Example of parsing a generated file back into a dict:
# book_dict = read_xml.parse_xml_file('xml_files/abcdef_en.xml')

@ -0,0 +1,38 @@
import xmlschema
import json
import utils.json_utils as json_utils
import utils.constants as const
import os
def is_valid(book_schema, xml_path):
    """Return True when the XML document at ``xml_path`` conforms to
    ``book_schema`` (an xmlschema.XMLSchema-like object)."""
    result = book_schema.is_valid(xml_path)
    return result
def get_book_schema(book_xsd_path):
    """Load and return the XMLSchema for the book XSD.

    ``book_xsd_path`` is resolved relative to the parent directory of
    this module's package (the project root).
    """
    base_dir = os.path.dirname(os.path.dirname(__file__))
    schema = xmlschema.XMLSchema(base_dir + '/' + book_xsd_path)
    return schema
def validate_all_xml_files():
    """Validate every not-yet-validated book XML against the book XSD
    and persist the updated flags back to the books JSON registry."""
    json_data = json_utils.read_json_file(const.JSON_PATH)
    book_schema = get_book_schema(const.XSD_PATH)
    books_json = json_data['books']
    for books_list in books_json.values():
        for book in books_list:
            if book['is_validated']:
                print(const.BLUE, 'Book : ', book['xml_file'], ' is valid', const.END)
                continue
            if 'xml_file_path' in book:
                result = book_schema.is_valid(book['xml_file_path'])
                print('Validating Book : ', book['xml_file'], ' -> ', result)
                book['is_validated'] = result
    json_data['books'] = books_json
    json_utils.write_json_file(const.JSON_PATH, json_data)
Loading…
Cancel
Save