commit 1540b701a4
@@ -0,0 +1,170 @@
# -*- coding: utf-8 -*-
import os, sys
import re
import pandas as pd
import numpy as np
from numpy import cumsum
from pandas import DataFrame
from nltk import word_tokenize, sent_tokenize
#import xml.etree.ElementTree as ET
from jellyfish import levenshtein_distance as lev
#import six
from google.cloud import translate_v2 as translate
from itertools import product as cp

translate_client = translate.Client()

''' Sentence-level alignment of a text and its translation, using round-trip
machine translation and normalized Levenshtein distance. '''


def master_align(text0, text1, lang0, lang1):
    """ Takes two equivalent texts (original and translation) and returns
    aligned texts. """
    df0 = frame_from_text(text0, lang0, lang1)
    print('A')
    df1 = frame_from_text(text1, lang1, lang0, is1=True)
    print('B')
    # returns dfs with ['sent', 'trans', 'rellen', 'relpos']
    anchors = anchors_from_frames(df0, df1, window=2)
    print('C')
    alignments = intermediate_align(df0, df1, anchors, lookahead=4)
    print('D')
    textdict0, textdict1 = textdicts_from_alignments(df0, df1, alignments)
    print('E')
    return textdict0, textdict1


def frame_from_text(text, source, target, is1=False):
    """ Builds a frame with one row per sentence: the sentence itself, its
    machine translation, its relative length and its relative position. """
    #print(source, '-->', target)
    cols = [c+str(int(is1)) for c in ['sent', 'trans', 'rellen', 'relpos']]
    frame = pd.DataFrame(columns=cols)
    frame[cols[0]] = sent_tokenize(text)
    frame[cols[1]] = frame[cols[0]].apply(lambda x: translate_client.translate(x, source_language=source, target_language=target, model='nmt')['translatedText'])
    frame[cols[2]] = frame[cols[0]].apply(len)
    frame[cols[2]] = frame[cols[2]]/frame[cols[2]].max()
    # relative position of each sentence = (cumulative length before it, after it)
    cumul_b = list(np.cumsum(frame[cols[2]]))
    cumul_a = [0]+cumul_b[:-1]
    frame[cols[3]] = pd.Series(list(zip(cumul_a, cumul_b)))
    return frame


def anchors_from_frames(frame0, frame1, window):
    """ Returns (index0, index1) pairs of sentences that match so well that
    they can serve as anchors for the alignment. """
    pairdf = generate_pairdf(frame0, frame1, window)
    frame0['index0'] = frame0.index
    frame1['index1'] = frame1.index
    pairdf = pairdf.merge(frame0, on='index0').merge(frame1, on='index1')
    pairdf['lev0'] = pairdf.apply(lambda x: trdist(x.sent0, x.trans1), axis=1)
    pairdf['lev1'] = pairdf.apply(lambda x: trdist(x.sent1, x.trans0), axis=1)
    pairdf['rellen_ratio'] = (pairdf.rellen0/pairdf.rellen1).apply(gr1)
    pairdf['minlev'] = pairdf[['lev0', 'lev1']].min(axis=1)
    pairdf['maxlev'] = pairdf[['lev0', 'lev1']].max(axis=1)
    pairdf['isanchor'] = (pairdf.minlev<0.45) & (pairdf.maxlev<0.6) & (pairdf.rellen_ratio<1.3)
    return list(pairdf[pairdf.isanchor][['index0', 'index1']].values)


def intermediate_align(frame0, frame1, anchs, lookahead):
    """ Aligns the sentence ranges lying between consecutive anchors. """
    aligns = []
    end0, end1 = frame0.shape[0], frame1.shape[0]
    anchor_ranges = list(zip([(-1, -1)]+anchs, anchs+[(end0, end1)]))
    for rang in anchor_ranges:
        interaligns = get_interalign(frame0, frame1, *rang, lookahead)
        a, b = rang[0]
        aligns.append(((a, b), (a, b)))
        aligns.extend(interaligns)
    return aligns[1:]  # format: [((start0, start1), (end0, end1))]


def get_interalign(df0, df1, anchors_init, anchors_next, lookahead):
    """ Greedily aligns sentence groups between two anchors, trying up to
    `lookahead` sentences on either side at each step. """
    print(anchors_init, anchors_next)
    interaligns = []
    i, j = anchors_init
    i += 1
    j += 1
    end0, end1 = anchors_next
    while i < end0 and j < end1:
        room0, room1 = min(end0-i, lookahead), min(end1-j, lookahead)
        lambdascore = lambda p, q: score(df0, df1, i, j, p, q)
        # candidate group ends: the group may grow in one text at a time only
        i_, j_ = min([(x, y) for x, y in cp(range(i, i+room0), range(j, j+room1)) if x == i or y == j], key=lambda a: lambdascore(*a))
        print((i, j), (i_, j_))
        interaligns.append(((i, j), (i_, j_)))
        i, j = i_+1, j_+1
    return interaligns


def score(frame0, frame1, start0, start1, end0, end1):
    """ Alignment cost of the sentence groups [start0:end0] and [start1:end1]:
    mean cross-translation distance, penalized by the length ratio. """
    s0 = ' '.join(frame0.loc[start0:end0, 'sent0'])
    s1 = ' '.join(frame1.loc[start1:end1, 'sent1'])
    t0 = ' '.join(frame0.loc[start0:end0, 'trans0'])
    t1 = ' '.join(frame1.loc[start1:end1, 'trans1'])
    l0 = sum(frame0.loc[start0:end0, 'rellen0'])
    l1 = sum(frame1.loc[start1:end1, 'rellen1'])
    return (trdist(s0, t1)+trdist(s1, t0))*gr1(l0/l1)/2


def textdicts_from_alignments(frame0, frame1, aligns):
    """ Joins the aligned sentence groups back into two parallel text dicts. """
    textdict0, textdict1 = {}, {}
    for i, ((a0, a1), (b0, b1)) in enumerate(aligns):
        t0 = ' '.join(frame0.loc[a0:b0, 'sent0'])
        t1 = ' '.join(frame1.loc[a1:b1, 'sent1'])
        print('***************************')
        print(aligns[i])
        print(t0)
        print(t1)
        textdict0.update({i: t0})
        textdict1.update({i: t1})
    return textdict0, textdict1


def generate_pairdf(frame0, frame1, window):
    """ Generates candidate sentence pairs: pairs whose relative positions
    overlap, padded by `window` sentences in every direction. """
    ranges0 = frame0.relpos0
    ranges1 = frame1.relpos1
    overlap = [(i, j) for (i, (a, b)), (j, (c, d)) in cp(enumerate(ranges0), enumerate(ranges1)) if get_overlap(a, b, c, d) > 0]
    len0 = frame0.shape[0]
    len1 = frame1.shape[0]
    allpairs = []
    for i, j in overlap:
        for k in range(-window, window+1):
            for l in range(-window, window+1):
                allpairs.append((i+k, j+l))
    allpairs = [(a, b) for a, b in allpairs if min(a, b) > -1 and a < len0 and b < len1]
    allpairs = sorted(set(allpairs))
    pairdf = pd.DataFrame(allpairs).rename(columns={0: 'index0', 1: 'index1'})
    return pairdf


def get_overlap(a, b, c, d):
    """ Length of the overlap between the intervals (a, b) and (c, d). """
    if b > c and b <= d:
        return b-max(a, c)
    elif a >= c and a < d:
        return min(b, d)-a
    elif c >= a and c < b:
        return d-max(a, c)
    else:
        return 0


gr1 = lambda x: 1/less1(x)                           # gr1(x) = max(x, 1/x)
less1 = lambda x: 1/x if abs(x) > 1 else x
trdist = lambda x, y: lev(x, y)/max(len(x), len(y))  # normalized Levenshtein
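A minimal usage sketch for the aligner above, assuming GOOGLE_APPLICATION_CREDENTIALS is set so that translate.Client() can authenticate; the texts and language codes below are invented:

# Hypothetical example (not part of the commit): align a short English/German pair.
text_en = "The cat sat on the mat. It was warm. The dog slept."
text_de = "Die Katze sass auf der Matte. Es war warm. Der Hund schlief."
d0, d1 = master_align(text_en, text_de, 'en', 'de')
for k in sorted(d0):
    print(d0[k], '<->', d1[k])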
@@ -0,0 +1,271 @@
import time
import db.mysql_connection as mysql
import db.constants as const


def add_book_to_db(book_code, book_dict):
    # print('Adding Book Code :: ', book_code, ' Dict :: ', book_dict)

    conn = mysql.get_new_mysql_connection(const.DB_CONFIG_FILE)
    if conn is None:
        return False

    db_cursor = conn.cursor(buffered=True)

    # add book data to the table first
    book_row = {
        'code': book_code,
        'added_at': int(time.time())
    }

    # returns the last row id, if row added to the table successfully
    last_rowid = add_book_row_to_table(db_cursor, const.BOOK_INSERT_QUERY, book_row)
    book_id = last_rowid
    print('Book Row Id :: ', last_rowid)

    book_info_dict = book_dict['bookInfo']
    if last_rowid > 0:
        book_info_row = {
            'title': book_info_dict['title'],
            'description': book_info_dict['description'] if 'description' in book_info_dict else None,
            'lang': book_info_dict['lang'],
            'source': book_info_dict['source'],
            'is_translation': 'true' == book_info_dict['isTranslation'].lower(),
            'total_chapters': book_info_dict['totalChapters'],
            'isbn': book_info_dict['isbn'] if 'isbn' in book_info_dict else None,
            'book': book_id
        }

        # returns the last row id, if row added to the table successfully
        last_rowid = add_book_info_row_to_table(db_cursor, const.BOOK_INFO_INSERT_QUERY, book_info_row)
        print('Book Info Row Id :: ', last_rowid)

    if last_rowid > 0:
        book_info_id = last_rowid
        authors_list = book_info_dict['authors']
        for author in authors_list:
            author_row = {
                'id': -1,
                'name': author['name'].strip().lower(),
                'total_books': 1
            }
            author_row = search_author(db_cursor, const.AUTHOR_SEARCH_QUERY, author_row)
            print('Author Search Result :: ', author_row)
            if author_row['id'] > 0:
                author_row['total_books'] = author_row['total_books'] + 1
                last_rowid = update_author_book_count(db_cursor, const.AUTHOR_UPDATE_QUERY, author_row)
                print('Author Update Row count :: ', last_rowid)
                if last_rowid <= 0:
                    break
            else:
                author_row['name'] = author['name']
                author_row['total_books'] = 1
                last_rowid = add_author_to_table(db_cursor, const.AUTHOR_INSERT_QUERY, author_row)
                print('Add Author Row Id :: ', last_rowid)
                if last_rowid > 0:
                    author_row['id'] = last_rowid

            if author_row['id'] > 0:
                author_is_translator = False
                if 'translator' in author:
                    author_is_translator = 'true' == author['translator'].lower()
                map_author_book = {
                    'author': author_row['id'],
                    'book': book_info_id,
                    'translator': author_is_translator
                }

                last_rowid = add_author_book_mapping(db_cursor, const.BOOK_AUTHOR_INSERT_QUERY, map_author_book)
                print('Author Book Mapping Row ID :: ', last_rowid)
                if last_rowid < 0:
                    break

    if last_rowid > 0:
        book_content_row = {
            'book': book_id
        }

        # returns the last row id, if row added to the table successfully
        last_rowid = add_book_content_row_to_table(db_cursor, const.CONTENT_INSERT_QUERY, book_content_row)
        print('Book Content Row Id :: ', last_rowid)

    if last_rowid > 0:
        content_id = last_rowid
        book_chapters_list = book_dict['content']['chapters']
        for chapter in book_chapters_list:
            book_chapter_row = {
                'c_num': chapter['num'],
                'name': chapter['name'] if 'name' in chapter else None,
                'book_content': content_id
            }
            chapter_id = add_book_chapter_to_table(db_cursor, const.CHAPTER_INSERT_QUERY, book_chapter_row)
            print('Book Chapter Row Id :: ', chapter_id)
            if chapter_id > 0:
                sentences_dict = chapter['sentences']
                for s_num in sentences_dict.keys():
                    sentence_row = {
                        's_num': s_num,
                        'text': sentences_dict[s_num],
                        'chapter': chapter_id
                    }
                    sen_id = add_book_sentence_to_table(db_cursor, const.SENTENCE_INSERT_QUERY, sentence_row)
                    print('Book Sentence Id :: ', sen_id)
                    if sen_id <= 0:
                        break
                    else:
                        last_rowid = sen_id
            else:
                break

    db_cursor.close()

    is_success = False
    if last_rowid > 0:
        conn.commit()
        is_success = True
    else:
        conn.rollback()
        is_success = False

    conn.close()

    return is_success


def add_book_row_to_table(db_cursor, book_insert_query, book_row):
    try:
        # Insert this Book row to Table
        db_cursor.execute(book_insert_query, book_row)
        book_id = db_cursor.lastrowid
        if book_id is not None:
            return book_id
        else:
            return -1

    except Exception as e:
        print(str(e))
        return -1


def add_book_info_row_to_table(db_cursor, book_info_insert_query, book_info_row):
    try:
        # Insert this BookInfo row
        db_cursor.execute(book_info_insert_query, book_info_row)
        book_info_id = db_cursor.lastrowid
        if book_info_id is not None:
            return book_info_id
        else:
            return -1

    except Exception as e:
        print(str(e))
        return -1


def add_book_content_row_to_table(db_cursor, book_content_insert_query, book_content_row):
    try:
        # Insert Book Content row
        db_cursor.execute(book_content_insert_query, book_content_row)
        book_content_id = db_cursor.lastrowid
        if book_content_id is not None:
            return book_content_id
        else:
            return -1

    except Exception as e:
        print(str(e))
        return -1


def add_book_chapter_to_table(db_cursor, book_chapter_insert_query, book_chapter_row):
    try:
        # Insert Book chapter row
        db_cursor.execute(book_chapter_insert_query, book_chapter_row)
        book_chapter_id = db_cursor.lastrowid
        if book_chapter_id is not None:
            return book_chapter_id
        else:
            return -1

    except Exception as e:
        print(str(e))
        return -1


def add_book_sentence_to_table(db_cursor, book_sentence_insert_query, book_sentence):
    try:
        # Insert sentence
        db_cursor.execute(book_sentence_insert_query, book_sentence)
        book_sen_id = db_cursor.lastrowid
        if book_sen_id is not None:
            return book_sen_id
        else:
            return -1

    except Exception as e:
        print(str(e))
        return -1


def add_author_to_table(db_cursor, author_insert_query, author_data):
    try:
        # Insert Author
        db_cursor.execute(author_insert_query, author_data)
        author_id = db_cursor.lastrowid
        if author_id is not None:
            return author_id
        else:
            return -1

    except Exception as e:
        print(str(e))
        return -1


def add_author_book_mapping(db_cursor, book_author_insert_query, book_author_data):
    try:
        # Insert Book Author Mapping
        db_cursor.execute(book_author_insert_query, book_author_data)
        map_id = db_cursor.rowcount
        if map_id > 0:
            return map_id
        else:
            return -1

    except Exception as e:
        print(str(e))
        return -1


def search_author(db_cursor, author_search_query, author_data):
    try:
        # Search Author
        db_cursor.execute(author_search_query, author_data)
        row = db_cursor.fetchone()
        if row is not None:
            author_data['id'] = int(row[0])
            author_data['total_books'] = int(row[2])
            return author_data
        else:
            author_data['id'] = -1
            return author_data

    except Exception as e:
        print(str(e))
        author_data['id'] = -1
        return author_data


def update_author_book_count(db_cursor, author_update_query, author_data):
    try:
        # Update Author Book Count
        db_cursor.execute(author_update_query, author_data)
        row_cnt = db_cursor.rowcount
        if row_cnt > 0:
            return row_cnt
        else:
            return -1

    except Exception as e:
        print(str(e))
        return -1
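For reference, the book_dict shape that add_book_to_db consumes, reconstructed from the key accesses above; every key appears in the code, the concrete values are invented:

# Hypothetical book_dict accepted by add_book_to_db.
example_book_dict = {
    'bookInfo': {
        'title': 'Example Title',
        'lang': 'en',
        'source': 'example.org',
        'isTranslation': 'false',
        'totalChapters': 1,
        'authors': [{'name': 'Jane Doe', 'translator': 'false'}],
        # 'description' and 'isbn' are optional
    },
    'content': {
        'chapters': [
            {'num': 1, 'name': 'Chapter One',
             'sentences': {'1': 'First sentence.', '2': 'Second sentence.'}}
        ]
    }
}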
@@ -0,0 +1,26 @@
DB_CONFIG_FILE = 'db_config.ini'

BOOK_INSERT_QUERY = "INSERT INTO dim_book (code, added_at) " \
                    "VALUES (%(code)s, %(added_at)s)"

AUTHOR_INSERT_QUERY = "INSERT INTO dim_author (name, total_books) " \
                      "VALUES (%(name)s, %(total_books)s)"

BOOK_INFO_INSERT_QUERY = "INSERT INTO dim_book_info (title, description, lang, source, is_translation, " \
                         "total_chapters, isbn, book) " \
                         "VALUES (%(title)s, %(description)s, %(lang)s, %(source)s, %(is_translation)s, " \
                         "%(total_chapters)s, %(isbn)s, %(book)s) "

BOOK_AUTHOR_INSERT_QUERY = "INSERT INTO map_book_author (author, book, translator) " \
                           "VALUES (%(author)s, %(book)s, %(translator)s)"

CONTENT_INSERT_QUERY = "INSERT INTO dim_book_content (book) VALUES(%(book)s)"

CHAPTER_INSERT_QUERY = "INSERT INTO dim_book_chapter (c_num, name, book_content) " \
                       "VALUES (%(c_num)s, %(name)s, %(book_content)s)"

SENTENCE_INSERT_QUERY = "INSERT INTO dim_book_sentence (s_num, text, chapter) VALUES (%(s_num)s, %(text)s, %(chapter)s)"

AUTHOR_SEARCH_QUERY = "SELECT * FROM dim_author WHERE dim_author.name = %(name)s"

AUTHOR_UPDATE_QUERY = "UPDATE dim_author SET dim_author.total_books = %(total_books)s WHERE id = %(id)s"
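These queries use mysql.connector's pyformat placeholders, so each execute() call binds a plain dict whose keys match the %(name)s markers; a sketch with an invented row:

# Hypothetical binding example for BOOK_INSERT_QUERY (db_cursor as in add_book.py).
book_row = {'code': 'abcdef', 'added_at': 1234567890}
db_cursor.execute(BOOK_INSERT_QUERY, book_row)
print(db_cursor.lastrowid)  # auto-increment id of the new dim_book row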
@@ -0,0 +1,32 @@
import mysql.connector
from mysql.connector import errorcode
import db.read_config as config
import utils.constants as const
import os


def get_new_mysql_connection(config_file_name):

    config_file_path = os.path.dirname(os.path.dirname(__file__))+'/'+config_file_name
    db_config = config.read_db_config(config_file_path, 'mysql')

    connection = None

    try:
        connection = mysql.connector.connect(**db_config)
    except mysql.connector.Error as err:
        if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
            print(const.WARNING, 'Invalid Database User and Password', const.END)
        elif err.errno == errorcode.ER_BAD_DB_ERROR:
            print(const.WARNING, 'Database doesn\'t exist ', const.END)
        else:
            print(err)

    if connection is not None:
        if connection.is_connected():
            connection.autocommit = False
            print(const.GREEN, 'MySQL Connection Successful => Connection ID :: ', connection.connection_id, const.END)
        else:
            connection = None

    return connection
@@ -0,0 +1,33 @@
from configparser import ConfigParser
import os


def read_db_config(filename, section):
    """ Read database configuration file and return a dictionary object
    :param filename: name of the configuration file
    :param section: section of database configuration
    :return: a dictionary of database parameters
    """

    parser = ConfigParser()
    parser.read(filename)

    db = {}
    if parser.has_section(section):
        items = parser.items(section)
        for item in items:
            db[item[0]] = item[1]
    else:
        raise Exception('{0} not found in the {1} file'.format(section, filename))

    # the config file stores environment-variable names, not the secrets themselves
    try:
        db['password'] = os.environ[db['password']]
    except KeyError:
        print('Please set the Environment Variable ', db['password'])

    try:
        db['host'] = os.environ[db['host']]
    except KeyError:
        print('Please set the Environment Variable ', db['host'])

    return db
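With the db_config.ini added later in this commit, read_db_config returns roughly the dict sketched below; host and password are looked up in the environment, and ConfigParser keeps every value as a string (a sketch, assuming the file is reachable from the working directory):

from db.read_config import read_db_config

cfg = read_db_config('db_config.ini', 'mysql')
# cfg is now roughly:
# {'host': '<value of $MYSQL_HOST>', 'port': '3306',
#  'database': 'bitext-aligner', 'user': 'root',
#  'password': '<value of $MYSQL_PASSWORD>'}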
@@ -0,0 +1,10 @@
import db.mysql_connection as connection
import db.constants as const

conn = connection.get_new_mysql_connection(const.DB_CONFIG_FILE)

print('MySQL Server version :: ', conn.get_server_info())
print('isConnected :: ', conn.is_connected())

conn.close()
@@ -0,0 +1,6 @@
[mysql]
host = MYSQL_HOST
port = 3306
database = bitext-aligner
user = root
password = MYSQL_PASSWORD
Binary file not shown.
@@ -0,0 +1,38 @@
import xml_parser.read_xml as read_xml
import db.add_book as adb
import xml_parser.validate as validate
import utils.json_utils as json_utils
import utils.constants as const
import utils.env_utils as env


def validate_all_xml_files():
    validate.validate_all_xml_files()


def save_validated_files_to_db():
    json_data = json_utils.read_json_file(const.JSON_PATH)
    books_json = json_data['books']
    for book_code in books_json.keys():
        books_list = books_json[book_code]
        for book in books_list:
            if not book['is_validated']:
                print(const.WARNING, 'Book : ', book['xml_file'], ' is not validated against XSD', const.END)
                continue
            if not book['is_saved_to_db']:
                print(const.BLUE, 'Adding Book : ', book['xml_file'], ' to the DB', const.END)
                book_dict = read_xml.parse_xml_file(book['xml_file_path'])
                result = adb.add_book_to_db(book_code, book_dict)
                book['is_saved_to_db'] = result
                w_str = const.WARNING
                if result:
                    w_str = const.BLUE
                print(w_str, 'Result :: ', result, const.END, '\n')

    json_data['books'] = books_json
    json_utils.write_json_file(const.JSON_PATH, json_data)


if env.check_env_variables():
    validate_all_xml_files()
    save_validated_files_to_db()
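Both this script and xml_parser.validate iterate over json/books.json; its structure, inferred from the key accesses in this commit (the book code and file names below are illustrative):

# Hypothetical json/books.json contents:
# {
#     "books": {
#         "abcdef": [
#             {
#                 "xml_file": "abcdef_en.xml",
#                 "xml_file_path": "xml_files/abcdef_en.xml",
#                 "is_validated": true,
#                 "is_saved_to_db": false
#             }
#         ]
#     }
# }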
@@ -0,0 +1,15 @@
JSON_PATH = 'json/books.json'

XSD_PATH = 'xml_files/book.xsd'

TRANSLATE_ENV_VAR = 'GOOGLE_APPLICATION_CREDENTIALS'

MYSQL_PASS_ENV_VAR = 'MYSQL_PASSWORD'

MYSQL_HOST_ENV_VAR = 'MYSQL_HOST'

# ANSI terminal color codes
WARNING = '\033[91m'
END = '\033[0m'

BLUE = '\033[94m'
GREEN = '\033[92m'
@@ -0,0 +1,15 @@
import os
import utils.constants as const


def check_env_variables():
    if const.TRANSLATE_ENV_VAR not in os.environ:
        print(const.WARNING, 'Please set the ', const.TRANSLATE_ENV_VAR, ' Environment Variable to continue....', const.END)
        return False
    if const.MYSQL_PASS_ENV_VAR not in os.environ:
        print(const.WARNING, 'Please set the ', const.MYSQL_PASS_ENV_VAR, ' Environment Variable to continue....', const.END)
        return False
    if const.MYSQL_HOST_ENV_VAR not in os.environ:
        print(const.WARNING, 'Please set the ', const.MYSQL_HOST_ENV_VAR, ' Environment Variable to continue....', const.END)
        return False
    return True
@@ -0,0 +1,19 @@
import json
import os


def read_json_file(file_path):
    json_file_path = os.path.dirname(os.path.dirname(__file__))+'/'+file_path

    # the with-block closes the file on exit
    with open(json_file_path, 'r') as json_file:
        json_data = json.load(json_file)
    return json_data


def write_json_file(file_path, json_data):
    json_file_path = os.path.dirname(os.path.dirname(__file__))+'/'+file_path

    with open(json_file_path, 'w') as updated_json:
        updated_json.write(json.dumps(json_data, indent=4))
@@ -0,0 +1,67 @@
<?xml version="1.0" encoding="UTF-8" ?>
<xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema">

    <xsd:element name="book">
        <xsd:complexType>
            <xsd:sequence>
                <xsd:element ref="bookInfo" minOccurs="1" maxOccurs="1" />
                <xsd:element ref="content" minOccurs="1" maxOccurs="1" />
            </xsd:sequence>
            <xsd:attribute name="code" type="xsd:string" use="required" />
        </xsd:complexType>
    </xsd:element>

    <xsd:element name="bookInfo">
        <xsd:complexType>
            <xsd:sequence>
                <xsd:element name="title" type="xsd:string" minOccurs="1" maxOccurs="1" />
                <xsd:element name="lang" type="xsd:string" minOccurs="1" maxOccurs="1" />
                <xsd:element name="isTranslation" type="xsd:boolean" minOccurs="1" maxOccurs="1" />
                <xsd:element name="totalChapters" type="xsd:nonNegativeInteger" minOccurs="1" maxOccurs="1" />
                <xsd:element name="source" type="xsd:string" minOccurs="1" maxOccurs="1" />
                <xsd:element name="description" type="xsd:string" minOccurs="0" maxOccurs="1" />
                <xsd:element name="isbn" type="xsd:string" minOccurs="0" maxOccurs="1" />
                <xsd:element ref="author" minOccurs="1" maxOccurs="unbounded" />
            </xsd:sequence>
        </xsd:complexType>
    </xsd:element>

    <xsd:element name="author">
        <xsd:complexType>
            <xsd:simpleContent>
                <xsd:extension base="xsd:string">
                    <xsd:attribute name="translator" type="xsd:boolean" use="optional" />
                </xsd:extension>
            </xsd:simpleContent>
        </xsd:complexType>
    </xsd:element>

    <xsd:element name="content">
        <xsd:complexType>
            <xsd:sequence>
                <xsd:element ref="chapter" minOccurs="1" maxOccurs="unbounded" />
            </xsd:sequence>
        </xsd:complexType>
    </xsd:element>

    <xsd:element name="chapter">
        <xsd:complexType>
            <xsd:sequence>
                <xsd:element ref="sentence" minOccurs="1" maxOccurs="unbounded" />
            </xsd:sequence>
            <xsd:attribute name="num" type="xsd:nonNegativeInteger" use="required" />
            <xsd:attribute name="name" type="xsd:string" use="optional" />
        </xsd:complexType>
    </xsd:element>

    <xsd:element name="sentence">
        <xsd:complexType>
            <xsd:simpleContent>
                <xsd:extension base="xsd:string">
                    <xsd:attribute name="num" type="xsd:nonNegativeInteger" use="required" />
                </xsd:extension>
            </xsd:simpleContent>
        </xsd:complexType>
    </xsd:element>

</xsd:schema>
@@ -1,12 +1,16 @@
 from pathlib import Path
 import json
+import utils.constants as const
+import os


-json_file_path = Path('json/books.json')
+json_path = os.path.dirname(os.path.dirname(__file__))+'/'+const.JSON_PATH
+json_file_path = Path(json_path)
+

-json_data = {'books': []}
+json_data = {'books': {}}
 if not json_file_path.is_file():
     json_file = open(json_file_path, 'w')
     json_file.write(json.dumps(json_data, indent=4))
     json_file.close()
-    print('JSON File Created :: '+json_file.name)
+    print(const.BLUE, 'JSON File Created :: '+json_file.name, const.END)
@@ -0,0 +1,42 @@
import xml.etree.ElementTree as ET


def parse_xml_file(full_path):

    book_dict = {}

    tree = ET.parse(full_path)
    book_root = tree.getroot()
    # print('Root Element :: ', book_root.tag, ' | Attributes :: ', book_root.attrib)
    book_dict['code'] = book_root.attrib['code']

    book_info_dict = {}
    book_content_dict = {}
    book_info_element = book_root.find('bookInfo')
    book_content_element = book_root.find('content')

    book_info_dict['authors'] = []
    for child in book_info_element:
        if 'author' == child.tag:
            author = {'name': child.text}
            if 'translator' in child.attrib:
                author['translator'] = child.attrib['translator']
            book_info_dict['authors'].append(author)
        else:
            book_info_dict[child.tag] = child.text

    book_dict['bookInfo'] = book_info_dict

    book_content_dict['chapters'] = []
    for chapter in book_content_element:
        chapter_dict = {'num': chapter.attrib['num']}
        if 'name' in chapter.attrib:
            chapter_dict['name'] = chapter.attrib['name']
        chapter_dict['sentences'] = {}
        for sentence in chapter.findall('sentence'):
            chapter_dict['sentences'][sentence.attrib['num']] = sentence.text
        book_content_dict['chapters'].append(chapter_dict)

    book_dict['content'] = book_content_dict

    return book_dict
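A self-contained sketch of what this parser sees, using an in-memory document that follows book.xsd; the book content is invented, and ET.fromstring stands in for the ET.parse call that parse_xml_file makes on a file path:

import xml.etree.ElementTree as ET

sample = '''<book code="abcdef">
  <bookInfo>
    <title>Example Title</title>
    <lang>en</lang>
    <isTranslation>false</isTranslation>
    <totalChapters>1</totalChapters>
    <source>example.org</source>
    <author translator="false">Jane Doe</author>
  </bookInfo>
  <content>
    <chapter num="1" name="Chapter One">
      <sentence num="1">First sentence.</sentence>
    </chapter>
  </content>
</book>'''

book_root = ET.fromstring(sample)
print(book_root.attrib['code'])                           # 'abcdef'
print(book_root.find('bookInfo').find('title').text)      # 'Example Title'
print(book_root.find('content').find('chapter').attrib)   # {'num': '1', 'name': 'Chapter One'}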
@@ -1,6 +0,0 @@
-from csv2df import get_book_content, get_book_metadata
-
-from xml_parser.create_xml import create_xml_file
-
-create_xml_file(get_book_content(), get_book_metadata())
-
@@ -0,0 +1,14 @@
from csv2df import get_book_content, get_book_metadata
import xml_parser.create_xml as create_xml
import xml_parser.read_xml as read_xml
import xml_parser.validate as validate


file_path = create_xml.create_xml_file(get_book_content(), get_book_metadata())

# print(file_path)

validate.validate_all_xml_files()

# book_dict = read_xml.parse_xml_file('/Users/pavanmandava/PythonWorkspace/bitext-aligner/xml_files/abcdef_en.xml')
@@ -0,0 +1,38 @@
import xmlschema
import json
import utils.json_utils as json_utils
import utils.constants as const
import os


def is_valid(book_schema, xml_path):
    return book_schema.is_valid(xml_path)


def get_book_schema(book_xsd_path):
    xsd_full_path = os.path.dirname(os.path.dirname(__file__))+'/'+book_xsd_path
    book_schema = xmlschema.XMLSchema(xsd_full_path)
    return book_schema


def validate_all_xml_files():

    json_data = json_utils.read_json_file(const.JSON_PATH)

    book_schema = get_book_schema(const.XSD_PATH)

    books_json = json_data['books']
    for book_code in books_json.keys():
        books_list = books_json[book_code]
        for book in books_list:
            if book['is_validated']:
                print(const.BLUE, 'Book : ', book['xml_file'], ' is valid', const.END)
                continue
            else:
                if 'xml_file_path' in book:
                    result = book_schema.is_valid(book['xml_file_path'])
                    print('Validating Book : ', book['xml_file'], ' -> ', result)
                    book['is_validated'] = result

    json_data['books'] = books_json
    json_utils.write_json_file(const.JSON_PATH, json_data)