commit
1540b701a4
@ -0,0 +1,170 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
|
||||
import os,sys
|
||||
import re
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from numpy import cumsum
|
||||
from pandas import DataFrame
|
||||
from nltk import word_tokenize, sent_tokenize
|
||||
#import xml.etree.ElementTree as ET
|
||||
from jellyfish import levenshtein_distance as lev
|
||||
#import six
|
||||
from google.cloud import translate_v2 as translate
|
||||
from itertools import product as cp
|
||||
|
||||
# Module-level Google Cloud Translation client, shared by all translate calls below.
translate_client = translate.Client()
|
||||
|
||||
'''
|
||||
|
||||
'''
|
||||
|
||||
|
||||
|
||||
def master_align(text0, text1, lang0, lang1):
    """Takes two equivalent texts (original and translation) and returns
    aligned texts as two parallel dicts keyed by alignment index."""
    frame0 = frame_from_text(text0, lang0, lang1)
    print('A')
    frame1 = frame_from_text(text1, lang1, lang0, is1=True)
    print('B')
    # Each frame carries ['sent', 'trans', 'rellen', 'relpos'] columns (suffixed 0/1).
    anchor_pairs = anchors_from_frames(frame0, frame1, window=2)
    print('C')
    alignments = intermediate_align(frame0, frame1, anchor_pairs, lookahead=4)
    print('D')
    textdict0, textdict1 = textdicts_from_alignments(frame0, frame1, alignments)
    print('E')
    return textdict0, textdict1
|
||||
|
||||
|
||||
def frame_from_text(text, source, target, is1=False):
    """Build a per-sentence DataFrame for one text.

    Columns (each suffixed '0' or '1' depending on *is1*):
      sent   - each sentence of *text* (NLTK sentence tokenizer)
      trans  - Google NMT translation of the sentence into *target*
      rellen - sentence length normalised by the longest sentence
      relpos - (start, end) cumulative relative position within the text

    NOTE(review): each sentence triggers one Translation API call.
    """
    #print(source, '-->', target)
    cols = [c+str(int(is1)) for c in ['sent','trans','rellen','relpos']]
    #print(cols)
    frame = pd.DataFrame(columns=cols)
    frame[cols[0]] = sent_tokenize(text)
    frame[cols[1]] = frame[cols[0]].apply(lambda x: translate_client.translate(x, source_language=source, target_language=target, model='nmt')['translatedText'])
    frame[cols[2]] = frame[cols[0]].apply(lambda x: len(x))
    # Normalise lengths by the longest sentence so rellen is in (0, 1].
    frame[cols[2]] = frame[cols[2]]/frame[cols[2]].max()
    # relpos[i] = (cumulative rellen before sentence i, cumulative rellen through i).
    cumul_b = list(np.cumsum(frame[cols[2]]))
    cumul_a = [0]+cumul_b[:-1]
    frame[cols[3]] = pd.Series(list(zip(cumul_a, cumul_b)))
    #print(frame[[cols[0], cols[1]]])
    return frame
|
||||
|
||||
|
||||
def anchors_from_frames(frame0, frame1, window):
    """Find high-confidence sentence pairs ("anchors") between the two frames.

    Candidate index pairs come from generate_pairdf; a pair is an anchor when
    both cross-translation edit distances are low and the relative-length
    ratio is close to 1. Returns a list of [index0, index1] arrays.
    """
    pairdf = generate_pairdf(frame0, frame1, window)
    frame0['index0'] = frame0.index
    frame1['index1'] = frame1.index
    pairdf = pairdf.merge(frame0, on='index0').merge(frame1, on='index1')
    # Cross distances: each sentence vs. the other text's machine translation.
    pairdf['lev0'] = pairdf.apply(lambda x: trdist(x.sent0, x.trans1), axis=1)
    pairdf['lev1'] = pairdf.apply(lambda x: trdist(x.sent1, x.trans0), axis=1)
    pairdf['rellen_ratio'] = (pairdf.rellen0/pairdf.rellen1).apply(gr1)
    pairdf['minlev'] = pairdf[['lev0', 'lev1']].min(axis=1)
    # BUG FIX: maxlev previously used .min(axis=1), which made it identical to
    # minlev and turned the maxlev<0.6 filter below into a no-op.
    pairdf['maxlev'] = pairdf[['lev0', 'lev1']].max(axis=1)
    pairdf['isanchor'] = (pairdf.minlev<0.45) & (pairdf.maxlev<0.6) & (pairdf.rellen_ratio<1.3)
    return list(pairdf[pairdf.isanchor][['index0','index1']].values)
|
||||
|
||||
|
||||
def intermediate_align(frame0, frame1, anchs, lookahead):
    """Produce full alignment ranges by filling the gaps between anchors.

    Returns a list of ((i_start, i_end), (j_start, j_end)) tuples.
    """
    total0, total1 = frame0.shape[0], frame1.shape[0]
    # Bracket the anchors with sentinels so the stretch before the first
    # anchor and after the last anchor are both walked.
    starts = [(-1, -1)] + anchs
    stops = anchs + [(total0, total1)]
    aligns = []
    for lo, hi in zip(starts, stops):
        gap_aligns = get_interalign(frame0, frame1, lo, hi, lookahead)
        a, b = lo
        # The anchor itself aligns one-to-one.
        aligns.append(((a, b), (a, b)))
        aligns.extend(gap_aligns)
    # Drop the leading (-1, -1) sentinel entry.
    return aligns[1:]
|
||||
|
||||
|
||||
def get_interalign(df0, df1, anchors_init, anchors_next, lookahead):
    """Greedily align the sentences strictly between two anchor pairs.

    anchors_init/anchors_next are (index0, index1) anchor pairs. The gap
    between them is aligned by repeatedly choosing, among candidate merges of
    up to *lookahead* sentences on ONE side, the pair with the lowest score().
    Returns a list of ((i_start, j_start), (i_end, j_end)) range pairs.
    """
    print(anchors_init, anchors_next)
    interaligns = []
    i,j = anchors_init
    # Start just after the opening anchor.
    i+=1
    j+=1
    end0, end1 = anchors_next
    while i<end0 and j<end1:
        # How far we may merge ahead on each side without crossing the next anchor.
        room0, room1 = min(end0-i,lookahead), min(end1-j,lookahead)
        lambdascore = lambda p,q: score(df0, df1, i, j, p, q)
        # Candidates extend the range on at most one side (x==i or y==j);
        # min() keeps the first candidate on ties, preserving greedy order.
        i_,j_ = min([(x,y) for x,y in cp(range(i,i+room0),range(j,j+room1)) if x==i or y==j], key=lambda a: lambdascore(*a))
        print((i,j), (i_,j_))
        interaligns.append(((i,j),(i_,j_)))
        i,j = i_+1,j_+1
    return interaligns
|
||||
|
||||
|
||||
def score(frame0, frame1, start0, start1, end0, end1):
    """Score a candidate alignment of frame0[start0:end0] vs frame1[start1:end1].

    Lower is better: average cross-translation edit distance of the joined
    ranges, weighted by how far their relative lengths diverge from 1.
    Note: .loc slices are inclusive of both endpoints.
    """
    joined_sent0 = ' '.join(frame0.loc[start0:end0, 'sent0'])
    joined_sent1 = ' '.join(frame1.loc[start1:end1, 'sent1'])
    joined_trans0 = ' '.join(frame0.loc[start0:end0, 'trans0'])
    joined_trans1 = ' '.join(frame1.loc[start1:end1, 'trans1'])
    total_len0 = sum(frame0.loc[start0:end0, 'rellen0'])
    total_len1 = sum(frame1.loc[start1:end1, 'rellen1'])
    return (trdist(joined_sent0, joined_trans1)+trdist(joined_sent1, joined_trans0))*gr1(total_len0/total_len1)/2
|
||||
|
||||
|
||||
|
||||
|
||||
def textdicts_from_alignments(frame0, frame1, aligns):
    """Join each aligned sentence range into text; return two parallel dicts
    keyed by alignment index (one per frame)."""
    textdict0 = {}
    textdict1 = {}
    for idx, ((a0, a1), (b0, b1)) in enumerate(aligns):
        # .loc slicing is endpoint-inclusive.
        joined0 = ' '.join(frame0.loc[a0:b0, 'sent0'])
        joined1 = ' '.join(frame1.loc[a1:b1, 'sent1'])
        print('***************************')
        print(aligns[idx])
        print(joined0)
        print(joined1)
        textdict0[idx] = joined0
        textdict1[idx] = joined1
    return textdict0, textdict1
|
||||
|
||||
|
||||
def generate_pairdf(frame0, frame1, window):
    """Build candidate (index0, index1) sentence pairs.

    Seeds with pairs whose relative-position ranges overlap, expands each
    seed by +/- *window* on both sides, clips to valid indices, dedupes, and
    returns a DataFrame with columns ['index0', 'index1'].
    """
    ranges0 = frame0.relpos0
    ranges1 = frame1.relpos1
    # Seed pairs: any two sentences whose relative-position spans overlap.
    overlap = [(i, j) for (i, (a, b)), (j, (c, d)) in cp(enumerate(ranges0), enumerate(ranges1)) if get_overlap(a, b, c, d) > 0]
    len0 = frame0.shape[0]
    len1 = frame1.shape[0]
    allpairs = []
    for i, j in overlap:
        for k in range(-window, window+1):
            for l in range(-window, window+1):
                allpairs.append((i+k, j+l))
    # Clip to valid index ranges and deduplicate.
    # (Removed a dead `pairdf = pd.DataFrame(columns=...)` initialisation that
    # was unconditionally overwritten below.)
    allpairs = [(a, b) for a, b in allpairs if min(a, b) > -1 and a < len0 and b < len1]
    allpairs = sorted(set(allpairs))
    pairdf = pd.DataFrame(allpairs).rename(columns={0: 'index0', 1: 'index1'})
    return pairdf
|
||||
|
||||
|
||||
def get_overlap(a, b, c, d):
    """Return the length of the overlap between intervals (a, b) and (c, d), or 0."""
    # End of (a, b) lands inside (c, d].
    if c < b <= d:
        return b - max(a, c)
    # Start of (a, b) lands inside [c, d).
    if c <= a < d:
        return min(b, d) - a
    # (c, d) starts inside [a, b).
    if a <= c < b:
        return d - max(a, c)
    return 0
|
||||
|
||||
|
||||
# PEP 8: assigned lambdas replaced with named defs (docstrings, better tracebacks).
def less1(x):
    """Fold x to magnitude <= 1: return 1/x when |x| > 1, else x unchanged."""
    return 1/x if abs(x) > 1 else x


def gr1(x):
    """Fold x to magnitude >= 1 (the reciprocal of less1)."""
    return 1/less1(x)


def trdist(x, y):
    """Levenshtein distance between x and y, normalised by the longer length."""
    return lev(x, y)/max(len(x), len(y))
|
||||
|
||||
|
||||
@ -0,0 +1,271 @@
|
||||
import time
|
||||
import db.mysql_connection as mysql
|
||||
import db.constants as const
|
||||
|
||||
|
||||
def add_book_to_db(book_code, book_dict):
    """Persist one parsed book (from read_xml.parse_xml_file) in a single
    MySQL transaction.

    Inserts, in order: the book row, its bookInfo row, authors (created or
    updated) plus book-author mappings, the content row, then chapters and
    sentences. `last_rowid` threads the success state through every step:
    any value <= 0 aborts the remaining steps and triggers a rollback.

    Returns True when everything was committed, False otherwise.
    """
    # print('Adding Book Code :: ', book_code, ' Dict :: ', book_dict)

    conn = mysql.get_new_mysql_connection(const.DB_CONFIG_FILE)
    if conn is None:
        return False

    db_cursor = conn.cursor(buffered=True)

    # add book data to the Table First
    book_row = {
        'code': book_code,
        'added_at': int(time.time())
    }

    # returns the last row id, if row added to the table successfully
    last_rowid = add_book_row_to_table(db_cursor, const.BOOK_INSERT_QUERY, book_row)
    book_id = last_rowid
    print('Book Row Id :: ', last_rowid)

    book_info_dict = book_dict['bookInfo']
    if last_rowid > 0:
        # Optional fields fall back to None (NULL in the DB).
        book_info_row = {
            'title': book_info_dict['title'],
            'description': book_info_dict['description'] if 'description' in book_info_dict else None,
            'lang': book_info_dict['lang'],
            'source': book_info_dict['source'],
            'is_translation': 'true' == book_info_dict['isTranslation'].lower(),
            'total_chapters': book_info_dict['totalChapters'],
            'isbn': book_info_dict['isbn'] if 'isbn' in book_info_dict else None,
            'book': book_id
        }

        # returns the last row id, if row added to the table successfully
        last_rowid = add_book_info_row_to_table(db_cursor, const.BOOK_INFO_INSERT_QUERY, book_info_row)
        print('Book Info Row Id :: ', last_rowid)

        if last_rowid > 0:
            book_info_id = last_rowid
            authors_list = book_info_dict['authors']
            for author in authors_list:
                # Search by normalised name; existing authors get their book
                # count bumped, new ones are inserted.
                author_row = {
                    'id': -1,
                    'name': author['name'].strip().lower(),
                    'total_books': 1
                }
                author_row = search_author(db_cursor, const.AUTHOR_SEARCH_QUERY, author_row)
                print('Author Search Result :: ', author_row)
                if author_row['id'] > 0:
                    author_row['total_books'] = author_row['total_books'] + 1
                    last_rowid = update_author_book_count(db_cursor, const.AUTHOR_UPDATE_QUERY, author_row)
                    print('Author Update Row count :: ', last_rowid)
                    if last_rowid <= 0:
                        # Update failed: abort author processing (rollback below).
                        break
                else:
                    # New author: store the original (un-normalised) name.
                    author_row['name'] = author['name']
                    author_row['total_books'] = 1
                    last_rowid = add_author_to_table(db_cursor, const.AUTHOR_INSERT_QUERY, author_row)
                    print('Add Author Row Id :: ', last_rowid)
                    if last_rowid > 0:
                        author_row['id'] = last_rowid

                if author_row['id'] > 0:
                    author_is_translator = False
                    if 'translator' in author:
                        author_is_translator = 'true' == author['translator'].lower()
                    map_author_book = {
                        'author': author_row['id'],
                        'book': book_info_id,
                        'translator': author_is_translator
                    }

                    last_rowid = add_author_book_mapping(db_cursor, const.BOOK_AUTHOR_INSERT_QUERY, map_author_book)
                    print('Author Book Mapping Row ID :: ', last_rowid)
                    if last_rowid < 0:
                        break

    if last_rowid > 0:
        book_content_row = {
            'book': book_id
        }

        # returns the last row id, if row added to the table successfully
        last_rowid = add_book_content_row_to_table(db_cursor, const.CONTENT_INSERT_QUERY, book_content_row)
        print('Book Content Row Id :: ', last_rowid)

        if last_rowid > 0:
            content_id = last_rowid
            book_chapters_list = book_dict['content']['chapters']
            for chapter in book_chapters_list:
                book_chapter_row = {
                    'c_num': chapter['num'],
                    'name': chapter['name'] if 'name' in chapter else None,
                    'book_content': content_id
                }
                chapter_id = add_book_chapter_to_table(db_cursor, const.CHAPTER_INSERT_QUERY, book_chapter_row)
                print('Book Chapter Row Id :: ', chapter_id)
                if chapter_id > 0:
                    sentences_dict = chapter['sentences']
                    for s_num in sentences_dict.keys():
                        sentence_row = {
                            's_num': s_num,
                            'text': sentences_dict[s_num],
                            'chapter': chapter_id
                        }
                        sen_id = add_book_sentence_to_table(db_cursor, const.SENTENCE_INSERT_QUERY, sentence_row)
                        print('Book Sentence Id :: ', sen_id)
                        if sen_id <= 0:
                            break
                        else:
                            # Keep last_rowid positive so the commit below fires.
                            last_rowid = sen_id
                else:
                    break

    db_cursor.close()

    # Commit only if every step above reported success; otherwise roll back
    # the whole transaction (autocommit is disabled on this connection).
    is_success = False
    if last_rowid > 0:
        conn.commit()
        is_success = True
    else:
        conn.rollback()
        is_success = False

    conn.close()

    return is_success
|
||||
|
||||
|
||||
def add_book_row_to_table(db_cursor, book_insert_query, book_row):
    """Insert one row into the book table; return its row id, or -1 on failure."""
    try:
        db_cursor.execute(book_insert_query, book_row)
        new_id = db_cursor.lastrowid
        return new_id if new_id is not None else -1
    except Exception as e:
        print(str(e))
        return -1
|
||||
|
||||
|
||||
def add_book_info_row_to_table(db_cursor, book_info_insert_query, book_info_row):
    """Insert one BookInfo row; return its row id, or -1 on failure."""
    try:
        db_cursor.execute(book_info_insert_query, book_info_row)
        new_id = db_cursor.lastrowid
        return new_id if new_id is not None else -1
    except Exception as e:
        print(str(e))
        return -1
|
||||
|
||||
|
||||
def add_book_content_row_to_table(db_cursor, book_content_insert_query, book_content_row):
    """Insert one book-content row; return its row id, or -1 on failure."""
    try:
        db_cursor.execute(book_content_insert_query, book_content_row)
        new_id = db_cursor.lastrowid
        return new_id if new_id is not None else -1
    except Exception as e:
        print(str(e))
        return -1
|
||||
|
||||
|
||||
def add_book_chapter_to_table(db_cursor, book_chapter_insert_query, book_chapter_row):
    """Insert one chapter row; return its row id, or -1 on failure."""
    try:
        db_cursor.execute(book_chapter_insert_query, book_chapter_row)
        new_id = db_cursor.lastrowid
        return new_id if new_id is not None else -1
    except Exception as e:
        print(str(e))
        return -1
|
||||
|
||||
|
||||
def add_book_sentence_to_table(db_cursor, book_sentence_insert_query, book_sentence):
    """Insert one sentence row; return its row id, or -1 on failure."""
    try:
        db_cursor.execute(book_sentence_insert_query, book_sentence)
        new_id = db_cursor.lastrowid
        return new_id if new_id is not None else -1
    except Exception as e:
        print(str(e))
        return -1
|
||||
|
||||
|
||||
def add_author_to_table(db_cursor, author_insert_query, author_data):
    """Insert one author row; return its row id, or -1 on failure."""
    try:
        db_cursor.execute(author_insert_query, author_data)
        new_id = db_cursor.lastrowid
        return new_id if new_id is not None else -1
    except Exception as e:
        print(str(e))
        return -1
|
||||
|
||||
|
||||
def add_author_book_mapping(db_cursor, book_author_insert_query, book_author_data):
    """Insert one author-book mapping; return the affected row count, or -1.

    Uses rowcount (not lastrowid) because the mapping table's identity is the
    (author, book) pair, per the visible insert query.
    """
    try:
        db_cursor.execute(book_author_insert_query, book_author_data)
        affected = db_cursor.rowcount
        return affected if affected > 0 else -1
    except Exception as e:
        print(str(e))
        return -1
|
||||
|
||||
|
||||
def search_author(db_cursor, author_search_query, author_data):
    """Look up an author by name and fill id/total_books into author_data.

    Returns the (mutated) author_data dict; on no match or any error its
    'id' is set to -1.
    """
    try:
        db_cursor.execute(author_search_query, author_data)
        row = db_cursor.fetchone()
        if row is None:
            author_data['id'] = -1
        else:
            # Row layout per AUTHOR_SEARCH_QUERY (SELECT *): id, name, total_books.
            author_data['id'] = int(row[0])
            author_data['total_books'] = int(row[2])
        return author_data
    except Exception as e:
        print(str(e))
        author_data['id'] = -1
        return author_data
|
||||
|
||||
|
||||
def update_author_book_count(db_cursor, author_update_query, author_data):
    """Update an existing author's book count; return affected rows, or -1."""
    try:
        db_cursor.execute(author_update_query, author_data)
        affected = db_cursor.rowcount
        return affected if affected > 0 else -1
    except Exception as e:
        print(str(e))
        return -1
|
||||
@ -0,0 +1,26 @@
|
||||
# Name of the INI file holding the MySQL connection settings
# (resolved relative to the package root by mysql_connection).
DB_CONFIG_FILE = 'db_config.ini'

# Parameterized SQL statements using mysql-connector named-placeholder style
# (%(key)s), executed with dict parameters by the add_book helpers.
BOOK_INSERT_QUERY = "INSERT INTO dim_book (code, added_at) " \
                    "VALUES (%(code)s, %(added_at)s)"

AUTHOR_INSERT_QUERY = "INSERT INTO dim_author (name, total_books) " \
                      "VALUES (%(name)s, %(total_books)s)"

BOOK_INFO_INSERT_QUERY = "INSERT INTO dim_book_info (title, description, lang, source, is_translation, " \
                         "total_chapters, isbn, book) " \
                         "VALUES (%(title)s, %(description)s, %(lang)s, %(source)s, %(is_translation)s, " \
                         "%(total_chapters)s, %(isbn)s, %(book)s) "

BOOK_AUTHOR_INSERT_QUERY = "INSERT INTO map_book_author (author, book, translator) " \
                           "VALUES (%(author)s, %(book)s, %(translator)s)"

CONTENT_INSERT_QUERY = "INSERT INTO dim_book_content (book) VALUES(%(book)s)"

CHAPTER_INSERT_QUERY = "INSERT INTO dim_book_chapter (c_num, name, book_content) " \
                       "VALUES (%(c_num)s, %(name)s, %(book_content)s)"

SENTENCE_INSERT_QUERY = "INSERT INTO dim_book_sentence (s_num, text, chapter) VALUES (%(s_num)s, %(text)s, %(chapter)s)"

AUTHOR_SEARCH_QUERY = "SELECT * FROM dim_author WHERE dim_author.name = %(name)s"

AUTHOR_UPDATE_QUERY = "UPDATE dim_author SET dim_author.total_books = %(total_books)s WHERE id = %(id)s"
|
||||
@ -0,0 +1,32 @@
|
||||
import mysql.connector
|
||||
from mysql.connector import errorcode
|
||||
import db.read_config as config
|
||||
import utils.constants as const
|
||||
import os
|
||||
|
||||
|
||||
def get_new_mysql_connection(config_file_name):
    """Open a new MySQL connection using settings from *config_file_name*.

    The config file is resolved relative to the package root (parent of this
    module's directory). Returns a live connection with autocommit disabled,
    or None when the connection could not be established.
    """
    config_file_path = os.path.dirname(os.path.dirname(__file__))+'/'+config_file_name
    db_config = config.read_db_config(config_file_path, 'mysql')

    connection = None

    try:
        connection = mysql.connector.connect(**db_config)
    except mysql.connector.Error as err:
        # Map the common connector error codes to friendlier messages.
        if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
            print(const.WARNING, 'Invalid Database User and Password', const.END)
        elif err.errno == errorcode.ER_BAD_DB_ERROR:
            print(const.WARNING, 'Database doesn\'t exist ', const.END)
        else:
            print(err)

    if connection is not None:
        if connection.is_connected():
            # Callers manage transactions explicitly (commit/rollback).
            connection.autocommit = False
            print(const.GREEN, 'MySQL Connection Successful => Connection ID :: ', connection.connection_id, const.END)
        else:
            connection = None

    return connection
|
||||
@ -0,0 +1,33 @@
|
||||
from configparser import ConfigParser
|
||||
import os
|
||||
|
||||
|
||||
def read_db_config(filename, section):
    """Read database configuration file and return a dictionary object

    :param filename: name of the configuration file
    :param section: section of database configuration
    :return: a dictionary of database parameters
    """
    parser = ConfigParser()
    parser.read(filename)

    if not parser.has_section(section):
        raise Exception('{0} not found in the {1} file'.format(section, filename))
    db = dict(parser.items(section))

    # The 'password' and 'host' entries name environment variables that hold
    # the real values; resolve each, warning when the variable is unset.
    for key in ('password', 'host'):
        try:
            db[key] = os.environ[db[key]]
        except KeyError:
            print('Please set the Environment Variable ', db[key])

    return db
|
||||
@ -0,0 +1,10 @@
|
||||
import db.mysql_connection as connection
|
||||
import db.constants as const
|
||||
|
||||
# Manual smoke test: open a connection, print server details, and close.
# NOTE(review): assumes the connection succeeds — conn would be None on
# failure and the calls below would raise.
conn = connection.get_new_mysql_connection(const.DB_CONFIG_FILE)

print('MySQL Server version :: ', conn.get_server_info())
print('isConnected :: ', conn.is_connected())

conn.close()
|
||||
|
||||
@ -0,0 +1,6 @@
|
||||
[mysql]
|
||||
host = MYSQL_HOST
|
||||
port = 3306
|
||||
database = bitext-aligner
|
||||
user = root
|
||||
password = MYSQL_PASSWORD
|
||||
Binary file not shown.
|
Before Width: | Height: | Size: 1.4 MiB After Width: | Height: | Size: 1.4 MiB |
@ -1,4 +1,8 @@
|
||||
google-cloud-translate==2.0.0
|
||||
google-cloud-storage==1.19.1
|
||||
mysql-connector-python==8.0.19
|
||||
pandas
|
||||
xmlschema
|
||||
numpy
|
||||
jellyfish
|
||||
nltk
|
||||
@ -0,0 +1,38 @@
|
||||
import xml_parser.read_xml as read_xml
|
||||
import db.add_book as adb
|
||||
import xml_parser.validate as validate
|
||||
import utils.json_utils as json_utils
|
||||
import utils.constants as const
|
||||
import utils.env_utils as env
|
||||
|
||||
|
||||
def validate_all_xml_files():
    """Validate every pending XML file against the XSD (thin delegate)."""
    validate.validate_all_xml_files()
|
||||
|
||||
|
||||
def save_validated_files_to_db():
    """Persist every validated-but-unsaved book from books.json to the DB.

    Skips books that failed XSD validation, saves the rest via
    add_book_to_db, and writes each save result back into the JSON
    bookkeeping file.
    """
    json_data = json_utils.read_json_file(const.JSON_PATH)
    books_json = json_data['books']
    for book_code in books_json.keys():
        books_list = books_json[book_code]
        for book in books_list:
            if not book['is_validated']:
                print(const.WARNING, 'Book : ', book['xml_file'], ' is not validated against XSD', const.END)
                continue
            if not book['is_saved_to_db']:
                print(const.BLUE, 'Adding Book : ', book['xml_file'], ' to the DB', const.END)
                book_dict = read_xml.parse_xml_file(book['xml_file_path'])
                result = adb.add_book_to_db(book_code, book_dict)
                book['is_saved_to_db'] = result
                # Colour the result line: blue on success, red warning otherwise.
                w_str = const.WARNING
                if result:
                    w_str = const.BLUE
                print(w_str, 'Result :: ', result, const.END, '\n')

    json_data['books'] = books_json
    json_utils.write_json_file(const.JSON_PATH, json_data)
|
||||
|
||||
|
||||
# Entry point: run the pipeline only when all required environment variables
# (Google credentials, MySQL host/password) are present.
if env.check_env_variables():
    validate_all_xml_files()
    save_validated_files_to_db()
|
||||
|
@ -0,0 +1,15 @@
|
||||
# Paths, relative to the project root.
JSON_PATH = 'json/books.json'

XSD_PATH = 'xml_files/book.xsd'

# Names of the required environment variables.
TRANSLATE_ENV_VAR = 'GOOGLE_APPLICATION_CREDENTIALS'

MYSQL_PASS_ENV_VAR = 'MYSQL_PASSWORD'

MYSQL_HOST_ENV_VAR = 'MYSQL_HOST'

# ANSI terminal colour codes used for console output.
WARNING = '\033[91m'
END = '\033[0m'

BLUE = '\033[94m'
GREEN = '\033[92m'
|
||||
@ -0,0 +1,15 @@
|
||||
import os
|
||||
import utils.constants as const
|
||||
|
||||
|
||||
def check_env_variables():
    """Verify that every required environment variable is set.

    Returns True when all are present; otherwise prints a warning for the
    first missing one and returns False. The three copy-pasted checks were
    collapsed into a loop (same messages, same short-circuit behaviour).
    """
    required = (const.TRANSLATE_ENV_VAR, const.MYSQL_PASS_ENV_VAR, const.MYSQL_HOST_ENV_VAR)
    for var in required:
        if var not in os.environ:
            print(const.WARNING, 'Please set the ', var, ' Environment Variable to continue....', const.END)
            return False
    return True
|
||||
@ -0,0 +1,19 @@
|
||||
import json
|
||||
import os
|
||||
|
||||
|
||||
def read_json_file(file_path):
    """Read a JSON file and return the parsed data.

    *file_path* is relative to the project root (the parent of this module's
    directory).
    """
    json_file_path = os.path.dirname(os.path.dirname(__file__))+'/'+file_path
    # 'with' closes the file on exit; the explicit close() inside the block
    # in the original was redundant.
    with open(json_file_path, 'r') as json_file:
        return json.load(json_file)
|
||||
|
||||
|
||||
def write_json_file(file_path, json_data):
    """Serialize *json_data* (indent=4) to a JSON file.

    *file_path* is relative to the project root (the parent of this module's
    directory). Overwrites any existing file.
    """
    json_file_path = os.path.dirname(os.path.dirname(__file__))+'/'+file_path
    # 'with' closes the file on exit; the explicit close() inside the block
    # in the original was redundant.
    with open(json_file_path, 'w') as updated_json:
        updated_json.write(json.dumps(json_data, indent=4))
|
||||
@ -0,0 +1,67 @@
|
||||
<?xml version="1.0" encoding="UTF-8" ?>
|
||||
<xsd:schema xmlns:xsd="http://www.w3.org/2001/XMLSchema">
|
||||
|
||||
<xsd:element name="book">
|
||||
<xsd:complexType>
|
||||
<xsd:sequence>
|
||||
<xsd:element ref="bookInfo" minOccurs="1" maxOccurs="1" />
|
||||
<xsd:element ref="content" minOccurs="1" maxOccurs="1" />
|
||||
</xsd:sequence>
|
||||
<xsd:attribute name="code" type="xsd:string" use="required" />
|
||||
</xsd:complexType>
|
||||
</xsd:element>
|
||||
|
||||
<xsd:element name="bookInfo">
|
||||
<xsd:complexType>
|
||||
<xsd:sequence>
|
||||
<xsd:element name="title" type="xsd:string" minOccurs="1" maxOccurs="1" />
|
||||
<xsd:element name="lang" type="xsd:string" minOccurs="1" maxOccurs="1" />
|
||||
<xsd:element name="isTranslation" type="xsd:boolean" minOccurs="1" maxOccurs="1" />
|
||||
<xsd:element name="totalChapters" type="xsd:nonNegativeInteger" minOccurs="1" maxOccurs="1" />
|
||||
<xsd:element name="source" type="xsd:string" minOccurs="1" maxOccurs="1" />
|
||||
<xsd:element name="description" type="xsd:string" minOccurs="0" maxOccurs="1" />
|
||||
<xsd:element name="isbn" type="xsd:string" minOccurs="0" maxOccurs="1" />
|
||||
<xsd:element ref="author" minOccurs="1" maxOccurs="unbounded" />
|
||||
</xsd:sequence>
|
||||
</xsd:complexType>
|
||||
</xsd:element>
|
||||
|
||||
<xsd:element name="author">
|
||||
<xsd:complexType>
|
||||
<xsd:simpleContent>
|
||||
<xsd:extension base="xsd:string">
|
||||
<xsd:attribute name="translator" type="xsd:boolean" use="optional" />
|
||||
</xsd:extension>
|
||||
</xsd:simpleContent>
|
||||
</xsd:complexType>
|
||||
</xsd:element>
|
||||
|
||||
<xsd:element name="content">
|
||||
<xsd:complexType>
|
||||
<xsd:sequence>
|
||||
<xsd:element ref="chapter" minOccurs="1" maxOccurs="unbounded" />
|
||||
</xsd:sequence>
|
||||
</xsd:complexType>
|
||||
</xsd:element>
|
||||
|
||||
<xsd:element name="chapter">
|
||||
<xsd:complexType>
|
||||
<xsd:sequence>
|
||||
<xsd:element ref="sentence" minOccurs="1" maxOccurs="unbounded" />
|
||||
</xsd:sequence>
|
||||
<xsd:attribute name="num" type="xsd:nonNegativeInteger" use="required" />
|
||||
<xsd:attribute name="name" type="xsd:string" use="optional" />
|
||||
</xsd:complexType>
|
||||
</xsd:element>
|
||||
|
||||
<xsd:element name="sentence">
|
||||
<xsd:complexType>
|
||||
<xsd:simpleContent>
|
||||
<xsd:extension base="xsd:string">
|
||||
<xsd:attribute name="num" type="xsd:nonNegativeInteger" use="required" />
|
||||
</xsd:extension>
|
||||
</xsd:simpleContent>
|
||||
</xsd:complexType>
|
||||
</xsd:element>
|
||||
|
||||
</xsd:schema>
|
||||
@ -1,12 +1,16 @@
|
||||
from pathlib import Path
|
||||
import json
|
||||
import utils.constants as const
|
||||
import os
|
||||
|
||||
json_file_path = Path('json/books.json')
|
||||
|
||||
json_data = {'books': []}
|
||||
json_path = os.path.dirname(os.path.dirname(__file__))+'/'+const.JSON_PATH
|
||||
json_file_path = Path(json_path)
|
||||
|
||||
json_data = {'books': {}}
|
||||
if not json_file_path.is_file():
|
||||
json_file = open(json_file_path, 'w')
|
||||
json_file.write(json.dumps(json_data, indent=4))
|
||||
json_file.close()
|
||||
print('JSON File Created :: '+json_file.name)
|
||||
print(const.BLUE, 'JSON File Created :: '+json_file.name, const.END)
|
||||
|
||||
|
||||
@ -0,0 +1,42 @@
|
||||
import xml.etree.ElementTree as ET
|
||||
|
||||
|
||||
def parse_xml_file(full_path):
    """Parse a book XML file into a nested dict.

    Returns {'code': ..., 'bookInfo': {..., 'authors': [...]},
    'content': {'chapters': [{'num', 'name'?, 'sentences': {num: text}}]}}.
    Attribute values (num, translator) are kept as strings, as in the XML.
    """
    tree = ET.parse(full_path)
    book_root = tree.getroot()
    # print('Root Element :: ', book_root.tag, ' | Attributes :: ', book_root.attrib)
    book_dict = {'code': book_root.attrib['code']}

    # bookInfo: authors collect into a list, every other child becomes a key.
    info = {'authors': []}
    for child in book_root.find('bookInfo'):
        if child.tag == 'author':
            author = {'name': child.text}
            if 'translator' in child.attrib:
                author['translator'] = child.attrib['translator']
            info['authors'].append(author)
        else:
            info[child.tag] = child.text
    book_dict['bookInfo'] = info

    # content: chapters with their numbered sentences.
    content = {'chapters': []}
    for chapter in book_root.find('content'):
        chapter_dict = {'num': chapter.attrib['num']}
        if 'name' in chapter.attrib:
            chapter_dict['name'] = chapter.attrib['name']
        sentences = {}
        for sentence in chapter.findall('sentence'):
            sentences[sentence.attrib['num']] = sentence.text
        chapter_dict['sentences'] = sentences
        content['chapters'].append(chapter_dict)
    book_dict['content'] = content

    return book_dict
|
||||
@ -1,6 +0,0 @@
|
||||
from csv2df import get_book_content, get_book_metadata
|
||||
|
||||
from xml_parser.create_xml import create_xml_file
|
||||
|
||||
create_xml_file(get_book_content(), get_book_metadata())
|
||||
|
||||
@ -0,0 +1,14 @@
|
||||
from csv2df import get_book_content, get_book_metadata
|
||||
import xml_parser.create_xml as create_xml
|
||||
import xml_parser.read_xml as read_xml
|
||||
import xml_parser.validate as validate
|
||||
|
||||
|
||||
# Pipeline driver: build the book XML from the CSV data, then validate every
# XML file against the XSD.
file_path = create_xml.create_xml_file(get_book_content(), get_book_metadata())

# print(file_path)

validate.validate_all_xml_files()

# book_dict = read_xml.parse_xml_file('/Users/pavanmandava/PythonWorkspace/bitext-aligner/xml_files/abcdef_en.xml')
||||
@ -0,0 +1,38 @@
|
||||
import xmlschema
|
||||
import json
|
||||
import utils.json_utils as json_utils
|
||||
import utils.constants as const
|
||||
import os
|
||||
|
||||
|
||||
def is_valid(book_schema, xml_path):
    """Return True when the XML file at *xml_path* conforms to *book_schema*."""
    return book_schema.is_valid(xml_path)
|
||||
|
||||
|
||||
def get_book_schema(book_xsd_path):
    """Load the book XSD (path relative to the project root) as an XMLSchema."""
    xsd_full_path = os.path.dirname(os.path.dirname(__file__))+'/'+book_xsd_path
    book_schema = xmlschema.XMLSchema(xsd_full_path)
    return book_schema
|
||||
|
||||
|
||||
def validate_all_xml_files():
    """Validate every not-yet-validated book XML against the book XSD.

    Reads the books.json bookkeeping file, validates each pending entry that
    has an 'xml_file_path', and writes the updated validation flags back.
    """
    json_data = json_utils.read_json_file(const.JSON_PATH)

    book_schema = get_book_schema(const.XSD_PATH)

    books_json = json_data['books']
    for book_code in books_json.keys():
        books_list = books_json[book_code]
        for book in books_list:
            if book['is_validated']:
                # Already validated on a previous run; skip.
                print(const.BLUE, 'Book : ', book['xml_file'], ' is valid', const.END)
                continue
            else:
                if 'xml_file_path' in book:
                    result = book_schema.is_valid(book['xml_file_path'])
                    print('Validating Book : ', book['xml_file'], ' -> ', result)
                    book['is_validated'] = result

    json_data['books'] = books_json
    json_utils.write_json_file(const.JSON_PATH, json_data)
|
||||
Loading…
Reference in new issue