diff --git a/requirements.txt b/requirements.txt
index 76fe572..efc90af 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,5 @@
 google-cloud-translate==2.0.0
 google-cloud-storage==1.19.1
 mysql-connector-python==8.0.19
-pandas
\ No newline at end of file
+pandas
+xmlschema
\ No newline at end of file
diff --git a/run.py b/run.py
index b914531..cc2761a 100644
--- a/run.py
+++ b/run.py
@@ -1,20 +1,18 @@
 import json
-from pathlib import Path
 import xml_parser.read_xml as read_xml
 import db.add_book as adb
+import xml_parser.validate as validate
+import utils.json_utils as json_utils
+import utils.constants as const
 
-json_file_path = Path('json/books.json')
-
-with open(json_file_path, 'r') as json_file:
-    json_data = json.load(json_file)
-    json_file.close()
+def save_validated_files_to_db():
+    json_data = json_utils.read_json_file(const.JSON_PATH)
 
     books_json = json_data['books']
     for book_code in books_json.keys():
         books_list = books_json[book_code]
         for book in books_list:
-            # TODO :: Add not for the below check later (after doing XSD)
-            if book['is_validated']:
+            if not book['is_validated']:
                 print('Book : ', book['xml_file'], ' is not validated against XSD')
                 continue
             if not book['is_saved_to_db']:
@@ -24,8 +22,11 @@ with open(json_file_path, 'r') as json_file:
                 book['is_saved_to_db'] = result
 
     json_data['books'] = books_json
+    json_utils.write_json_file(const.JSON_PATH, json_data)
+
+
+def validate_all_xml_files():
+    validate.validate_all_xml_files()
 
 
-    with open(json_file_path, 'w') as updated_json:
-        updated_json.write(json.dumps(json_data, indent=4))
-        updated_json.close()
+validate_all_xml_files()
\ No newline at end of file
diff --git a/utils/constants.py b/utils/constants.py
new file mode 100644
index 0000000..1ce96a6
--- /dev/null
+++ b/utils/constants.py
@@ -0,0 +1,3 @@
+JSON_PATH = 'json/books.json'
+
+XSD_PATH = 'xml_files/book.xsd'
\ No newline at end of file
diff --git a/utils/json_utils.py b/utils/json_utils.py
new file mode 100644
index 0000000..d02296c
--- /dev/null
+++ b/utils/json_utils.py
@@ -0,0 +1,19 @@
+import json
+from pathlib import Path
+
+
+def read_json_file(file_path):
+    json_file_path = Path(file_path)
+
+    with open(json_file_path, 'r') as json_file:
+        json_data = json.load(json_file)
+        json_file.close()
+    return json_data
+
+
+def write_json_file(file_path, json_data):
+    json_file_path = Path(file_path)
+
+    with open(json_file_path, 'w') as updated_json:
+        updated_json.write(json.dumps(json_data, indent=4))
+        updated_json.close()
\ No newline at end of file
diff --git a/xml_parser/create_xml.py b/xml_parser/create_xml.py
index 0d18efb..ca90bab 100644
--- a/xml_parser/create_xml.py
+++ b/xml_parser/create_xml.py
@@ -2,7 +2,8 @@ from xml.etree import ElementTree as ET
 from xml.dom import minidom
 import os
 import json
-from pathlib import Path
+import utils.json_utils as json_utils
+import utils.constants as const
 
 
 def create_xml_file(book_dict, book_metadata):
@@ -69,13 +70,12 @@ def create_xml_file(book_dict, book_metadata):
     json_obj['is_saved_to_db'] = False
     add_xml_book_data_to_json(book_code, json_obj)
 
+    return file_path
+
 
 def add_xml_book_data_to_json(book_code, json_obj):
-    json_file_path = Path('json/books.json')
 
-    json_file = open(json_file_path, 'r')
-    json_data = json.load(json_file)
-    json_file.close()
+    json_data = json_utils.read_json_file(const.JSON_PATH)
 
     books = json_data['books']
     if book_code in books.keys():
@@ -85,9 +85,7 @@ def add_xml_book_data_to_json(book_code, json_obj):
 
     json_data['books'] = books
 
-    json_file = open(json_file_path, 'w')
-    json_file.write(json.dumps(json_data, indent=4))
-    json_file.close()
+    json_utils.write_json_file(const.JSON_PATH, json_data)
 
 
 def prettify(root):
diff --git a/xml_parser/test.py b/xml_parser/test.py
index ad15380..af6fda8 100644
--- a/xml_parser/test.py
+++ b/xml_parser/test.py
@@ -1,11 +1,17 @@
 from csv2df import get_book_content, get_book_metadata
 import xml_parser.create_xml as create_xml
 import xml_parser.read_xml as read_xml
+import xmlschema
+from pathlib import Path
+import xml_parser.validate as validate
 
-create_xml.create_xml_file(get_book_content(), get_book_metadata())
+file_path = create_xml.create_xml_file(get_book_content(), get_book_metadata())
+print(file_path)
+
+validate.validate_all_xml_files()
+
+# book_dict = read_xml.parse_xml_file('/Users/pavanmandava/PythonWorkspace/bitext-aligner/xml_files/abcdef_en.xml')
 
-book_dict = read_xml.parse_xml_file('/Users/pavanmandava/PythonWorkspace/bitext-aligner/xml_files/abcdef_en.xml')
-print(book_dict)
 
 
 
diff --git a/xml_parser/validate.py b/xml_parser/validate.py
new file mode 100644
index 0000000..2ea710e
--- /dev/null
+++ b/xml_parser/validate.py
@@ -0,0 +1,38 @@
+import xmlschema
+import json
+from pathlib import Path
+import utils.json_utils as json_utils
+import utils.constants as const
+
+
+def is_valid(book_schema, xml_path):
+    return book_schema.is_valid(xml_path)
+
+
+def get_book_schema(book_xsd_path):
+    xsd_path = Path(book_xsd_path)
+    book_schema = xmlschema.XMLSchema(str(xsd_path.absolute()))
+    return book_schema
+
+
+def validate_all_xml_files():
+
+    json_data = json_utils.read_json_file(const.JSON_PATH)
+
+    book_schema = get_book_schema(const.XSD_PATH)
+
+    books_json = json_data['books']
+    for book_code in books_json.keys():
+        books_list = books_json[book_code]
+        for book in books_list:
+            if book['is_validated']:
+                print('Book : ', book['xml_file'], ' is valid')
+                continue
+            else:
+                if 'xml_file_path' in book:
+                    result = book_schema.is_valid(book['xml_file_path'])
+                    print('Validating Book : ', book['xml_file'], ' -> ', result)
+                    book['is_validated'] = result
+
+    json_data['books'] = books_json
+    json_utils.write_json_file(const.JSON_PATH, json_data)