From 2e2353b545d6177565238570e47de688b4433d01 Mon Sep 17 00:00:00 2001
From: Pavan Mandava
Date: Wed, 15 Jan 2020 00:07:00 +0100
Subject: [PATCH] Create XML from Python done

---
Review notes (git am ignores text between '---' and the first 'diff' line):
- The line structure of this patch was restored. Leading whitespace was
  reconstructed from Python syntax and should be checked against the
  original commit, in particular the two removed csv2df.py lines.
- create_xml.py now writes its output inside a 'with' block using an
  explicit UTF-8 encoding, so the file handle is always closed. The blob
  ids on the 'index' lines still name the original blobs.

 create_xml.py    | 63 ++++++++++++++++++++++++++++++++++++++++++++++++
 csv2df.py        | 47 ++++++++++++++++++++++++++++++++++--
 requirements.txt |  3 ++-
 test.py          |  6 +++++
 4 files changed, 116 insertions(+), 3 deletions(-)
 create mode 100644 create_xml.py
 create mode 100644 test.py

diff --git a/create_xml.py b/create_xml.py
new file mode 100644
index 0000000..5b25430
--- /dev/null
+++ b/create_xml.py
@@ -0,0 +1,63 @@
+from xml.etree import ElementTree as ET
+from xml.dom import minidom
+
+
+def create_xml_file(book_dict, book_metadata):
+    book_root = ET.Element('book')
+    book_root.set('id', book_metadata['book_id'])
+
+    book_info = ET.SubElement(book_root, 'bookInfo')
+    content = ET.SubElement(book_root, 'content')
+
+    title = ET.SubElement(book_info, 'title')
+    title.text = book_metadata['title']
+
+    lang = ET.SubElement(book_info, 'lang')
+    lang.text = book_metadata['lang']
+
+    is_translation = ET.SubElement(book_info, 'isTranslation')
+    is_translation.text = book_metadata['isTranslation']
+
+    total_chapters = ET.SubElement(book_info, 'totalChapters')
+    total_chapters.text = book_metadata['totalChapters']
+
+    if 'description' in book_metadata:
+        description = ET.SubElement(book_info, 'description')
+        description.text = book_metadata['description']
+
+    if 'source' in book_metadata:
+        source = ET.SubElement(book_info, 'source')
+        source.text = book_metadata['source']
+
+    if 'isbn' in book_metadata:
+        isbn = ET.SubElement(book_info, 'isbn')
+        isbn.text = book_metadata['isbn']
+
+    authors_list = book_metadata['authors']
+    for auth in authors_list:
+        author = ET.SubElement(book_info, 'author')
+        author.text = auth['name']
+        if 'translator' in auth:
+            author.set('translator', auth['translator'])
+
+    for key in book_dict.keys():
+        chapter = ET.SubElement(content, 'chapter')
+        chapter.set('id', str(key))
+        for idx, val in enumerate(book_dict[key]):
+            sentence = ET.SubElement(chapter, 'sentence')
+            sentence.set('id', str(idx + 1))
+            sentence.text = val
+
+    filename = book_root.get('id') + "_" + lang.text + ".xml"
+    # The XML declaration from prettify() names no encoding, so readers assume
+    # UTF-8: write UTF-8 explicitly and let 'with' close the file handle.
+    with open(filename, 'w', encoding='utf-8') as file:
+        file.write(prettify(book_root))
+
+
+def prettify(element):
+    """ Return a pretty-printed XML string for the Element.
+    """
+    rough_string = ET.tostring(element, 'utf-8')
+    parsed = minidom.parseString(rough_string)
+    return parsed.toprettyxml(indent=" ")
diff --git a/csv2df.py b/csv2df.py
index 7f9ef52..43a2518 100644
--- a/csv2df.py
+++ b/csv2df.py
@@ -1,3 +1,46 @@
+from collections import OrderedDict
+
 import pandas as pd
-df = pd.read_csv("test_example.csv", header=None).rename(
-    columns={0:'chapter', 1:'sentence', 2:'text'})
+
+
+def get_book_content():
+    df = pd.read_csv("test_example.csv", header=None).rename(
+        columns={0: 'chapter', 1: 'sentence', 2: 'text'})
+
+    book_dict = OrderedDict()
+
+    for index, row in df.iterrows():
+        ch_id = row['chapter']
+        s_id = row['sentence']
+        text = row['text']
+        print(ch_id, " -> ", s_id, " -> ", text)
+
+        if ch_id not in book_dict:
+            book_dict[ch_id] = []
+        book_dict[ch_id].append(text)
+
+    return book_dict
+
+
+def get_book_metadata():
+
+    dict_metadata = {
+        "book_id": "abcdef",
+        "title": "Bullshit",
+        "lang": "en",
+        "isTranslation": "true",
+        "totalChapters": "2",
+        "authors": [
+            {
+                "name": "Herr Riley",
+                "translator": "true"
+            },
+            {
+                "name": "Herr Singh"
+            }
+        ],
+        "description": "Some Random Bullshit description",
+        "source": "https://www.idontcare.com"
+    }
+
+    return dict_metadata
diff --git a/requirements.txt b/requirements.txt
index d546ad6..76fe572 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
 google-cloud-translate==2.0.0
 google-cloud-storage==1.19.1
-mysql-connector-python==8.0.19
\ No newline at end of file
+mysql-connector-python==8.0.19
+pandas
\ No newline at end of file
diff --git a/test.py b/test.py
new file mode 100644
index 0000000..02ee0b3
--- /dev/null
+++ b/test.py
@@ -0,0 +1,6 @@
+from csv2df import get_book_content, get_book_metadata
+
+from create_xml import create_xml_file
+
+create_xml_file(get_book_content(), get_book_metadata())
+