parent
46f2e93e9a
commit
2e2353b545
@ -0,0 +1,63 @@
|
||||
from xml.etree import ElementTree as ET
|
||||
from xml.dom import minidom
|
||||
|
||||
|
||||
def _add_text_child(parent, tag, text):
    """Append a ``<tag>`` child holding ``text`` to ``parent`` and return it."""
    child = ET.SubElement(parent, tag)
    child.text = text
    return child


def create_xml_file(book_dict, book_metadata):
    """Serialize a book to ``<book_id>_<lang>.xml`` in the working directory.

    The document has a ``<book id=...>`` root with two children:
    ``<bookInfo>`` (metadata) and ``<content>`` (one ``<chapter id=...>`` per
    key of ``book_dict``, each holding ``<sentence id=...>`` elements numbered
    from 1 in list order).

    Args:
        book_dict: Mapping of chapter id -> sequence of sentence strings.
            Chapters are written in the mapping's iteration order.
        book_metadata: Mapping with the required keys ``book_id``, ``title``,
            ``lang``, ``isTranslation``, ``totalChapters`` and ``authors``
            (a list of dicts with a ``name`` and an optional ``translator``
            entry). ``description``, ``source`` and ``isbn`` are written only
            when present.

    Raises:
        KeyError: If a required metadata key is missing.
    """
    book_root = ET.Element('book')
    book_root.set('id', book_metadata['book_id'])

    book_info = ET.SubElement(book_root, 'bookInfo')
    content = ET.SubElement(book_root, 'content')

    # Mandatory metadata: a missing key is an error.
    for tag in ('title', 'lang', 'isTranslation', 'totalChapters'):
        _add_text_child(book_info, tag, book_metadata[tag])

    # Optional metadata: emitted only when the caller supplied it.
    for tag in ('description', 'source', 'isbn'):
        if tag in book_metadata:
            _add_text_child(book_info, tag, book_metadata[tag])

    for auth in book_metadata['authors']:
        author = _add_text_child(book_info, 'author', auth['name'])
        if 'translator' in auth:
            author.set('translator', auth['translator'])

    for chapter_id, sentences in book_dict.items():
        chapter = ET.SubElement(content, 'chapter')
        chapter.set('id', str(chapter_id))
        for number, text in enumerate(sentences, start=1):
            sentence = _add_text_child(chapter, 'sentence', text)
            sentence.set('id', str(number))

    filename = book_metadata['book_id'] + "_" + book_metadata['lang'] + ".xml"

    # Serialize before touching the file system so that a serialization error
    # does not leave a truncated/empty file behind.
    xml_text = prettify(book_root)

    # The generated XML declaration names no encoding, which XML parsers read
    # as UTF-8, so write UTF-8 explicitly instead of the platform default.
    # The context manager guarantees the handle is flushed and closed.
    with open(filename, 'w', encoding='utf-8') as xml_file:
        xml_file.write(xml_text)
|
||||
|
||||
|
||||
def prettify(element, indent=" "):
    """Return a pretty-printed XML string for ``element``.

    The element is serialized to UTF-8 bytes with ElementTree and re-parsed
    with minidom, whose ``toprettyxml`` adds the line breaks and indentation.

    Args:
        element: The ``xml.etree.ElementTree.Element`` to render.
        indent: String used for each nesting level (default: one space).

    Returns:
        The indented document as ``str``, starting with an XML declaration.
    """
    rough_string = ET.tostring(element, 'utf-8')
    return minidom.parseString(rough_string).toprettyxml(indent=indent)
|
||||
@ -1,3 +1,46 @@
|
||||
from collections import OrderedDict
|
||||
|
||||
import pandas as pd
|
||||
# Module-level frame: the headerless example CSV is read at import time and
# its three positional columns are given names.
df = pd.read_csv("test_example.csv", header=None)
df = df.rename(columns={0: 'chapter', 1: 'sentence', 2: 'text'})
|
||||
|
||||
|
||||
def get_book_content(csv_path="test_example.csv", *, verbose=False):
    """Read a headerless book CSV and group its sentences by chapter.

    The CSV's first three columns are interpreted as chapter id, sentence id
    and sentence text. Sentences are collected in file order; the sentence id
    column is not used for ordering.

    Args:
        csv_path: Path of the CSV file to read (defaults to the example file).
        verbose: When true, print each ``chapter -> sentence -> text`` row
            while reading.

    Returns:
        An ``OrderedDict`` mapping each chapter id to the list of its sentence
        texts, with chapters in order of first appearance.
    """
    frame = pd.read_csv(csv_path, header=None).rename(
        columns={0: 'chapter', 1: 'sentence', 2: 'text'})

    book_dict = OrderedDict()

    # Iterate the columns directly rather than with iterrows(): it is faster
    # and each value keeps its own column's dtype instead of being coerced to
    # a common per-row dtype.
    rows = zip(frame['chapter'], frame['sentence'], frame['text'])
    for ch_id, s_id, text in rows:
        if verbose:
            print(ch_id, " -> ", s_id, " -> ", text)
        book_dict.setdefault(ch_id, []).append(text)

    return book_dict
|
||||
|
||||
|
||||
def get_book_metadata():
    """Return a freshly built, hard-coded sample metadata dict for one book.

    The result carries the book id, title, language, translation flag,
    chapter count (all as strings), a list of author dicts, a description and
    a source URL.
    """
    # The first author carries a "translator" entry; the second has only a name.
    authors = [
        {"name": "Herr Riley", "translator": "true"},
        {"name": "Herr Singh"},
    ]

    return {
        "book_id": "abcdef",
        "title": "Bullshit",
        "lang": "en",
        "isTranslation": "true",
        "totalChapters": "2",
        "authors": authors,
        "description": "Some Random Bullshit description",
        "source": "https://www.idontcare.com",
    }
|
||||
|
||||
@ -1,3 +1,4 @@
|
||||
google-cloud-translate==2.0.0
|
||||
google-cloud-storage==1.19.1
|
||||
mysql-connector-python==8.0.19
|
||||
pandas
|
||||
Loading…
Reference in new issue