You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
187 lines
6.6 KiB
187 lines
6.6 KiB
from lxml import etree
|
|
import os
|
|
import utils.constants as const
|
|
import unicodedata as unicode
|
|
|
|
|
|
# Prefix-to-URI map handed to every find()/findall() call in this module;
# the 'xmlns' prefix used in the path expressions resolves to this URI.
namespaces = {'xmlns': "http://www.gribuser.ru/xml/fictionbook/2.0"}
|
|
|
|
|
|
def read_fb2_file(file_name, book_code, source, encoding):
    """Parse an FB2 file from the ``data`` directory into a book dictionary.

    The file is looked up under ``<parent of this module's directory>/data/``.

    :param file_name: name of the file inside the ``data`` directory.
    :param book_code: identifier stored in the book metadata.
    :param source: source label stored in the book metadata.
    :param encoding: accepted for interface compatibility; not used here.
    :return: dict with ``'metadata'`` (book info, including
        ``'totalChapters'``) and ``'content'`` (list of chapter dicts).
    """
    project_dir = os.path.dirname(os.path.dirname(__file__))
    full_file_path = project_dir+'/data/'+file_name
    print(const.BLUE, 'Reading :: ', full_file_path, const.END)

    root = etree.parse(full_file_path).getroot()

    metadata = create_book_info_dict(root, book_code, source)
    chapters = create_book_content(root, metadata['lang'])

    # The chapter count is only known once the content has been extracted.
    metadata['totalChapters'] = len(chapters)

    return {'metadata': metadata, 'content': chapters}
|
|
|
|
|
|
def create_book_content(root, lang):
    """Collect the chapters found under the document's first ``body`` element.

    Each top-level ``section`` of the body is handled in one of two ways:

    * If it contains nested ``section`` elements, every nested section is a
      chapter candidate and its name is prefixed with the parent title
      (``"<parent title> - <nested title>"``).
    * Otherwise, if it has direct ``p`` children, the section itself is the
      chapter candidate.

    Candidates whose title or content comes back empty are dropped, so the
    chapter numbers stay consecutive starting from 1.

    :param root: root element of the parsed FB2 document.
    :param lang: language code forwarded to ``get_section_title``.
    :return: list of dicts with keys ``chapter_num``, ``chapter_name`` and
        ``text_content``; empty when the document has no body.
    """
    chapters = []

    body = root.find('xmlns:body', namespaces)
    if body is None:
        return chapters

    for section in body.findall('xmlns:section', namespaces):
        nested_sections = section.findall('xmlns:section', namespaces)

        if nested_sections:
            # Nested sections become chapters named after their parent.
            parent_title = get_section_title(section, lang, False).strip()
            name_prefix = parent_title+' - '
            candidates = nested_sections
        elif section.findall('xmlns:p', namespaces):
            # A flat section with paragraphs is a chapter on its own.
            name_prefix = ''
            candidates = [section]
        else:
            # Header-only section: neither nested sections nor paragraphs.
            continue

        for candidate in candidates:
            title = get_section_title(candidate, lang, True)
            if not title:
                continue

            text = get_chapter_content_from_section(candidate)
            if not text:
                continue

            chapters.append({
                # One more than the chapters accepted so far.
                'chapter_num': len(chapters) + 1,
                'chapter_name': name_prefix+title.strip(),
                'text_content': text
            })

    return chapters
|
|
|
|
|
|
def get_chapter_content_from_section(section):
    """Return the NFKD-normalized text of a section's direct paragraphs.

    The full text of each ``p`` child is used, including text inside and
    after inline child elements. Reading only ``paragraph.text`` would stop
    at the first inline child and drop the remainder of the paragraph.

    :param section: element whose direct ``p`` children are read.
    :return: ``None`` when the section has no ``p`` children; otherwise a
        string in which every non-empty paragraph is stripped and preceded
        by a single space (so the result starts with a space when at least
        one paragraph contributed text).
    """
    paragraphs = section.findall('xmlns:p', namespaces)
    if not paragraphs:
        return None

    pieces = []
    for paragraph in paragraphs:
        text_nodes = list(paragraph.itertext())
        # Paragraphs without any text node contribute nothing at all.
        if not text_nodes:
            continue
        # Keep the historical " <paragraph>" layout of the output.
        pieces.append(' '+''.join(text_nodes).strip())

    # Join once instead of growing a string inside the loop.
    return unicode.normalize("NFKD", ''.join(pieces))
|
|
|
|
|
|
def get_section_title(section, lang, is_chapter):
    """Build a section title from the ``p`` elements of its ``title`` child.

    The leading text of every title paragraph is stripped and appended with
    a single space in front of it. When ``is_chapter`` is true and the
    resulting title is at most five characters long, the language-specific
    entry of ``const.CHAPTER_NAMES`` is put in front of it.

    :param section: element whose ``title/p`` children are read.
    :param lang: key used to look up ``const.CHAPTER_NAMES``.
    :param is_chapter: whether short titles should get the chapter prefix.
    :return: the assembled title; an empty string when nothing was found
        and no prefix applies.
    """
    title_parts = [
        node.text.strip()
        for node in section.findall('xmlns:title/xmlns:p', namespaces)
        if node.text is not None
    ]
    title = ''.join(' '+part for part in title_parts)

    # Longer titles, and non-chapter titles, are returned untouched.
    if not is_chapter or len(title) > 5:
        return title

    return const.CHAPTER_NAMES[lang]+' '+title
|
|
|
|
|
|
def create_book_info_dict(root, book_code, book_source):
    """Build the metadata dictionary from ``description/title-info``.

    :param root: root element of the parsed FB2 document.
    :param book_code: value stored under ``'book_code'``.
    :param book_source: value stored under ``'source'``.
    :return: an empty dict when the document has no ``title-info``;
        otherwise a dict with ``'book_code'``, ``'title'`` (only when a
        ``book-title`` element exists), ``'lang'``, ``'isTranslation'``
        (the string ``'true'`` or ``'false'``), ``'totalChapters'``
        (initialised to 0), ``'source'`` and ``'authors'``. ``'authors'``
        lists every author with a non-empty name, followed by every such
        translator, the latter marked with ``'translator': 'true'``.
    :raises AttributeError: when ``title-info`` has no ``lang`` element.
    """
    title_info = root.find('xmlns:description/xmlns:title-info', namespaces)
    if title_info is None:
        return {}

    book_info_dict = {'book_code': book_code}

    book_title = title_info.find('xmlns:book-title', namespaces)
    if book_title is not None:
        book_info_dict['title'] = book_title.text

    # Deliberately unguarded: a title-info without <lang> fails right here.
    book_info_dict['lang'] = title_info.find('xmlns:lang', namespaces).text

    # A book counts as a translation only when a source language is given
    # and differs from the book's own language.
    src_lang = title_info.find('xmlns:src-lang', namespaces)
    is_translation = (
        src_lang is not None and src_lang.text != book_info_dict['lang']
    )
    book_info_dict['isTranslation'] = 'true' if is_translation else 'false'

    book_info_dict['totalChapters'] = 0
    book_info_dict['source'] = book_source

    authors = []
    # findall() keeps every <author> (not just the first one) and simply
    # yields nothing when the element is absent, so no None is passed on.
    for author in title_info.findall('xmlns:author', namespaces):
        author_name = get_author_name(author)
        if len(author_name) > 0:
            authors.append({'name': author_name})

    for translator in title_info.findall('xmlns:translator', namespaces):
        translator_name = get_author_name(translator)
        if len(translator_name) > 0:
            authors.append({'name': translator_name, 'translator': 'true'})

    book_info_dict['authors'] = authors

    return book_info_dict
|
|
|
|
|
|
def get_author_name(author_root):
    """Assemble a person's display name from an author-like element.

    The stripped texts of the ``first-name``, ``middle-name`` and
    ``last-name`` children are joined, in that order, with single spaces.
    Missing, empty or whitespace-only parts are skipped, so the result
    never carries leading, trailing or doubled spaces.

    :param author_root: element holding the name parts, or ``None``.
    :return: the joined name; an empty string when ``author_root`` is
        ``None`` or no name part has text.
    """
    # Callers pass the result of find(), which is None for a missing element.
    if author_root is None:
        return ''

    name_parts = []
    for tag in ('first-name', 'middle-name', 'last-name'):
        element = author_root.find('xmlns:'+tag, namespaces)
        if element is None or element.text is None:
            continue
        text = element.text.strip()
        if text:
            name_parts.append(text)

    return ' '.join(name_parts)