"""Read FictionBook 2.0 (.fb2) files into plain dictionaries.

The entry point is :func:`read_fb2_file`, which returns a dictionary with the
book metadata (taken from ``<description>/<title-info>``) and the list of
chapters (taken from the sections of the first ``<body>``).
"""
import os
import unicodedata as unicode

from lxml import etree

import utils.constants as const

# Prefix -> namespace URI map passed to every find()/findall() call below.
namespaces = {'xmlns': "http://www.gribuser.ru/xml/fictionbook/2.0"}


def read_fb2_file(file_name, book_code, source, encoding):
    """Parse an FB2 file and return its metadata and chapters.

    The file is looked up as ``<parent of this module's directory>/data/<file_name>``.

    Args:
        file_name: Name of the file, relative to the ``data`` directory.
        book_code: Identifier stored under ``metadata['book_code']``.
        source: Value stored under ``metadata['source']``.
        encoding: Currently unused; kept so existing callers keep working.
            The document is parsed with lxml's default parser.

    Returns:
        A dict with two keys: ``'metadata'`` (see
        :func:`create_book_info_dict`, with ``'totalChapters'`` set to the
        number of extracted chapters) and ``'content'`` (see
        :func:`create_book_content`).
    """
    full_file_path = os.path.dirname(os.path.dirname(__file__)) + '/data/' + file_name
    print(const.BLUE, 'Reading :: ', full_file_path, const.END)

    root = etree.parse(full_file_path).getroot()

    book_info_dict = create_book_info_dict(root, book_code, source)
    # The metadata dict is empty when the file has no <title-info>, so the
    # language must be read with .get() instead of indexing.
    book_content = create_book_content(root, book_info_dict.get('lang'))
    book_info_dict['totalChapters'] = len(book_content)

    return {'metadata': book_info_dict, 'content': book_content}


def _element_text(element):
    """Return the stripped text of *element*, or None when it has no text.

    Text inside inline children (and the text following them) is included,
    so a paragraph containing inline markup is not cut at the first child.
    """
    raw_text = ''.join(element.itertext())
    if not raw_text:
        return None
    return raw_text.strip()


def _append_chapter(book_content_list, chapter_name, section):
    """Append a chapter built from *section*, unless it has no text content."""
    chapter_content = get_chapter_content_from_section(section)
    if not chapter_content:
        return
    book_content_list.append({
        # Chapters are numbered from 1 in the order they are appended.
        'chapter_num': len(book_content_list) + 1,
        'chapter_name': chapter_name,
        'text_content': chapter_content,
    })


def create_book_content(root, lang):
    """Build the list of chapters from the first ``<body>`` of the document.

    Each top-level ``<section>`` is handled in one of two ways:

    * If it contains nested ``<section>`` elements, every nested section
      becomes a chapter named ``"<section title> - <sub-section title>"``
      (only the sub-section title when the parent has none). Paragraphs that
      sit directly in such a parent section are not extracted.
    * Otherwise, if it contains paragraphs, the section itself is a chapter.

    Sections without a title or without text are skipped.

    Args:
        root: Root element of the parsed FB2 document.
        lang: Language key forwarded to :func:`get_section_title`.

    Returns:
        A list of dicts with the keys ``'chapter_num'``, ``'chapter_name'``
        and ``'text_content'``; empty when the document has no ``<body>``.
    """
    book_content_list = []
    body_element = root.find('xmlns:body', namespaces)
    if body_element is None:
        return book_content_list

    for section in body_element.findall('xmlns:section', namespaces):
        sub_section_list = section.findall('xmlns:section', namespaces)

        if sub_section_list:
            section_title = get_section_title(section, lang, False).strip()
            for sub_section in sub_section_list:
                sub_section_title = get_section_title(sub_section, lang, True).strip()
                if not sub_section_title:
                    continue
                if section_title:
                    chapter_name = section_title + ' - ' + sub_section_title
                else:
                    # Avoid names such as " - Chapter 1" for untitled parents.
                    chapter_name = sub_section_title
                _append_chapter(book_content_list, chapter_name, sub_section)
            continue

        # Header-only section: neither nested sections nor paragraphs.
        if not section.findall('xmlns:p', namespaces):
            continue

        section_title = get_section_title(section, lang, True).strip()
        if not section_title:
            continue
        _append_chapter(book_content_list, section_title, section)

    return book_content_list


def get_chapter_content_from_section(section):
    """Return the text of the paragraphs directly inside *section*.

    The non-empty paragraph texts are joined with single spaces and the
    result is NFKD-normalized.

    Args:
        section: A ``<section>`` element.

    Returns:
        The normalized text (possibly ``''`` when no paragraph has text), or
        ``None`` when the section has no direct ``<p>`` children.
    """
    paragraphs = section.findall('xmlns:p', namespaces)
    if not paragraphs:
        return None

    paragraph_texts = (_element_text(paragraph) for paragraph in paragraphs)
    chapter_content = ' '.join(text for text in paragraph_texts if text)
    return unicode.normalize("NFKD", chapter_content)


def get_section_title(section, lang, is_chapter):
    """Return the title of *section*, built from its ``<title>/<p>`` elements.

    Every title paragraph is appended preceded by one space, so a non-empty
    result starts with a space; callers are expected to strip it.

    Args:
        section: A ``<section>`` element.
        lang: Key used to look up ``const.CHAPTER_NAMES`` when a prefix is
            needed (presumably a language code -- confirm against
            ``utils.constants``).
        is_chapter: When true, a short title (at most 5 characters, counting
            the leading space) is prefixed with ``const.CHAPTER_NAMES[lang]``.

    Returns:
        The title string; ``''`` when the section has no title text and no
        prefix is applied.
    """
    title_parts = []
    for title_p in section.findall('xmlns:title/xmlns:p', namespaces):
        p_text = _element_text(title_p)
        if p_text is None:
            continue
        title_parts.append(p_text)

    section_title_name = ''.join(' ' + part for part in title_parts)

    if is_chapter and len(section_title_name) <= 5:
        # Strip before joining so the result reads "<prefix> 1" rather than
        # "<prefix>  1" (the accumulated title already starts with a space).
        section_title_name = const.CHAPTER_NAMES[lang] + ' ' + section_title_name.strip()

    return section_title_name


def create_book_info_dict(root, book_code, book_source):
    """Collect book metadata from ``<description>/<title-info>``.

    Args:
        root: Root element of the parsed FB2 document.
        book_code: Stored under ``'book_code'``.
        book_source: Stored under ``'source'``.

    Returns:
        An empty dict when the document has no ``<title-info>``. Otherwise a
        dict with:

        * ``'book_code'``, ``'source'``: the given arguments;
        * ``'title'``: text of ``<book-title>`` (key absent if the element is
          missing);
        * ``'lang'``: text of ``<lang>``, or ``None`` if the element is missing;
        * ``'isTranslation'``: ``'true'`` when ``<src-lang>`` is present and
          its text differs from ``'lang'``, else ``'false'``;
        * ``'totalChapters'``: ``0`` (filled in by the caller);
        * ``'authors'``: one ``{'name': ...}`` per named ``<author>`` followed
          by one ``{'name': ..., 'translator': 'true'}`` per named
          ``<translator>``.
    """
    book_info_dict = {}
    title_info = root.find('xmlns:description/xmlns:title-info', namespaces)
    if title_info is None:
        return book_info_dict

    book_info_dict['book_code'] = book_code

    book_title = title_info.find('xmlns:book-title', namespaces)
    if book_title is not None:
        book_info_dict['title'] = book_title.text

    # Tolerate a missing <lang> instead of failing on None.text.
    lang = title_info.find('xmlns:lang', namespaces)
    book_info_dict['lang'] = lang.text if lang is not None else None

    src_lang = title_info.find('xmlns:src-lang', namespaces)
    is_translation = src_lang is not None and src_lang.text != book_info_dict['lang']
    book_info_dict['isTranslation'] = 'true' if is_translation else 'false'

    book_info_dict['totalChapters'] = 0
    book_info_dict['source'] = book_source

    authors = []
    # A book may list several authors (or none); handle them like translators.
    for author in title_info.findall('xmlns:author', namespaces):
        author_name = get_author_name(author)
        if author_name:
            authors.append({'name': author_name})
    for translator in title_info.findall('xmlns:translator', namespaces):
        translator_name = get_author_name(translator)
        if translator_name:
            authors.append({'name': translator_name, 'translator': 'true'})
    book_info_dict['authors'] = authors

    return book_info_dict


def get_author_name(author_root):
    """Return the full name stored in an ``<author>``/``<translator>`` element.

    The first, middle and last name are stripped and joined with single
    spaces; missing or blank parts are left out, so the result never has
    leading, trailing or doubled spaces.

    Args:
        author_root: The person element, or ``None``.

    Returns:
        The joined name, or ``''`` when *author_root* is ``None`` or holds no
        name parts.
    """
    if author_root is None:
        return ''

    name_parts = []
    for tag in ('first-name', 'middle-name', 'last-name'):
        element = author_root.find('xmlns:' + tag, namespaces)
        if element is None or element.text is None:
            continue
        part = element.text.strip()
        if part:
            name_parts.append(part)

    return ' '.join(name_parts)