Reading FB2 Files done, Creating XML from parsed FB2 file done, FB2 files structure issues fixed

master
Pavan Mandava 6 years ago
parent 0c427f60eb
commit 017d8ec27d

@ -15,29 +15,23 @@ translate_client = translate.Client()
'''
def master_align(text0, text1, lang0, lang1):
""" Takes two equivalent texts (original and trnslation) and returns
aligned texts. """
df0 = frame_from_text(text0, lang0, lang1)
print('A')
# print('A')
df1 = frame_from_text(text1, lang1, lang0, is1=True)
print('B')
# print('B')
# returns dfs with ['sent', 'trans', 'rellen', 'relpos']
anchors = anchors_from_frames(df0, df1, window=2)
print('C')
# print('C')
alignments = intermediate_align(df0, df1, anchors, lookahead=4)
print('D')
# print('D')
textdict0, textdict1 = textdicts_from_alignments(df0, df1, alignments)
print('E')
# print('E')
return textdict0, textdict1
def frame_from_text(text, source, target, is1=False): #
""" """ #
#print(source, '-->', target)
@ -85,7 +79,7 @@ def intermediate_align(frame0, frame1, anchs, lookahead): #
def get_interalign(df0, df1, anchors_init, anchors_next, lookahead): #
""" """
print(anchors_init, anchors_next)
# print(anchors_init, anchors_next)
interaligns = []
i,j = anchors_init
i+=1
@ -95,7 +89,7 @@ def get_interalign(df0, df1, anchors_init, anchors_next, lookahead): #
room0, room1 = min(end0-i,lookahead), min(end1-j,lookahead)
lambdascore = lambda p,q: score(df0, df1, i, j, p, q)
i_,j_ = min([(x,y) for x,y in cp(range(i,i+room0),range(j,j+room1)) if x==i or y==j], key=lambda a: lambdascore(*a))
print((i,j), (i_,j_))
# print((i,j), (i_,j_))
interaligns.append(((i,j),(i_,j_)))
i,j = i_+1,j_+1
return interaligns
@ -122,10 +116,10 @@ def textdicts_from_alignments(frame0, frame1, aligns): #
for i,((a0,a1),(b0,b1)) in enumerate(aligns):
t0 = ' '.join(frame0.loc[a0:b0, 'sent0'])
t1 = ' '.join(frame1.loc[a1:b1, 'sent1'])
print('***************************')
print(aligns[i])
print(t0)
print(t1)
# print('***************************')
# print(aligns[i])
# print(t0)
# print(t1)
textdict0.update({i:t0})
textdict1.update({i:t1})
return textdict0, textdict1

@ -1,13 +1,15 @@
Index;BookCode;Language;BookName;Status
1;dost_cap;de;crime_DE.fb2;
2;dost_cap;en;crime_EN.fb2;
3;dost_cap;ru;crime_RU.fb2;
4;dost_gambler;de;gambler_DE.fb2;
5;dost_gambler;en;gambler_EN.fb2;
6;dost_gambler;ru;gambler_RU.fb2;
7;dost_karamazov;de;karamazov_DE.fb2;
8;dost_karamazov;en;karamazov_EN.fb2;
9;dost_karamazov;ru;karamazov_RU.fb2;
10;dost_underground;de;underground_DE.fb2;
11;dost_underground;en;underground_EN.fb2;
12;dost_underground;ru;undrground_RU.fb2;
Index;BookCode;Language;BookName;Source;Encoding;Status
1;dost_cap_ende;en;crime_EN.fb2;http://originalbook.ru/;UTF-8;
2;dost_cap_ende;de;crime_DE.fb2;http://originalbook.ru/;UTF-8;
3;dost_cap_enru;en;crime_EN.fb2;http://originalbook.ru/;UTF-8;
4;dost_cap_enru;ru;crime_RU.fb2;http://originalbook.ru/;windows-1251;
5;dost_gambler_ende;en;gambler_EN.fb2;http://originalbook.ru/;UTF-8;
6;dost_gambler_ende;de;gambler_DE.fb2;http://originalbook.ru/;UTF-8;
7;dost_gambler_enru;en;gambler_EN.fb2;http://originalbook.ru/;UTF-8;
8;dost_gambler_enru;ru;gambler_RU.fb2;http://originalbook.ru/;windows-1251;
7;dost_karamazov;de;karamazov_DE.fb2;http://originalbook.ru/;UTF-8;
8;dost_karamazov;en;karamazov_EN.fb2;http://originalbook.ru/;UTF-8;
9;dost_karamazov;ru;karamazov_RU.fb2;http://originalbook.ru/;UTF-8;
10;dost_underground;de;underground_DE.fb2;http://originalbook.ru/;UTF-8;
11;dost_underground;en;underground_EN.fb2;http://originalbook.ru/;UTF-8;
12;dost_underground;ru;undrground_RU.fb2;http://originalbook.ru/;windows-1251;
1 Index BookCode Language BookName Source Encoding Status
2 1 dost_cap dost_cap_ende de en crime_DE.fb2 crime_EN.fb2 http://originalbook.ru/ UTF-8
3 2 dost_cap dost_cap_ende en de crime_EN.fb2 crime_DE.fb2 http://originalbook.ru/ UTF-8
4 3 dost_cap dost_cap_enru ru en crime_RU.fb2 crime_EN.fb2 http://originalbook.ru/ UTF-8
5 4 dost_gambler dost_cap_enru de ru gambler_DE.fb2 crime_RU.fb2 http://originalbook.ru/ windows-1251
6 5 dost_gambler dost_gambler_ende en gambler_EN.fb2 http://originalbook.ru/ UTF-8
7 6 dost_gambler dost_gambler_ende ru de gambler_RU.fb2 gambler_DE.fb2 http://originalbook.ru/ UTF-8
8 7 dost_karamazov dost_gambler_enru de en karamazov_DE.fb2 gambler_EN.fb2 http://originalbook.ru/ UTF-8
9 8 dost_karamazov dost_gambler_enru en ru karamazov_EN.fb2 gambler_RU.fb2 http://originalbook.ru/ windows-1251
10 9 7 dost_karamazov ru de karamazov_RU.fb2 karamazov_DE.fb2 http://originalbook.ru/ UTF-8
11 10 8 dost_underground dost_karamazov de en underground_DE.fb2 karamazov_EN.fb2 http://originalbook.ru/ UTF-8
12 11 9 dost_underground dost_karamazov en ru underground_EN.fb2 karamazov_RU.fb2 http://originalbook.ru/ UTF-8
13 12 10 dost_underground ru de undrground_RU.fb2 underground_DE.fb2 http://originalbook.ru/ UTF-8
14 11 dost_underground en underground_EN.fb2 http://originalbook.ru/ UTF-8
15 12 dost_underground ru undrground_RU.fb2 http://originalbook.ru/ windows-1251

@ -15,6 +15,7 @@
<image l:href="#doc2fb_image_02000001.jpg" />
</coverpage>
<lang>de</lang>
<src-lang>ru</src-lang>
</title-info>
<document-info>
<author>

File diff suppressed because one or more lines are too long

@ -751,7 +751,7 @@
</section>
<section>
<title>
<p>Chapter 4. The Confession of a Passionate Heart- In Anecdote</p>
<p>Chapter 4 and 5. The Confession of a Passionate Heart</p>
</title>
<p />
<p>"I was leading a wild life then. Father said just now that I spent several thousand roubles in seducing young girls. That's a swinish invention, and there was nothing of the sort. And if there was, I didn't need money simply for that. With me money is an accessory, the overflow of my heart, the framework. To-day she would be my lady, to-morrow a wench out of the streets in her place. I entertained them both. I threw away money by the handful on music, rioting, and Gypsies. Sometimes I gave it to the ladies, too, for they'll take it greedily, that must be admitted, and be pleased and thankful for it. Ladies used to be fond of me: not all of them, but it happened, it happened. But I always liked side-paths, little dark back-alleys behind the main road- there one finds adventures and surprises, and precious metal in the dirt. I am speaking figuratively, brother. In the town I was in, there were no such back-alleys in the literal sense, but morally there were. If you were like me, you'd know what that means. I loved vice, I loved the ignominy of vice. I loved cruelty; am I not a bug, am I not a noxious insect? In fact a Karamazov! Once we went, a whole lot of us, for a picnic, in seven sledges. It was dark, it was winter, and I began squeezing a girl's hand, and forced her to kiss me. She was the daughter of an official, a sweet, gentle, submissive creature. She allowed me, she allowed me much in the dark. She thought, poor thing, that I should come next day to make her an offer (I was looked upon as a good match, too). But I didn't say a word to her for five months. I used to see her in a corner at dances (we were always having dances), her eyes watching me. I saw how they glowed with fire- a fire of gentle indignation. This game only tickled that insect lust I cherished in my soul. Five months later she married an official and left the town, still angry, and still, perhaps, in love with me. Now they live happily. Observe that I told no one. I didn't boast of it. 
Though I'm full of low desires, and love what's low, I'm not dishonourable. You're blushing; your eyes flashed. Enough of this filth with you. And all this was nothing much- wayside blossoms a la Paul de Kock- though the cruel insect had already grown strong in my soul. I've a perfect album of reminiscences, brother. God bless them, the darlings. I tried to break it off without quarrelling. And I never gave them away, I never bragged of one of them. But that's enough. You can't suppose I brought you here simply to talk of such nonsense. No, I'm going to tell you something more curious; and don't be surprised that I'm glad to tell you, instead of being ashamed."</p>
@ -786,13 +786,6 @@
<p>"I should have lost the game, of course. She'd have run away. But it would have been an infernal revenge. It would have been worth it all. I'd have howled with regret all the rest of my life, only to have played that trick. Would you believe it, it has never happened to me with any other woman, not one, to look at her at such a moment with hatred. But, on my oath, I looked at her for three seconds, or five perhaps, with fearful hatred- that hate which is only a hair's-breadth from love, from the maddest love!</p>
<p>"I went to the window, put my forehead against the frozen pane, and I remember the ice burnt my forehead like fire. I did not keep her long, don't be afraid. I turned round, went up to the table, opened the drawer and took out a banknote for five thousand roubles (it was lying in a French dictionary). Then I showed it her in silence, folded it, handed it to her, opened the door into the passage, and, stepping back, made her a deep bow. a most respectful, a most impressive bow, believe me! She shuddered all over, gazed at me for a second, turned horribly pale-white as a sheet, in fact- and all at once, not impetuously but softly, gently, bowed down to my feet- not a boarding-school curtsey, but a Russian bow, with her forehead to the floor. She jumped up and ran away. I was wearing my sword. I drew it and nearly stabbed myself with it on the spot; why, I don't know. It would have been frightfully stupid, of course. I suppose it was from delight. Can you understand that one might kill oneself from delight? But I didn't stab myself. I only kissed my sword and put it back in the scabbard- which there was no need to have told you, by the way. And I fancy that in telling you about my inner conflict I have laid it on rather thick to glorify myself. But let it pass, and to hell with all who pry into the human heart! Well, so much for that 'adventure' with Katerina Ivanovna. So now Ivan knows of it, and you- no one else."</p>
<p>Dmitri got up, took a step or two in his excitement, pulled out his handkerchief and mopped his forehead, then sat down again, not in the same place as before, but on the opposite side, so that Alyosha had to turn quite round to face him.</p>
<p />
</section>
<section>
<title>
<p>Chapter 5. The Confession of a Passionate Heart- "Heels Up"</p>
</title>
<p />
<p>"NOW," said Alyosha, "I understand the first half."</p>
<p>"You understand the first half. That half is a drama, and it was played out there. The second half is a tragedy, and it is being acted here."</p>
<p>"And I understand nothing of that second half so far," said Alyosha.</p>
@ -865,7 +858,6 @@
<p>"I'll go, Mitya. I believe that God will order things for the best, that nothing awful may happen."</p>
<p>"And I will sit and wait for the miracle. And if it doesn't come to pass- "</p>
<p>Alyosha went thoughtfully towards his father's house.</p>
<p />
</section>
<section>
<title>
@ -5977,7 +5969,7 @@
</section>
<section>
<title>
<p>Chapter 2. Dangerous Witnesses</p>
<p>Chapter 2 and 3. Dangerous Witnesses</p>
</title>
<p />
<p>I DO NOT know whether the witnesses for the defence and for the prosecution were separated into groups by the President, and whether it was arranged to call them in a certain order. But no doubt it was so. I only know that the witnesses for the prosecution were called first. I repeat I don't intend to describe all the questions step by step. Besides, my account would be to some extent superfluous, because in the speeches for the prosecution and for the defence the whole course of the evidence was brought together and set in a strong and significant light, and I took down parts of those two remarkable speeches in full, and will quote them in due course, together with one extraordinary and quite unexpected episode, which occurred before the final speeches, and undoubtedly influenced the sinister and fatal outcome of the trial.</p>
@ -6043,13 +6035,6 @@
<p>When Fetyukovitch had to cross-examine him, he scarcely tried to refute his evidence, but began asking him about an incident at the first carousal at Mokroe, a month before the arrest, when Timofey and another peasant called Akim had picked up on the floor in the passage a hundred roubles dropped by Mitya when he was drunk, and had given them to Trifon Borissovitch and received a rouble each from him for doing so. "Well," asked the lawyer," did you give that hundred roubles back to Mr. Karamazov?" Trifon Borissovitch shuffled in vain.... He was obliged, after the peasants had been examined, to admit the finding of the hundred roubles, only adding that he had religiously returned it all to Dmitri Fyodorovitch "in perfect honesty, and it's only because his honour was in liquor at the time, he wouldn't remember it." But, as he had denied the incident of the hundred roubles till the peasants had been called to prove it, his evidence as to returning the money to Mitya was naturally regarded with great suspicion. So one of the most dangerous witnesses brought forward by the prosecution was again discredited.</p>
<p>The same thing happened with the Poles. They took up an attitude of pride and independence; they vociferated loudly that they had both been in the service of the Crown, and that "Pan Mitya" had offered them three thousand "to buy their honour," and that they had seen a large sum of money in his hands. Pan Mussyalovitch introduced a terrible number of Polish words into his sentences, and seeing that this only increased his consequence in the eyes of the President and the prosecutor, grew more and more pompous, and ended by talking in Polish altogether. But Fetyukovitch caught them, too, in his snares. Trifon Borissovitch, recalled, was forced, in spite of his evasions, to admit that Pan Vrublevsky had substituted another pack of cards for the one he had provided, and that Pan Mussyalovitch had cheated during the game. Kalgonov confirmed this, and both the Poles left the witness-box with damaged reputations, amidst laughter from the public.</p>
<p>Then exactly the same thing happened with almost all the most dangerous witnesses. Fetyukovitch succeeded in casting a slur on all of them, and dismissing them with a certain derision. The lawyers and experts were lost in admiration, and were only at a loss to understand what good purpose could be served by it, for all, I repeat, felt that the case for the prosecution could not be refuted, but was growing more and more tragically overwhelming. But from the confidence of the "great magician" they saw that he was serene, and they waited, feeling that "such a man" had not come from Petersburg for nothing, and that he was not a man to return unsuccessful.</p>
<p />
</section>
<section>
<title>
<p>Chapter 3. The Medical Experts and a Pound of Nuts</p>
</title>
<p />
<p>THE evidence of the medical experts, too, was of little use to the prisoner. And it appeared later that Fetyukovitch had not reckoned much upon it. The medical line of defence had only been taken up through the insistence of Katerina Ivanovna, who had sent for a celebrated doctor from Moscow on purpose. The case for the defence could, of course, lose nothing by it and might, with luck, gain something from it. There was, however, an element of comedy about it, through the difference of opinion of the doctors. The medical experts were the famous doctor from Moscow, our doctor, Herzenstube, and the young doctor, Varvinsky. The two latter appeared also as witnesses for the prosecution.</p>
<p>The first to be called in the capacity of expert was Doctor Herzenstube. He was a grey and bald old man of seventy, of middle height and sturdy build. He was much esteemed and respected by everyone in the town. He was a conscientious doctor and an excellent and pious man, a Hernguter or Moravian brother, I am not quite sure which. He had been living amongst us for many years and behaved with wonderful dignity. He was a kind-hearted and humane man. He treated the sick poor and peasants for nothing, visited them in their slums and huts, and left money for medicine, but he was as obstinate as a mule. If once he had taken an idea into his head, there was no shaking it. Almost everyone in the town was aware, by the way, that the famous doctor had, within the first two or three days of his presence among us, uttered some extremely offensive allusions to Doctor Herzenstube's qualifications. Though the Moscow doctor asked twenty-five roubles for a visit, several people in the town were glad to take advantage of his arrival, and rushed to consult him regardless of expense. All these had, of course, been previously patients of Doctor Herzenstube, and the celebrated doctor had criticised his treatment with extreme harshness. Finally, he had asked the patients as soon as he saw them, "Well, who has been cramming you with nostrums? Herzenstube? He he!" Doctor Herzenstube, of course, heard all this, and now all the three doctors made their appearance, one after another, to be examined.</p>
<p>Doctor Herzenstube roundly declared that the abnormality of the prisoner's mental faculties was self-evident. Then giving his grounds for this opinion, which I omit here, he added that the abnormality was not only evident in many of the prisoner's actions in the past, but was apparent even now at this very moment. When he was asked to explain how it was apparent now at this moment, the old doctor, with simple-hearted directness, pointed out that the prisoner had "an extraordinary air, remarkable in the circumstances"; that he had "marched in like a soldier, looking straight before him, though it would have been more natural for him to look to the left where, among the public, the ladies were sitting, seeing that he was a great admirer of the fair sex and must be thinking much of what the ladies are saying of him now," the old man concluded in his peculiar language.</p>

File diff suppressed because one or more lines are too long

@ -21,7 +21,7 @@ def add_book_to_db(book_code, book_dict):
# returns the last row id, if row added to the table successfully
last_rowid = add_book_row_to_table(db_cursor, const.BOOK_INSERT_QUERY, book_row)
book_id = last_rowid
print('Book Row Id :: ', last_rowid)
# print('Book Row Id :: ', last_rowid)
book_info_dict = book_dict['bookInfo']
if last_rowid > 0:
@ -38,7 +38,7 @@ def add_book_to_db(book_code, book_dict):
# returns the last row id, if row added to the table successfully
last_rowid = add_book_info_row_to_table(db_cursor, const.BOOK_INFO_INSERT_QUERY, book_info_row)
print('Book Info Row Id :: ', last_rowid)
# print('Book Info Row Id :: ', last_rowid)
if last_rowid > 0:
book_info_id = last_rowid
@ -50,18 +50,18 @@ def add_book_to_db(book_code, book_dict):
'total_books': 1
}
author_row = search_author(db_cursor, const.AUTHOR_SEARCH_QUERY, author_row)
print('Author Search Result :: ', author_row)
# print('Author Search Result :: ', author_row)
if author_row['id'] > 0:
author_row['total_books'] = author_row['total_books'] + 1
last_rowid = update_author_book_count(db_cursor, const.AUTHOR_UPDATE_QUERY, author_row)
print('Author Update Row count :: ', last_rowid)
# print('Author Update Row count :: ', last_rowid)
if last_rowid <= 0:
break
else:
author_row['name'] = author['name']
author_row['total_books'] = 1
last_rowid = add_author_to_table(db_cursor, const.AUTHOR_INSERT_QUERY, author_row)
print('Add Author Row Id :: ', last_rowid)
# print('Add Author Row Id :: ', last_rowid)
if last_rowid > 0:
author_row['id'] = last_rowid
@ -76,7 +76,7 @@ def add_book_to_db(book_code, book_dict):
}
last_rowid = add_author_book_mapping(db_cursor, const.BOOK_AUTHOR_INSERT_QUERY, map_author_book)
print('Author Book Mapping Row ID :: ', last_rowid)
# print('Author Book Mapping Row ID :: ', last_rowid)
if last_rowid < 0:
break
@ -87,7 +87,7 @@ def add_book_to_db(book_code, book_dict):
# returns the last row id, if row added to the table successfully
last_rowid = add_book_content_row_to_table(db_cursor, const.CONTENT_INSERT_QUERY, book_content_row)
print('Book Content Row Id :: ', last_rowid)
# print('Book Content Row Id :: ', last_rowid)
if last_rowid > 0:
content_id = last_rowid
@ -99,7 +99,7 @@ def add_book_to_db(book_code, book_dict):
'book_content': content_id
}
chapter_id = add_book_chapter_to_table(db_cursor, const.CHAPTER_INSERT_QUERY, book_chapter_row)
print('Book Chapter Row Id :: ', chapter_id)
# print('Book Chapter Row Id :: ', chapter_id)
if chapter_id > 0:
sentences_dict = chapter['sentences']
for s_num in sentences_dict.keys():
@ -109,7 +109,7 @@ def add_book_to_db(book_code, book_dict):
'chapter': chapter_id
}
sen_id = add_book_sentence_to_table(db_cursor, const.SENTENCE_INSERT_QUERY, sentence_row)
print('Book Sentence Id :: ', sen_id)
# print('Book Sentence Id :: ', sen_id)
if sen_id <= 0:
break
else:

@ -0,0 +1,187 @@
from lxml import etree
import os
import utils.constants as const
import unicodedata as unicode
namespaces = {'xmlns': "http://www.gribuser.ru/xml/fictionbook/2.0"}
def read_fb2_file(file_name, book_code, source, encoding):
    """Parse one FB2 file from the project's ``data`` directory.

    Args:
        file_name: Name of the FB2 file, relative to ``<project>/data/``.
        book_code: Book code stored in the returned metadata.
        source: Source of the book, stored in the returned metadata.
        encoding: Accepted for the caller's convenience; not used while
            parsing in this function.

    Returns:
        A dict with two keys: ``'metadata'`` (the book info dict, with
        ``'totalChapters'`` set to the number of parsed chapters) and
        ``'content'`` (the list of chapter dicts).
    """
    project_root = os.path.dirname(os.path.dirname(__file__))
    full_file_path = project_root + '/data/' + file_name
    print(const.BLUE, 'Reading :: ', full_file_path, const.END)

    root = etree.parse(full_file_path).getroot()

    metadata = create_book_info_dict(root, book_code, source)
    chapters = create_book_content(root, metadata['lang'])

    # The chapter count is only known once the body has been walked.
    metadata['totalChapters'] = len(chapters)

    return {'metadata': metadata, 'content': chapters}
def create_book_content(root, lang):
    """Build the ordered list of chapters found in the FB2 ``<body>``.

    A top-level section that contains nested sections contributes one
    chapter per nested section, named ``"<section title> - <sub title>"``.
    A top-level section that only contains paragraphs contributes a single
    chapter. Sections with neither are skipped, as are chapters whose title
    or text content comes back empty.

    Args:
        root: Root element of the parsed FB2 document.
        lang: Language code forwarded to ``get_section_title``.

    Returns:
        A list of dicts with keys ``'chapter_num'`` (1-based, consecutive),
        ``'chapter_name'`` and ``'text_content'``; empty when the document
        has no ``<body>``.
    """
    chapters = []

    body = root.find('xmlns:body', namespaces)
    if body is None:
        return chapters

    for section in body.findall('xmlns:section', namespaces):
        nested_sections = section.findall('xmlns:section', namespaces)
        own_paragraphs = section.findall('xmlns:p', namespaces)

        if nested_sections:
            # Nested sections are the chapters; the parent title is a prefix.
            name_prefix = get_section_title(section, lang, False).strip() + ' - '
            chapter_nodes = nested_sections
        elif own_paragraphs:
            # The section itself is the chapter.
            name_prefix = ''
            chapter_nodes = [section]
        else:
            # Header-only section without any content.
            continue

        for node in chapter_nodes:
            title = get_section_title(node, lang, True)
            if not title:
                continue

            text = get_chapter_content_from_section(node)
            if not text:
                continue

            chapters.append({
                # Numbering follows the order in which chapters are kept.
                'chapter_num': len(chapters) + 1,
                'chapter_name': name_prefix + title.strip(),
                'text_content': text
            })

    return chapters
def get_chapter_content_from_section(section):
    """Concatenate the text of every direct ``<p>`` child of a section.

    Each paragraph's text is stripped and appended with a single leading
    space; the result is NFKD-normalized.

    Args:
        section: A section element of the parsed FB2 document.

    Returns:
        The normalized chapter text (possibly an empty string when all
        paragraphs are empty), or ``None`` when the section has no direct
        ``<p>`` children.
    """
    paragraphs = section.findall('xmlns:p', namespaces)
    if len(paragraphs) == 0:
        return None

    pieces = []
    for paragraph in paragraphs:
        # ``paragraph.text`` only holds the text before the first child
        # element, so anything inside or after inline markup (emphasis,
        # strong, links) would be lost. ``itertext()`` walks the whole
        # paragraph, including child text and tails.
        paragraph_text = ''.join(paragraph.itertext())
        if not paragraph_text:
            continue
        pieces.append(' ' + paragraph_text.strip())

    # Join once instead of growing a string inside the loop.
    return unicode.normalize("NFKD", ''.join(pieces))
def get_section_title(section, lang, is_chapter):
    """Return the title of a section, assembled from its ``<title>/<p>`` texts.

    Every non-empty title paragraph is stripped and appended with a leading
    space, so a non-empty result always starts with a space.

    Args:
        section: A section element of the parsed FB2 document.
        lang: Language code used to look up ``const.CHAPTER_NAMES``.
        is_chapter: When true, a title of at most five characters (counting
            the leading space) is prefixed with the localized chapter word
            followed by a space.

    Returns:
        The assembled title string; empty when there is no title text and
        ``is_chapter`` is false.
    """
    title_nodes = section.findall('xmlns:title/xmlns:p', namespaces)
    title = ''.join(
        ' ' + node.text.strip()
        for node in title_nodes
        if node.text is not None
    )

    # Very short titles (e.g. a bare number) get the chapter word in front.
    if is_chapter and len(title) <= 5:
        return const.CHAPTER_NAMES[lang] + ' ' + title

    return title
def create_book_info_dict(root, book_code, book_source):
    """Collect book metadata from the FB2 ``<description>/<title-info>`` element.

    Args:
        root: Root element of the parsed FB2 document.
        book_code: Book code stored under ``'book_code'``.
        book_source: Source of the book stored under ``'source'``.

    Returns:
        An empty dict when the document has no ``<title-info>``. Otherwise a
        dict with ``'book_code'``, ``'title'`` (only when ``<book-title>``
        exists), ``'lang'``, ``'isTranslation'`` (``'true'``/``'false'``),
        ``'totalChapters'`` (initialised to 0), ``'source'`` and
        ``'authors'`` (a list of ``{'name': ...}`` dicts; translators also
        carry ``'translator': 'true'``).

    Raises:
        AttributeError: If ``<title-info>`` has no ``<lang>`` element.
    """
    book_info_dict = {}

    title_info = root.find('xmlns:description/xmlns:title-info', namespaces)
    if title_info is None:
        return book_info_dict

    book_info_dict['book_code'] = book_code

    book_title = title_info.find('xmlns:book-title', namespaces)
    if book_title is not None:
        book_info_dict['title'] = book_title.text

    book_info_dict['lang'] = title_info.find('xmlns:lang', namespaces).text

    # A book counts as a translation only when a source language is declared
    # and it differs from the book's own language.
    src_lang = title_info.find('xmlns:src-lang', namespaces)
    is_translation = src_lang is not None and src_lang.text != book_info_dict['lang']
    book_info_dict['isTranslation'] = 'true' if is_translation else 'false'

    book_info_dict['totalChapters'] = 0
    book_info_dict['source'] = book_source
    book_info_dict['authors'] = []

    # Guard against files without an <author> element instead of passing
    # None on to get_author_name.
    author = title_info.find('xmlns:author', namespaces)
    if author is not None:
        author_name = get_author_name(author)
        if author_name:
            book_info_dict['authors'].append({'name': author_name})

    for translator in title_info.findall('xmlns:translator', namespaces):
        translator_name = get_author_name(translator)
        if translator_name:
            book_info_dict['authors'].append({'name': translator_name, 'translator': 'true'})

    return book_info_dict
def get_author_name(author_root):
    """Build a display name from an FB2 author/translator element.

    The ``<first-name>``, ``<middle-name>`` and ``<last-name>`` children are
    read in that order; each present, non-blank part is stripped and the
    parts are joined with single spaces, so a missing or blank part never
    leaves a leading or doubled space in the result.

    Args:
        author_root: The ``<author>`` or ``<translator>`` element, or
            ``None`` when the document has no such element.

    Returns:
        The assembled name, or an empty string when ``author_root`` is
        ``None`` or carries no usable name parts.
    """
    if author_root is None:
        return ''

    name_parts = []
    for tag in ('first-name', 'middle-name', 'last-name'):
        element = author_root.find('xmlns:' + tag, namespaces)
        if element is None or element.text is None:
            continue
        text = element.text.strip()
        if text:
            name_parts.append(text)

    return ' '.join(name_parts)

@ -6,7 +6,9 @@ import utils.constants as const
import utils.env_utils as env
import xml_parser.create_xml as create_xml
import txt_parser.csv_utils as csv_utils
from csv2df import get_book_content, get_book_metadata
import fb2_parser.read_fb2 as read_fb2
import aligner.bitext_align as aligner
import time
def validate_all_xml_files():
@ -46,51 +48,48 @@ def read_data_files_and_align_sentences(book_code):
if book_code in books_dict:
book_code_list = books_dict[book_code]
full_book_dict = {}
book_lang_list = []
for book in book_code_list:
book_lines = csv_utils.read_data_file(book[3].strip())
# TODO (for Jassi) :: Take this 'book_lines' and return dictionary after parsing chapters
# TODO :: Please Follow the below Dictionary Structure, ==
# Later Isaac will use this dict structure to align sentences
# book_dict = {
# 'meta_data': {
# "book_id": "",
# "title": "",
# "lang": "",
# "isTranslation": "",
# "totalChapters": "",
# "authors": [
# {
# "name": "",
# "translator": ""
# },
# {
# "name": ""
# }
# ],
# "description": "", # Optional
# "source": ""
# },
# 'content' : [
# {
# 'chapter_num': '',
# 'chapter_name': '',
# 'text_content': ''
# },
# {
# 'chapter_num': '',
# 'chapter_name': '',
# 'text_content': ''
# }
# ]
# }
def create_xml_file(book_content_dict, book_metadata_dict):
create_xml.create_xml_file(book_content_dict, book_metadata_dict)
book_dict = read_fb2.read_fb2_file(book[3].strip(), book_code, book[4], book[5])
full_book_dict[book[2]] = book_dict
book_lang_list.append(book[2])
book1 = full_book_dict[book_lang_list[0]]
book2 = full_book_dict[book_lang_list[1]]
book1_lang = book1['metadata']['lang']
book1_chapters = book1['content']
book2_lang = book2['metadata']['lang']
book2_chapters = book2['content']
print(const.BLUE, 'Total Chapters :: ', book1['metadata']['totalChapters'])
for idx, book1_chapter in enumerate(book1_chapters):
book2_chapter = book2_chapters[idx]
if book1_chapter['chapter_num'] == book2_chapter['chapter_num']:
book1_sen, book2_sen = aligner.master_align(book1_chapter['text_content'], book2_chapter['text_content'], book1_lang, book2_lang)
print(const.GREEN, 'Chapter :', book1_chapter['chapter_num'], '-> Sentence Alignment Done', const.END)
book1_chapter.pop('text_content')
book2_chapter.pop('text_content')
book1_chapter.update({'sentences': book1_sen})
book2_chapter.update({'sentences': book2_sen})
book1_chapters[idx] = book1_chapter
book2_chapters[idx] = book2_chapter
time.sleep(60)
if idx == 1:
break
print(const.BLUE, 'Book Sentence Alignment Done', const.END)
create_xml_file(book1_chapters, book1['metadata'])
create_xml_file(book2_chapters, book2['metadata'])
def create_xml_file(book_content, book_metadata_dict):
create_xml.create_xml_file(book_content, book_metadata_dict)
if env.check_env_variables():
read_data_files_and_align_sentences('dost_cap')
# validate_all_xml_files()
# save_validated_files_to_db()
read_data_files_and_align_sentences('dost_cap_ende')
validate_all_xml_files()
save_validated_files_to_db()

@ -1,6 +0,0 @@
1,1,"Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo."
1,2,"Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt."
1,3,"Neque porro quisquam est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et dolore magnam aliquam quaerat voluptatem."
2,1,"Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur?"
2,2,"Quis autem vel eum iure reprehenderit qui in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla pariatur?"
2,3,"Neque porro quisquam est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et dolore magnam aliquam quaerat voluptatem."
1 1 1 Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo.
2 1 2 Nemo enim ipsam voluptatem quia voluptas sit aspernatur aut odit aut fugit, sed quia consequuntur magni dolores eos qui ratione voluptatem sequi nesciunt.
3 1 3 Neque porro quisquam est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et dolore magnam aliquam quaerat voluptatem.
4 2 1 Ut enim ad minima veniam, quis nostrum exercitationem ullam corporis suscipit laboriosam, nisi ut aliquid ex ea commodi consequatur?
5 2 2 Quis autem vel eum iure reprehenderit qui in ea voluptate velit esse quam nihil molestiae consequatur, vel illum qui dolorem eum fugiat quo voluptas nulla pariatur?
6 2 3 Neque porro quisquam est, qui dolorem ipsum quia dolor sit amet, consectetur, adipisci velit, sed quia non numquam eius modi tempora incidunt ut labore et dolore magnam aliquam quaerat voluptatem.

@ -17,3 +17,21 @@ END = '\033[0m'
BLUE = '\033[94m'
GREEN = '\033[92m'
# Word for "chapter" in each supported language, keyed by language code.
# Both lower- and upper-case codes are listed.
# NOTE(review): 'EN' maps to the upper-case 'CHAPTER' while 'DE'/'RU' reuse
# the same casing as their lower-case keys — confirm this is intentional.
CHAPTER_NAMES = {
'en': 'Chapter',
'EN': 'CHAPTER',
'de': 'Kapitel',
'DE': 'Kapitel',
'ru': 'Глава',
'RU': 'Глава'
}
# English name of each supported language, keyed by the same language codes.
LANGUAGE_NAME = {
'en': 'english',
'EN': 'english',
'de': 'german',
'DE': 'german',
'ru': 'russian',
'RU': 'russian'
}

@ -6,9 +6,9 @@ import utils.json_utils as json_utils
import utils.constants as const
def create_xml_file(book_dict, book_metadata):
def create_xml_file(book_content, book_metadata):
book_root = ET.Element('book')
book_root.set('code', book_metadata['book_id'])
book_root.set('code', book_metadata['book_code'])
book_info = ET.SubElement(book_root, 'bookInfo')
content = ET.SubElement(book_root, 'content')
@ -23,7 +23,7 @@ def create_xml_file(book_dict, book_metadata):
is_translation.text = book_metadata['isTranslation']
total_chapters = ET.SubElement(book_info, 'totalChapters')
total_chapters.text = book_metadata['totalChapters']
total_chapters.text = str(book_metadata['totalChapters'])
source = ET.SubElement(book_info, 'source')
source.text = book_metadata['source']
@ -43,13 +43,17 @@ def create_xml_file(book_dict, book_metadata):
if 'translator' in auth:
author.set('translator', auth['translator'])
for key in book_dict.keys():
chapter = ET.SubElement(content, 'chapter')
chapter.set('num', str(key))
for idx, val in enumerate(book_dict[key]):
sentence = ET.SubElement(chapter, 'sentence')
sentence.set('num', str(idx + 1))
sentence.text = val
for chapter in book_content:
if 'sentences' not in chapter:
continue
chapter_element = ET.SubElement(content, 'chapter')
chapter_element.set('num', str(chapter['chapter_num']))
chapter_element.set('name', chapter['chapter_name'])
sentences_dict = chapter['sentences']
for key in sentences_dict.keys():
sentence = ET.SubElement(chapter_element, 'sentence')
sentence.set('num', str(key + 1))
sentence.text = sentences_dict[key]
# tree = ET.ElementTree(book_root)
# tree.write(filename)
@ -86,6 +90,7 @@ def add_xml_book_data_to_json(book_code, json_obj):
json_data['books'] = books
json_utils.write_json_file(const.JSON_PATH, json_data)
print(const.BLUE, 'Added XML Book Entry to JSON', const.END)
def prettify(root):

Loading…
Cancel
Save