Notes from the Underground Done,

Added index.html and the aligned HTML file
master
Pavan Mandava 6 years ago
parent b0a1359401
commit d8d3b6af11

@ -5,12 +5,14 @@ from itertools import product as cp
import numpy as np
import pandas as pd
import re
from google.cloud import translate_v2 as translate
from jellyfish import levenshtein_distance as lev
import nltk
import utils.constants as const
nltk.download('punkt')
translate_client = translate.Client()
'''
@ -20,8 +22,8 @@ translate_client = translate.Client()
def master_align(text0, text1, lang0, lang1):
""" Takes two equivalent texts (original and translation) and returns
aligned texts. """
text0 = re.sub(' ?. . . ?| … ?| ?... ?', '', text0)
text1 = re.sub(' ?. . . ?| … ?| ?... ?', '', text1)
text0 = re.sub(' ?\\. \\. \\. ?| \\.\\.\\. ?| ?\\.\\.\\. ?', '', text0)
text1 = re.sub(' ?\\. \\. \\. ?| \\.\\.\\. ?| ?\\.\\.\\. ?', '', text1)
df0 = frame_from_text(text0, lang0, lang1)
# print('A')
df1 = frame_from_text(text1, lang1, lang0, is1=True)

@ -1,50 +0,0 @@
from collections import OrderedDict
import os
import pandas as pd
def get_book_content():
    """Load the bundled test CSV and group its texts by chapter.

    The file ``test_example.csv`` is looked up in the directory of this
    module. It is read without a header row and its first three columns are
    named ``chapter``, ``sentence`` and ``text``.

    Returns:
        OrderedDict mapping each chapter value to the list of its ``text``
        values. Chapters and texts keep the order in which they appear in
        the file.
    """
    module_dir = os.path.dirname(os.path.realpath(__file__))
    csv_path = os.path.join(module_dir, 'test_example.csv')
    print('Test CSV File :: ', csv_path)
    df = pd.read_csv(csv_path, header=None).rename(
        columns={0: 'chapter', 1: 'sentence', 2: 'text'})
    book_dict = OrderedDict()
    for _, row in df.iterrows():
        # The sentence id is not used; texts are appended in file order.
        book_dict.setdefault(row['chapter'], []).append(row['text'])
    return book_dict
def get_book_metadata():
    """Return hard-coded sample metadata for the test book.

    Returns:
        dict with the keys ``book_id``, ``title``, ``lang``,
        ``isTranslation``, ``totalChapters``, ``authors``, ``description``
        and ``source``. Flags and counts are stored as strings, and a new
        dict is built on every call.
    """
    translator = {"name": "Herr Isaac Riley", "translator": "true"}
    original_author = {"name": "Fyodor Dostoevsky"}
    summary = (
        "Crime and Punishment (Russian: Преступление и наказание) is a novel written by Russian author "
        "Fyodor Dostoevsky.First published in a journal named The Russian Messenger, it appeared in "
        "twelve monthly installments in 1866, and was later published as a novel"
    )
    return {
        "book_id": "fdcap_book",
        "title": "Crime and Punishment",
        "lang": "en",
        "isTranslation": "true",
        "totalChapters": "2",
        "authors": [translator, original_author],
        "description": summary,
        "source": "https://en.wikisource.org/wiki/Crime_and_Punishment",
    }

@ -0,0 +1,77 @@
<!DOCTYPE html>
<!-- Landing page listing the aligned book pairs, each with a link to its rendered HTML view. -->
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Parallel Corpus Creation</title>
    <style>
        /* Centering and table spacing are done here instead of through the
           obsolete align / cellspacing / cellpadding attributes. */
        h2 {
            text-align: center;
        }

        #main table {
            margin: 0 auto;
            border-spacing: 20px;
        }

        /* Anchor styled to look like a push button. */
        a.button {
            -webkit-appearance: button;
            -moz-appearance: button;
            appearance: button;
            text-decoration: none;
            background-color: #ECB142;
            border-radius: 2px;
            border: 1px solid #ECD9CF;
            color: white;
            padding: 4px 4px;
            text-align: center;
            font-size: 16px;
            font-weight: bold;
            cursor: pointer;
        }

        a.button:hover {
            background-color: #E87131;
        }

        table tr td {
            padding: 4px;
            font-size: 17px;
            font-weight: bold;
        }

        /* First column holds the book pair title, last column the button. */
        table tr td:first-child {
            text-align: left;
        }

        table tr td:last-child {
            text-align: center;
        }
    </style>
</head>
<body>
    <!-- TODO: replace the placeholder heading text. -->
    <h2>Main Heading comes here.....</h2>
    <br />
    <br />
    <div id="main">
        <!-- NOTE(review): every row except the German "Notes from Underground" one links to
             xslt/action.html, presumably a placeholder until those pairs are generated; confirm. -->
        <table>
            <tr>
                <td>Crime And Punishment(EN) - Verbrechen und Strafe(DE)</td>
                <td><a class="button" href="xslt/action.html">View HTML</a></td>
            </tr>
            <tr>
                <td>Crime And Punishment(EN) - Преступление и наказание(RU)</td>
                <td><a class="button" href="xslt/action.html">View HTML</a></td>
            </tr>
            <tr>
                <td>The Gambler(EN) - Der Spieler(DE)</td>
                <td><a class="button" href="xslt/action.html">View HTML</a></td>
            </tr>
            <tr>
                <td>The Gambler(EN) - Игрок(RU)</td>
                <td><a class="button" href="xslt/action.html">View HTML</a></td>
            </tr>
            <tr>
                <td>Notes from Underground(EN) - Aufzeichnungen aus dem Kellerloch(DE)</td>
                <td><a class="button" href="xslt/dost_under_ende.html">View HTML</a></td>
            </tr>
            <tr>
                <td>Notes from Underground(EN) - Записки из подполья(RU)</td>
                <td><a class="button" href="xslt/action.html">View HTML</a></td>
            </tr>
        </table>
    </div>
</body>
</html>

@ -0,0 +1,20 @@
{
"books": {
"dost_under_ende": [
{
"xml_file": "dost_under_ende_en.xml",
"lang": "en",
"xml_file_path": "/Users/pavanmandava/PythonWorkspace/bitext-aligner/xml_files/dost_under_ende_en.xml",
"is_validated": true,
"is_saved_to_db": false
},
{
"xml_file": "dost_under_ende_de.xml",
"lang": "de",
"xml_file_path": "/Users/pavanmandava/PythonWorkspace/bitext-aligner/xml_files/dost_under_ende_de.xml",
"is_validated": true,
"is_saved_to_db": false
}
]
}
}

@ -76,20 +76,22 @@ def read_data_files_and_align_sentences(book_code):
book1_chapters[idx] = book1_chapter
book2_chapters[idx] = book2_chapter
time.sleep(10)
if idx == 1:
break
print(const.BLUE, 'Book Sentence Alignment Done', const.END)
create_xml_file(book1_chapters, book1['metadata'])
create_xml_file(book2_chapters, book2['metadata'])
else:
print(const.WARNING, 'Unknown Book Code :: ', book_code, const.END)
print(const.BLUE, 'Please provide the BookCode from books_data.csv', const.END)
def create_xml_file(book_content, book_metadata_dict):
    """Delegate to ``create_xml.create_xml_file`` with the same two arguments.

    Both arguments are forwarded unchanged. Whatever the delegate returns is
    discarded, so this function always returns ``None``.

    Args:
        book_content: Content of one book, passed through as-is.
        book_metadata_dict: Metadata of the same book, passed through as-is.
    """
    create_xml.create_xml_file(book_content, book_metadata_dict)
if env.check_env_variables():
read_data_files_and_align_sentences('dost_cap_ende')
# read_data_files_and_align_sentences('dost_under_ende')
validate_all_xml_files()
# save_validated_files_to_db()

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

@ -64,7 +64,7 @@ def create_xml_file(book_content, book_metadata):
file_path = file.name
file.write(prettify(book_root))
file.close()
print(const.BLUE, 'Saved XML File Path :: ', file_path, const.END)
print(const.BLUE, 'Saved Book Content to XML File - Path :: ', file_path, const.END)
json_obj = {}
book_code = book_root.get('code')
json_obj['xml_file'] = filename

@ -1,14 +1,12 @@
from csv2df import get_book_content, get_book_metadata
import xml_parser.create_xml as create_xml
import xml_parser.read_xml as read_xml
import xml_parser.validate as validate
file_path = create_xml.create_xml_file(get_book_content(), get_book_metadata())
# file_path = create_xml.create_xml_file({},{})
# print(file_path)
validate.validate_all_xml_files()
# book_dict = read_xml.parse_xml_file('/Users/pavanmandava/PythonWorkspace/bitext-aligner/xml_files/abcdef_en.xml')

@ -1,8 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:variable name="book1" select="document('../xml_files/dost_cap_ende_en.xml')/*" />
<xsl:variable name="book2" select="document('../xml_files/dost_cap_ende_de.xml')/*" />
<xsl:variable name="book1" select="document('../xml_files/dost_under_ende_en.xml')/*" />
<xsl:variable name="book2" select="document('../xml_files/dost_under_ende_de.xml')/*" />
<xsl:template match="/">
<html>

File diff suppressed because it is too large Load Diff
Loading…
Cancel
Save