Notes from the Underground Done,

index.html Added and aligned html file added
6 years ago · d8d3b6af11
parent b0a1359401
commit d8d3b6af11
11 changed files with 8525 additions and 61 deletions
--- a/aligner/bitext_align.py
+++ b/aligner/bitext_align.py
@@ -5,12 +5,14 @@ from itertools import product as cp

 import numpy as np
 import pandas as pd
+import re
 from google.cloud import translate_v2 as translate
 from jellyfish import levenshtein_distance as lev
 import nltk
 import utils.constants as const
 nltk.download('punkt')

+
 translate_client = translate.Client()

 '''
@@ -20,8 +22,8 @@ translate_client = translate.Client()
 def master_align(text0, text1, lang0, lang1): 
    """ Takes two equivalent texts (original and trnslation) and returns 
        aligned texts. """
-    text0 = re.sub(' ?. . . ?| … ?| ?... ?', '… ', text0)
-    text1 = re.sub(' ?. . . ?| … ?| ?... ?', '… ', text1)
+    text0 = re.sub(' ?\\. \\. \\. ?| \\.\\.\\. ?| ?\\.\\.\\. ?', '… ', text0)
+    text1 = re.sub(' ?\\. \\. \\. ?| \\.\\.\\. ?| ?\\.\\.\\. ?', '… ', text1)
    df0 = frame_from_text(text0, lang0, lang1)
    # print('A')
    df1 = frame_from_text(text1, lang1, lang0, is1=True)
--- a/csv2df.py
+++ b/csv2df.py
@@ -1,50 +0,0 @@
-from collections import OrderedDict
-import os
-import pandas as pd
-
-
-def get_book_content():
-    csv_path = os.path.dirname(os.path.realpath(__file__)) + '/test_example.csv'
-    print('Test CSV File :: ', csv_path)
-    df = pd.read_csv(csv_path, header=None).rename(
-        columns={0: 'chapter', 1: 'sentence', 2: 'text'})
-
-    book_dict = OrderedDict()
-
-    for index, row in df.iterrows():
-        ch_id = row['chapter']
-        s_id = row['sentence']
-        text = row['text']
-        # print(ch_id, " -> ", s_id, " -> ", text)
-
-        if ch_id not in book_dict:
-            book_dict[ch_id] = []
-        book_dict[ch_id].append(text)
-
-    return book_dict
-
-
-def get_book_metadata():
-
-    dict_metadata = {
-        "book_id": "fdcap_book",
-        "title": "Crime and Punishment",
-        "lang": "en",
-        "isTranslation": "true",
-        "totalChapters": "2",
-        "authors": [
-            {
-                "name": "Herr Isaac Riley",
-                "translator": "true"
-            },
-            {
-                "name": "Fyodor Dostoevsky"
-            }
-        ],
-        "description": "Crime and Punishment (Russian: Преступление и наказание) is a novel written by Russian author "
-                       "Fyodor Dostoevsky.First published in a journal named The Russian Messenger, it appeared in "
-                       "twelve monthly installments in 1866, and was later published as a novel",
-        "source": "https://en.wikisource.org/wiki/Crime_and_Punishment"
-    }
-
-    return dict_metadata
--- a/index.html
+++ b/index.html
@@ -0,0 +1,77 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <title>Parallel Corpus Creation</title>
+    <style>
+        a.button {
+              -webkit-appearance: button;
+              -moz-appearance: button;
+              appearance: button;
+              text-decoration: none;
+              color: initial;
+              background-color: #ECB142;
+              border-radius: 2px;
+              border: 1px solid #ECD9CF;
+              color: white;
+              padding: 4px 4px;
+              text-align: center;
+              font-size: 16px;
+              font-weight: bold;
+              cursor: pointer;
+        }
+        a.button:hover {
+              background-color: #E87131;
+        }
+        table tr td {
+              font-size: 17px;
+              font-weight: bold;
+        }
+    </style>
+</head>
+<body>
+
+    <h2 align="center">Main Heading comes here.....</h2>
+    <br />
+    <br />
+
+    <div id="main" align="center">
+
+        <table width="auto" cellspacing="20" cellpadding="4">
+
+            <tr>
+                <td width="auto" align="left">Crime And Punishment(EN) - Verbrechen und Strafe(DE)</td>
+                <td align="center"><a class="button" href="xslt/action.html">View HTML</a></td>
+            </tr>
+
+            <tr>
+                <td width="auto" align="left">Crime And Punishment(EN) - Преступление и наказание(RU)</td>
+                <td align="center"><a class="button" href="xslt/action.html">View HTML</a></td>
+            </tr>
+
+            <tr>
+                <td width="auto" align="left">The Gambler(EN) - Der Spieler(DE)</td>
+                <td align="center"><a class="button" href="xslt/action.html">View HTML</a></td>
+            </tr>
+
+            <tr>
+                <td width="auto" align="left">The Gambler(EN) - Игрок(RU)</td>
+                <td align="center"><a class="button" href="xslt/action.html">View HTML</a></td>
+            </tr>
+
+            <tr>
+                <td width="auto" align="left">Notes from Underground(EN) - Aufzeichnungen aus dem Kellerloch(DE)</td>
+                <td align="center"><a class="button" href="xslt/dost_under_ende.html">View HTML</a></td>
+            </tr>
+
+            <tr>
+                <td width="auto" align="left">Notes from Underground(EN) - Записки из подполья(RU)</td>
+                <td align="center"><a class="button" href="xslt/action.html">View HTML</a></td>
+            </tr>
+
+        </table>
+
+    </div>
+
+</body>
+</html>
--- a/json/books.json
+++ b/json/books.json
@@ -0,0 +1,20 @@
+{
+    "books": {
+        "dost_under_ende": [
+            {
+                "xml_file": "dost_under_ende_en.xml",
+                "lang": "en",
+                "xml_file_path": "/Users/pavanmandava/PythonWorkspace/bitext-aligner/xml_files/dost_under_ende_en.xml",
+                "is_validated": true,
+                "is_saved_to_db": false
+            },
+            {
+                "xml_file": "dost_under_ende_de.xml",
+                "lang": "de",
+                "xml_file_path": "/Users/pavanmandava/PythonWorkspace/bitext-aligner/xml_files/dost_under_ende_de.xml",
+                "is_validated": true,
+                "is_saved_to_db": false
+            }
+        ]
+    }
+}
--- a/run.py
+++ b/run.py
@@ -76,20 +76,22 @@ def read_data_files_and_align_sentences(book_code):
                book1_chapters[idx] = book1_chapter
                book2_chapters[idx] = book2_chapter
                time.sleep(10)
-            if idx == 1:
-                break

        print(const.BLUE, 'Book Sentence Alignment Done', const.END)

        create_xml_file(book1_chapters, book1['metadata'])
        create_xml_file(book2_chapters, book2['metadata'])

+    else:
+        print(const.WARNING, 'Unknown Book Code :: ', book_code, const.END)
+        print(const.BLUE, 'Please provide the BookCode from books_data.csv', const.END)
+

 def create_xml_file(book_content, book_metadata_dict):
    create_xml.create_xml_file(book_content, book_metadata_dict)


 if env.check_env_variables():
-    read_data_files_and_align_sentences('dost_cap_ende')
+    # read_data_files_and_align_sentences('dost_under_ende')
    validate_all_xml_files()
    # save_validated_files_to_db()
--- a/xml_files/dost_under_ende_de.xml
+++ b/xml_files/dost_under_ende_de.xml
--- a/xml_files/dost_under_ende_en.xml
+++ b/xml_files/dost_under_ende_en.xml
--- a/xml_parser/create_xml.py
+++ b/xml_parser/create_xml.py
@@ -64,7 +64,7 @@ def create_xml_file(book_content, book_metadata):
    file_path = file.name
    file.write(prettify(book_root))
    file.close()
-    print(const.BLUE, 'Saved XML File Path :: ', file_path, const.END)
+    print(const.BLUE, 'Saved Book Content to XML File - Path :: ', file_path, const.END)
    json_obj = {}
    book_code = book_root.get('code')
    json_obj['xml_file'] = filename
--- a/xml_parser/test_parser.py
+++ b/xml_parser/test_parser.py
@@ -1,14 +1,12 @@
-from csv2df import get_book_content, get_book_metadata
 import xml_parser.create_xml as create_xml
 import xml_parser.read_xml as read_xml
 import xml_parser.validate as validate


-file_path = create_xml.create_xml_file(get_book_content(), get_book_metadata())
+# file_path = create_xml.create_xml_file({},{})

 # print(file_path)

 validate.validate_all_xml_files()

 # book_dict = read_xml.parse_xml_file('/Users/pavanmandava/PythonWorkspace/bitext-aligner/xml_files/abcdef_en.xml')
-
--- a/xslt/book_align.xsl
+++ b/xslt/book_align.xsl
@@ -1,8 +1,8 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">

-    <xsl:variable name="book1" select="document('../xml_files/dost_cap_ende_en.xml')/*" />
-    <xsl:variable name="book2" select="document('../xml_files/dost_cap_ende_de.xml')/*" />
+    <xsl:variable name="book1" select="document('../xml_files/dost_under_ende_en.xml')/*" />
+    <xsl:variable name="book2" select="document('../xml_files/dost_under_ende_de.xml')/*" />

 	<xsl:template match="/">
 		<html>
--- a/xslt/dost_under_ende.html
+++ b/xslt/dost_under_ende.html