XSLT for combining books done.

Added language to sentence tokenizer; removed unnecessary code.
6 years ago · fc95b9e82c
parent 4a56e14484
commit fc95b9e82c
7 changed files with 109 additions and 75 deletions
--- a/aligner/bitext_align.py
+++ b/aligner/bitext_align.py
@ -7,7 +7,9 @@ import numpy as np
 import pandas as pd
 from google.cloud import translate_v2 as translate
 from jellyfish import levenshtein_distance as lev
-from nltk import sent_tokenize
+import nltk
 import utils.constants as const
 nltk.download('punkt')
 translate_client = translate.Client()
@ -38,7 +40,7 @@ def frame_from_text(text, source, target, is1=False): #
    cols = [c+str(int(is1)) for c in ['sent','trans','rellen','relpos']]
    #print(cols)
    frame = pd.DataFrame(columns=cols)
-    frame[cols[0]] = sent_tokenize(text)
+    frame[cols[0]] = nltk.sent_tokenize(text, language=const.LANGUAGE_NAME[source])
    frame[cols[1]] = frame[cols[0]].apply(lambda x: translate_client.translate(x, source_language=source, target_language=target, model='nmt')['translatedText'])
    frame[cols[2]] = frame[cols[0]].apply(lambda x: len(x))
    frame[cols[2]] = frame[cols[2]]/frame[cols[2]].max()
--- a/run.py
+++ b/run.py
@ -5,7 +5,7 @@ import utils.json_utils as json_utils
 import utils.constants as const
 import utils.env_utils as env
 import xml_parser.create_xml as create_xml
-import txt_parser.csv_utils as csv_utils
+import utils.csv_utils as csv_utils
 import fb2_parser.read_fb2 as read_fb2
 import aligner.bitext_align as aligner
 import time
@ -75,7 +75,7 @@ def read_data_files_and_align_sentences(book_code):
                book2_chapter.update({'sentences': book2_sen})
                book1_chapters[idx] = book1_chapter
                book2_chapters[idx] = book2_chapter
-                time.sleep(60)
+                time.sleep(10)
            if idx == 1:
                break
@ -92,4 +92,4 @@ def create_xml_file(book_content, book_metadata_dict):
 if env.check_env_variables():
    read_data_files_and_align_sentences('dost_cap_ende')
    validate_all_xml_files()
-    save_validated_files_to_db()
+    # save_validated_files_to_db()
--- a/txt_parser/init.py
+++ b/txt_parser/init.py
--- a/txt_parser/test_txt.py
+++ b/txt_parser/test_txt.py
@ -1,10 +0,0 @@
 import txt_parser.csv_utils as read_csv
 import utils.constants as const
 # Manual smoke check: load the books list through
 # read_csv.read_books_csv_file(const.CSV_FILE) and print every entry.
 books_list = read_csv.read_books_csv_file(const.CSV_FILE)
 for book in books_list:
    print(book)
 # Round-trip write is intentionally disabled; re-enable to rewrite the CSV.
 # read_csv.write_books_data_to_csv(const.CSV_FILE, books_list)
--- a/txt_parser/txt_cleaner.py
+++ b/txt_parser/txt_cleaner.py
@ -1,59 +0,0 @@
 # -*- coding: utf-8 -*-
 """
 Created on Sun Jan 19 13:45:29 2020
@author: DroidRonin
 """
 import re
 import pandas as pd
 def get_text(path="C:\\Users\\Nerv\\Text-Technology\\Aligner\\data\\crime_EN.data"):
    """Split the text between the first two '* * *' marker lines into chapters.

    The file at *path* is read line by line.  Every line containing
    '* * *' is recorded by its 1-based line number; the lines after the first
    marker, up to and including the second marker line, form the working
    text.  Chapter headings are found with a case-insensitive
    "chapter <roman numeral>" regex and the text is split on upper-case
    "CHAPTER <letters>" headings; the two lists are zipped together.

    Progress and results are printed exactly as before (marker hits, the
    selected lines, the heading matches, each joined chapter and the first
    chapter tuple).

    Args:
        path: File to read.  Defaults to the previously hard-coded location
            so existing zero-argument calls behave as before.

    Returns:
        A list of ``(heading, body)`` tuples, one per chapter.  Empty when no
        chapter heading is found.

    Raises:
        IndexError: If the file contains fewer than two '* * *' marker lines.
        OSError: If the file cannot be opened or read.
    """
    # Context manager guarantees the handle is closed even if reading fails.
    with open(path, 'r') as file:
        lines = file.readlines()

    star_index = []
    for count, line in enumerate(lines, start=1):
        if '* * *' in line:
            print(True)
            star_index.append(count)
            print(count)    # Original author's note: 55 and 1074 for crime_EN.data

    if len(star_index) < 2:
        # Same exception type the unguarded indexing used to raise, but with
        # a message that says what is actually wrong with the input.
        raise IndexError(
            "expected at least two '* * *' marker lines in %r, found %d"
            % (path, len(star_index))
        )

    # star_index holds 1-based line numbers, so this slice starts on the line
    # after the first marker and still includes the second marker line itself.
    total_text = lines[star_index[0]:star_index[1]]
    print(total_text)
    text_str = ''.join(total_text)

    pattern = re.compile(r"\b((chapter)[\s]+[IVXLCDM]+\b)", re.IGNORECASE)   #Regex for finding chapters
    chapter_list = re.findall(pattern, text_str)
    print(chapter_list)
    # findall yields (full_heading, 'chapter') tuples; keep the full heading.
    chapter_list1 = [match[0] for match in chapter_list]

    # NOTE(review): this split is case-sensitive while the heading regex above
    # is not, so a lower-case "chapter iv" in running text would be counted as
    # a heading but would not split the text - confirm against real data.
    chap_seg = re.split(r'CHAPTER\s[A-Z.]+', text_str)[1:]
    chapter_div = list(zip(chapter_list1, chap_seg))
    for c in chapter_div:
        print(''.join(c))
    if chapter_div:
        print(chapter_div[0])    #Will print out the first chapter
    return chapter_div
--- a/txt_parser/csv_utils.py
+++ b/txt_parser/csv_utils.py
@ -32,6 +32,6 @@ def write_books_data_to_csv(csv_file_name, books_list):
 def read_data_file(file_name):
    txt_file_path = os.path.dirname(os.path.dirname(__file__)) + const.DATA_FOLDER + file_name
    with open(txt_file_path, 'r') as file:
-        lines = file.readline()
+        lines = file.readlines()
        file.close()
        return lines
--- a/xslt/book_align.xsl
+++ b/xslt/book_align.xsl
@ -0,0 +1,101 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
    <!-- Root elements of the two book documents shown side by side.
         Each path is passed to document() as a relative URI; "/*" selects
         the loaded document's root element. The file names are hard-coded
         to the dost_cap_enru pair. -->
    <xsl:variable name="book1" select="document('../xml_files/dost_cap_enru_en.xml')/*" />
    <xsl:variable name="book2" select="document('../xml_files/dost_cap_enru_ru.xml')/*" />
 	<!-- Entry point: emits the HTML page skeleton, then delegates to the
 	     bookInfo template (metadata table) and the content template
 	     (one aligned table per chapter) of the input document. -->
 	<xsl:template match="/">
 		<html>
 			<head>
 				<title>Bi-Text Aligner</title>
 				<!-- The style sheet belongs inside head; it was previously emitted
 				     between head and body, which is not valid HTML. -->
            <style>
                table {
                  border-collapse: collapse;
                }
                table, th, td {
                  border: 1px solid black;
                }
                tr:nth-child(odd) {
                  background: #e8eaed
                }
                tr:nth-child(even) {
                  background: #ffffff
                }
                table tr th {
                  font-size: 22px;
                }
                table tr td {
                  font-size: 20px;
                }
            </style>
 			</head>
 			<body>
 				<h2 align="center">Parallel Corpus</h2>
                <br />
                <xsl:apply-templates select="book/bookInfo" />
                <xsl:apply-templates select="book/content" />
 			</body>
 		</html>
 	</xsl:template>
    <!-- Renders the metadata comparison table. The matched bookInfo node only
         triggers the template; every value is read from $book1 and $book2. -->
    <xsl:template match="bookInfo">
        <table border="1" align="center" width="50%" cellpadding="10">
            <!-- The first row also fixes the column widths, so it stays inline. -->
            <tr>
                <td bgcolor="#cccc99" width="8%"><b>Book Name</b></td>
                <td width="21%"><xsl:value-of select="$book1/bookInfo/title" /></td>
                <td width="21%"><xsl:value-of select="$book2/bookInfo/title" /></td>
            </tr>
            <xsl:call-template name="bookInfoRow">
                <xsl:with-param name="label" select="'Book Language'" />
                <xsl:with-param name="first" select="$book1/bookInfo/lang" />
                <xsl:with-param name="second" select="$book2/bookInfo/lang" />
            </xsl:call-template>
            <xsl:call-template name="bookInfoRow">
                <xsl:with-param name="label" select="'Is Translation?'" />
                <xsl:with-param name="first" select="$book1/bookInfo/isTranslation" />
                <xsl:with-param name="second" select="$book2/bookInfo/isTranslation" />
            </xsl:call-template>
            <xsl:call-template name="bookInfoRow">
                <xsl:with-param name="label" select="'Total Chapters'" />
                <xsl:with-param name="first" select="$book1/bookInfo/totalChapters" />
                <xsl:with-param name="second" select="$book2/bookInfo/totalChapters" />
            </xsl:call-template>
            <xsl:call-template name="bookInfoRow">
                <xsl:with-param name="label" select="'Author'" />
                <xsl:with-param name="first" select="$book1/bookInfo/author" />
                <xsl:with-param name="second" select="$book2/bookInfo/author" />
            </xsl:call-template>
            <xsl:call-template name="bookInfoRow">
                <xsl:with-param name="label" select="'Source'" />
                <xsl:with-param name="first" select="$book1/bookInfo/source" />
                <xsl:with-param name="second" select="$book2/bookInfo/source" />
            </xsl:call-template>
        </table>
        <br />
        <br />
        <br />
    </xsl:template>
    <!-- Helper: one metadata row with a highlighted label cell followed by the
         value for the first and the second book. -->
    <xsl:template name="bookInfoRow">
        <xsl:param name="label" />
        <xsl:param name="first" />
        <xsl:param name="second" />
        <tr>
            <td bgcolor="#cccc99"><b><xsl:value-of select="$label" /></b></td>
            <td><xsl:value-of select="$first" /></td>
            <td><xsl:value-of select="$second" /></td>
        </tr>
    </xsl:template>
    <!-- Renders one two-column table per chapter of the input document.
         Chapters and sentences are paired purely by position: the n-th
         sentence of the m-th chapter of $book1 sits next to the n-th sentence
         of the m-th chapter of $book2. The input document's own chapter and
         sentence elements only drive the iteration counts. -->
    <xsl:template match="content">
        <xsl:for-each select="chapter">
            <xsl:variable name="chapterIndex" select="position()" />
            <!-- Look up the matching chapter in each book once per iteration. -->
            <xsl:variable name="leftChapter" select="$book1/content/chapter[$chapterIndex]" />
            <xsl:variable name="rightChapter" select="$book2/content/chapter[$chapterIndex]" />
            <h2 align="center">Chapter - <xsl:value-of select="@num" /></h2>
            <table border="1" align="center" width="80%" cellpadding="10">
                <tr>
                    <th bgcolor="#cccc99" width="40%"><xsl:value-of select="$leftChapter/@name"/></th>
                    <th bgcolor="#cccc99" width="40%"><xsl:value-of select="$rightChapter/@name"/></th>
                </tr>
                <xsl:for-each select="sentence">
                    <xsl:variable name="sentenceIndex" select="position()" />
                    <tr>
                        <td><xsl:value-of select="$leftChapter/sentence[$sentenceIndex]"/></td>
                        <td><xsl:value-of select="$rightChapter/sentence[$sentenceIndex]"/></td>
                    </tr>
                </xsl:for-each>
            </table>
            <br />
            <br />
        </xsl:for-each>
    </xsl:template>
 </xsl:stylesheet>