XSLT for combining books done.

Added language to the sentence tokenizer.
Removed unnecessary code.
master
Pavan Mandava 6 years ago
parent 4a56e14484
commit fc95b9e82c

@ -7,7 +7,9 @@ import numpy as np
import pandas as pd import pandas as pd
from google.cloud import translate_v2 as translate from google.cloud import translate_v2 as translate
from jellyfish import levenshtein_distance as lev from jellyfish import levenshtein_distance as lev
from nltk import sent_tokenize import nltk
import utils.constants as const
nltk.download('punkt')
translate_client = translate.Client() translate_client = translate.Client()
@ -38,7 +40,7 @@ def frame_from_text(text, source, target, is1=False): #
cols = [c+str(int(is1)) for c in ['sent','trans','rellen','relpos']] cols = [c+str(int(is1)) for c in ['sent','trans','rellen','relpos']]
#print(cols) #print(cols)
frame = pd.DataFrame(columns=cols) frame = pd.DataFrame(columns=cols)
frame[cols[0]] = sent_tokenize(text) frame[cols[0]] = nltk.sent_tokenize(text, language=const.LANGUAGE_NAME[source])
frame[cols[1]] = frame[cols[0]].apply(lambda x: translate_client.translate(x, source_language=source, target_language=target, model='nmt')['translatedText']) frame[cols[1]] = frame[cols[0]].apply(lambda x: translate_client.translate(x, source_language=source, target_language=target, model='nmt')['translatedText'])
frame[cols[2]] = frame[cols[0]].apply(lambda x: len(x)) frame[cols[2]] = frame[cols[0]].apply(lambda x: len(x))
frame[cols[2]] = frame[cols[2]]/frame[cols[2]].max() frame[cols[2]] = frame[cols[2]]/frame[cols[2]].max()

@ -5,7 +5,7 @@ import utils.json_utils as json_utils
import utils.constants as const import utils.constants as const
import utils.env_utils as env import utils.env_utils as env
import xml_parser.create_xml as create_xml import xml_parser.create_xml as create_xml
import txt_parser.csv_utils as csv_utils import utils.csv_utils as csv_utils
import fb2_parser.read_fb2 as read_fb2 import fb2_parser.read_fb2 as read_fb2
import aligner.bitext_align as aligner import aligner.bitext_align as aligner
import time import time
@ -75,7 +75,7 @@ def read_data_files_and_align_sentences(book_code):
book2_chapter.update({'sentences': book2_sen}) book2_chapter.update({'sentences': book2_sen})
book1_chapters[idx] = book1_chapter book1_chapters[idx] = book1_chapter
book2_chapters[idx] = book2_chapter book2_chapters[idx] = book2_chapter
time.sleep(60) time.sleep(10)
if idx == 1: if idx == 1:
break break
@ -92,4 +92,4 @@ def create_xml_file(book_content, book_metadata_dict):
if env.check_env_variables(): if env.check_env_variables():
read_data_files_and_align_sentences('dost_cap_ende') read_data_files_and_align_sentences('dost_cap_ende')
validate_all_xml_files() validate_all_xml_files()
save_validated_files_to_db() # save_validated_files_to_db()

@ -1,10 +0,0 @@
import txt_parser.csv_utils as read_csv
import utils.constants as const
# Load every book record listed in the CSV file named by const.CSV_FILE.
books_list = read_csv.read_books_csv_file(const.CSV_FILE)

# Echo each record to stdout, one per line, so the parsed rows can be inspected.
for book in books_list:
    print(book)

# Round-trip check (disabled): write the records straight back to the same CSV.
# read_csv.write_books_data_to_csv(const.CSV_FILE, books_list)

@ -1,59 +0,0 @@
# -*- coding: utf-8 -*-
"""
Created on Sun Jan 19 13:45:29 2020
@author: DroidRonin
"""
import re
import pandas as pd
def get_text(path="C:\\Users\\Nerv\\Text-Technology\\Aligner\\data\\crime_EN.data"):
    """Print the chapters found between the first two '* * *' marker lines.

    The file at ``path`` is read line by line. Every line containing
    ``'* * *'`` is reported (``True`` followed by its 1-based line number).
    The text used for chapter detection starts on the line after the first
    marker and runs through the second marker line itself (the slice end is
    the marker's 1-based number, so that line is included). Chapter headings
    are paired with the text that follows them and each pair is printed.

    Args:
        path: Location of the text file. Defaults to the path that was
            previously hard-coded, so existing zero-argument calls are
            unaffected.

    Returns:
        None. All results are written to stdout.

    Raises:
        IndexError: If the file contains fewer than two '* * *' marker lines.
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even if reading fails.
    with open(path, 'r') as file:
        lines = file.readlines()

    star_index = []
    for count, line in enumerate(lines, start=1):
        if '* * *' in line.strip():
            print(True)
            star_index.append(count)
            print(count) #The index comes out to be 55,1074

    if len(star_index) < 2:
        # Same exception type the bare list indexing used to raise, but with
        # a message that says what is actually wrong with the input.
        raise IndexError(
            "expected at least two '* * *' marker lines, found %d" % len(star_index)
        )

    # Only the first two markers are used; any later ones are ignored.
    total_text = lines[star_index[0]:star_index[1]]
    print(total_text) #Gives out the text between the two star thingies
    text_str = ''.join(total_text)

    pattern = re.compile(r"\b((chapter)[\s]+[IVXLCDM]+\b)", re.IGNORECASE) #Regex for finding chapters
    chapter_list = re.findall(pattern, text_str)
    print(chapter_list)

    # findall yields (full heading, 'chapter' word) tuples; keep the heading.
    chapter_list1 = [match[0] for match in chapter_list]

    # NOTE(review): the split below is case-sensitive and uses a different
    # pattern from the case-insensitive heading search above, so the two
    # sequences may differ in length; zip() truncates to the shorter one.
    chap_seg = re.split(r'CHAPTER\s[A-Z.]+', text_str)[1:]
    chapter_div = list(zip(chapter_list1, chap_seg))
    for c in chapter_div:
        print(''.join(c))

    if chapter_div:
        print(chapter_div[0]) #Will print out the first chapter

@ -32,6 +32,6 @@ def write_books_data_to_csv(csv_file_name, books_list):
def read_data_file(file_name): def read_data_file(file_name):
txt_file_path = os.path.dirname(os.path.dirname(__file__)) + const.DATA_FOLDER + file_name txt_file_path = os.path.dirname(os.path.dirname(__file__)) + const.DATA_FOLDER + file_name
with open(txt_file_path, 'r') as file: with open(txt_file_path, 'r') as file:
lines = file.readline() lines = file.readlines()
file.close() file.close()
return lines return lines

@ -0,0 +1,101 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Renders two book XML files side by side as a single HTML page.

  The document this stylesheet is applied to drives the iteration: its
  book/bookInfo and book/content elements select the templates below, and
  its chapter and sentence counts decide how many rows are produced.
  The text shown in each cell is looked up by position in $book1 and $book2.
-->
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">

    <!-- Root elements of the two books being combined, loaded from external files. -->
    <xsl:variable name="book1" select="document('../xml_files/dost_cap_enru_en.xml')/*" />
    <xsl:variable name="book2" select="document('../xml_files/dost_cap_enru_ru.xml')/*" />

    <!-- Page skeleton: metadata table first, then one table per chapter. -->
    <xsl:template match="/">
        <html>
            <head>
                <title>Bi-Text Aligner</title>
                <!-- The style element belongs inside head; it was previously emitted between head and body. -->
                <style>
                    table {
                        border-collapse: collapse;
                    }
                    table, th, td {
                        border: 1px solid black;
                    }
                    tr:nth-child(odd) {
                        background: #e8eaed
                    }
                    tr:nth-child(even) {
                        background: #ffffff
                    }
                    table tr th {
                        font-size: 22px;
                    }
                    table tr td {
                        font-size: 20px;
                    }
                </style>
            </head>
            <body>
                <h2 align="center">Parallel Corpus</h2>
                <br />
                <xsl:apply-templates select="book/bookInfo" />
                <xsl:apply-templates select="book/content" />
            </body>
        </html>
    </xsl:template>

    <!-- Metadata table: one row per field, one column per book. -->
    <xsl:template match="bookInfo">
        <table border="1" align="center" width="50%" cellpadding="10">
            <tr>
                <td bgcolor="#cccc99" width="8%"><b>Book Name</b></td>
                <td width="21%"><xsl:value-of select="$book1/bookInfo/title" /></td>
                <td width="21%"><xsl:value-of select="$book2/bookInfo/title" /></td>
            </tr>
            <tr>
                <td bgcolor="#cccc99"><b>Book Language</b></td>
                <td><xsl:value-of select="$book1/bookInfo/lang" /></td>
                <td><xsl:value-of select="$book2/bookInfo/lang" /></td>
            </tr>
            <tr>
                <td bgcolor="#cccc99"><b>Is Translation?</b></td>
                <td><xsl:value-of select="$book1/bookInfo/isTranslation" /></td>
                <td><xsl:value-of select="$book2/bookInfo/isTranslation" /></td>
            </tr>
            <tr>
                <td bgcolor="#cccc99"><b>Total Chapters</b></td>
                <td><xsl:value-of select="$book1/bookInfo/totalChapters" /></td>
                <td><xsl:value-of select="$book2/bookInfo/totalChapters" /></td>
            </tr>
            <tr>
                <td bgcolor="#cccc99"><b>Author</b></td>
                <td><xsl:value-of select="$book1/bookInfo/author" /></td>
                <td><xsl:value-of select="$book2/bookInfo/author" /></td>
            </tr>
            <tr>
                <td bgcolor="#cccc99"><b>Source</b></td>
                <td><xsl:value-of select="$book1/bookInfo/source" /></td>
                <td><xsl:value-of select="$book2/bookInfo/source" /></td>
            </tr>
        </table>
        <br />
        <br />
        <br />
    </xsl:template>

    <!-- One table per chapter of the input document; sentences are paired by position. -->
    <xsl:template match="content">
        <xsl:for-each select="chapter">
            <!-- Remember the chapter index so the same chapter can be fetched from both books. -->
            <xsl:variable name="position" select="position()" />
            <h2 align="center">Chapter - <xsl:value-of select="@num" /></h2>
            <table border="1" align="center" width="80%" cellpadding="10">
                <tr>
                    <th bgcolor="#cccc99" width="40%"><xsl:value-of select="$book1/content/chapter[$position]/@name"/></th>
                    <th bgcolor="#cccc99" width="40%"><xsl:value-of select="$book2/content/chapter[$position]/@name"/></th>
                </tr>
                <xsl:for-each select="sentence">
                    <!-- Row N shows sentence N of this chapter from each book. -->
                    <xsl:variable name="sen_position" select="position()" />
                    <tr>
                        <td><xsl:value-of select="$book1/content/chapter[$position]/sentence[$sen_position]"/></td>
                        <td><xsl:value-of select="$book2/content/chapter[$position]/sentence[$sen_position]"/></td>
                    </tr>
                </xsl:for-each>
            </table>
            <br />
            <br />
        </xsl:for-each>
    </xsl:template>

</xsl:stylesheet>
Loading…
Cancel
Save