Added language to sentence tokenizer, removed unnecessary code
master
parent
4a56e14484
commit
fc95b9e82c
@ -1,10 +0,0 @@
|
|||||||
import txt_parser.csv_utils as read_csv
|
|
||||||
import utils.constants as const
|
|
||||||
|
|
||||||
# Load every book record described by the project's CSV file.
books_list = read_csv.read_books_csv_file(const.CSV_FILE)

# Echo each record on its own line so the parsed contents can be inspected.
for book in books_list:
    print(book)
|
|
||||||
|
|
||||||
|
|
||||||
# read_csv.write_books_data_to_csv(const.CSV_FILE, books_list)
|
|
||||||
@ -1,59 +0,0 @@
|
|||||||
|
|
||||||
|
|
||||||
# -*- coding: utf-8 -*-
|
|
||||||
"""
|
|
||||||
Created on Sun Jan 19 13:45:29 2020
|
|
||||||
|
|
||||||
@author: DroidRonin
|
|
||||||
"""
|
|
||||||
import re
|
|
||||||
import pandas as pd
|
|
||||||
|
|
||||||
|
|
||||||
def get_text(path="C:\\Users\\Nerv\\Text-Technology\\Aligner\\data\\crime_EN.data",
             encoding=None):
    """Extract the chapters found between the first two ``* * *`` marker lines.

    The file is read line by line, the text strictly between the first and
    the second line containing ``* * *`` is joined into one string, and that
    string is split on ``CHAPTER <roman numeral>`` headings.

    Args:
        path: Location of the text file to read. Defaults to the path the
            original script was hard-wired to.
        encoding: Text encoding handed to ``open``; ``None`` keeps the
            platform default, exactly as before.

    Returns:
        A list of ``(heading, body)`` tuples, one per chapter, in file order.

    Raises:
        IndexError: If the file contains fewer than two ``* * *`` lines.
    """
    # The context manager closes the handle even when reading fails.
    with open(path, 'r', encoding=encoding) as file:
        lines = file.readlines()

    # 1-based line numbers of every marker line.
    star_index = []
    for count, line in enumerate(lines, start=1):
        if '* * *' in line:
            print(True)
            star_index.append(count)
            print(count)

    if len(star_index) < 2:
        raise IndexError(
            "expected at least two '* * *' marker lines in %r, found %d"
            % (path, len(star_index))
        )

    # A 1-based marker number is the 0-based index of the line that follows
    # it, so the slice starts right after the opening marker. Subtracting one
    # from the closing marker number keeps the marker line itself out.
    total_text = lines[star_index[0]:star_index[1] - 1]
    print(total_text)
    text_str = ''.join(total_text)

    # One pattern both finds the headings and splits the bodies, so headings
    # and bodies can never drift out of step. Because of the capturing group,
    # re.split returns [preamble, heading1, body1, heading2, body2, ...].
    pattern = re.compile(r"\b(CHAPTER\s+[IVXLCDM]+)\b\.?")
    parts = pattern.split(text_str)
    chapter_list = parts[1::2]
    chap_seg = parts[2::2]
    print(chapter_list)

    chapter_div = list(zip(chapter_list, chap_seg))

    for c in chapter_div:
        print(''.join(c))

    # Only show the first chapter when there is one to show.
    if chapter_div:
        print(chapter_div[0])

    return chapter_div
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@ -0,0 +1,101 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
|
||||||
|
|
||||||
|
<!-- Root elements of the two external book files that are shown side by side;
     $book1 feeds the left-hand column and $book2 the right-hand column. -->
<xsl:variable name="book1"
              select="document('../xml_files/dost_cap_enru_en.xml')/*" />
<xsl:variable name="book2"
              select="document('../xml_files/dost_cap_enru_ru.xml')/*" />
|
||||||
|
|
||||||
|
<!-- Entry point: emits the HTML page skeleton and delegates the metadata
     table and the chapter tables to the bookInfo and content templates. -->
<xsl:template match="/">
  <html>
    <head>
      <title>Bi-Text Aligner</title>
      <!-- The style element belongs inside head; it previously sat between
           head and body, which is not valid HTML. -->
      <style>
        table {
          border-collapse: collapse;
        }
        table, th, td {
          border: 1px solid black;
        }
        tr:nth-child(odd) {
          background: #e8eaed;
        }
        tr:nth-child(even) {
          background: #ffffff;
        }
        table tr th {
          font-size: 22px;
        }
        table tr td {
          font-size: 20px;
        }
      </style>
    </head>
    <body>
      <h2 align="center">Parallel Corpus</h2>
      <br />
      <xsl:apply-templates select="book/bookInfo" />
      <xsl:apply-templates select="book/content" />
    </body>
  </html>
</xsl:template>
|
||||||
|
|
||||||
|
<!-- Renders the metadata table: one row per property, with the value taken
     from $book1 in the middle column and from $book2 in the right column. -->
<xsl:template match="bookInfo">
  <table border="1" align="center" width="50%" cellpadding="10">
    <!-- The first row is written out in full because it alone carries the
         column widths. -->
    <tr>
      <td bgcolor="#cccc99" width="8%"><b>Book Name</b></td>
      <td width="21%"><xsl:value-of select="$book1/bookInfo/title" /></td>
      <td width="21%"><xsl:value-of select="$book2/bookInfo/title" /></td>
    </tr>
    <xsl:call-template name="book-info-row">
      <xsl:with-param name="label" select="'Book Language'" />
      <xsl:with-param name="field" select="'lang'" />
    </xsl:call-template>
    <xsl:call-template name="book-info-row">
      <xsl:with-param name="label" select="'Is Translation?'" />
      <xsl:with-param name="field" select="'isTranslation'" />
    </xsl:call-template>
    <xsl:call-template name="book-info-row">
      <xsl:with-param name="label" select="'Total Chapters'" />
      <xsl:with-param name="field" select="'totalChapters'" />
    </xsl:call-template>
    <xsl:call-template name="book-info-row">
      <xsl:with-param name="label" select="'Author'" />
      <xsl:with-param name="field" select="'author'" />
    </xsl:call-template>
    <xsl:call-template name="book-info-row">
      <xsl:with-param name="label" select="'Source'" />
      <xsl:with-param name="field" select="'source'" />
    </xsl:call-template>
  </table>
  <br />
  <br />
  <br />
</xsl:template>

<!-- Helper: one metadata row. $label is the caption shown in the first cell;
     $field is the name of the bookInfo child element whose value is shown
     for each of the two books. -->
<xsl:template name="book-info-row">
  <xsl:param name="label" />
  <xsl:param name="field" />
  <tr>
    <td bgcolor="#cccc99"><b><xsl:value-of select="$label" /></b></td>
    <td><xsl:value-of select="$book1/bookInfo/*[name() = $field]" /></td>
    <td><xsl:value-of select="$book2/bookInfo/*[name() = $field]" /></td>
  </tr>
</xsl:template>
|
||||||
|
|
||||||
|
<!-- Renders one two-column table per chapter of the input document. Chapters
     and sentences of $book1 and $book2 are paired purely by position. -->
<xsl:template match="content">
  <xsl:for-each select="chapter">
    <!-- Remember which chapter we are on, then look up the chapter at the
         same position in each of the two books. -->
    <xsl:variable name="chapterIndex" select="position()" />
    <xsl:variable name="leftChapter" select="$book1/content/chapter[$chapterIndex]" />
    <xsl:variable name="rightChapter" select="$book2/content/chapter[$chapterIndex]" />

    <h2 align="center">Chapter - <xsl:value-of select="@num" /></h2>
    <table border="1" align="center" width="80%" cellpadding="10">
      <tr>
        <th bgcolor="#cccc99" width="40%"><xsl:value-of select="$leftChapter/@name" /></th>
        <th bgcolor="#cccc99" width="40%"><xsl:value-of select="$rightChapter/@name" /></th>
      </tr>
      <!-- The row count follows the sentences of the input document's
           chapter; each row shows the sentence at that position in each
           book. -->
      <xsl:for-each select="sentence">
        <xsl:variable name="sentenceIndex" select="position()" />
        <tr>
          <td><xsl:value-of select="$leftChapter/sentence[$sentenceIndex]" /></td>
          <td><xsl:value-of select="$rightChapter/sentence[$sentenceIndex]" /></td>
        </tr>
      </xsl:for-each>
    </table>
    <br />
    <br />
  </xsl:for-each>
</xsl:template>
|
||||||
|
|
||||||
|
</xsl:stylesheet>
|
||||||
Loading…
Reference in new issue