Added language to sentence tokenizer; removed unnecessary code
master
parent
4a56e14484
commit
fc95b9e82c
@ -1,10 +0,0 @@
|
||||
import txt_parser.csv_utils as read_csv
|
||||
import utils.constants as const
|
||||
|
||||
# Load every book record from the configured CSV file and echo each one to
# stdout. `read_csv` and `const` are the project modules imported above.
books_list = read_csv.read_books_csv_file(const.CSV_FILE)

for book in books_list:
    print(book)
|
||||
|
||||
|
||||
# read_csv.write_books_data_to_csv(const.CSV_FILE, books_list)
|
||||
@ -1,59 +0,0 @@
|
||||
|
||||
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Created on Sun Jan 19 13:45:29 2020
|
||||
|
||||
@author: DroidRonin
|
||||
"""
|
||||
import re
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def get_text(path="C:\\Users\\Nerv\\Text-Technology\\Aligner\\data\\crime_EN.data"):
    """Split the marked section of a book file into (heading, body) chapters.

    The section of interest is the text strictly between the first two lines
    that contain ``* * *``. Inside that section every ``CHAPTER <roman
    numeral>`` heading (optionally followed by a period) starts a new chapter.
    Progress information is printed to stdout along the way.

    Args:
        path: Location of the text file to read. Defaults to the path that
            was previously hard-coded in this function.

    Returns:
        A list of ``(heading, body)`` tuples in file order, where ``heading``
        is e.g. ``"CHAPTER I"`` and ``body`` is the text that follows it up to
        the next heading (or the end of the section).

    Raises:
        IndexError: If the file has fewer than two ``* * *`` marker lines.
    """
    # The context manager closes the file even if reading fails.
    with open(path, 'r') as handle:
        lines = handle.readlines()

    # 0-based row numbers of every marker line.
    marker_rows = [row for row, line in enumerate(lines) if '* * *' in line]
    for row in marker_rows:
        print(True)
        print(row + 1)  # 1-based line number of the marker

    if len(marker_rows) < 2:
        raise IndexError(
            "expected at least two '* * *' marker lines in %r, found %d"
            % (path, len(marker_rows))
        )

    # Start after the opening marker and stop before the closing one, so
    # neither marker line leaks into the chapter text.
    total_text = lines[marker_rows[0] + 1:marker_rows[1]]
    print(total_text)
    text_str = ''.join(total_text)

    # A single pattern yields both the headings and the split points, so the
    # two can never get out of step. Because the heading is a capturing
    # group, re.split returns [preamble, heading1, body1, heading2, body2, ...].
    heading_pattern = re.compile(r"\b(CHAPTER\s+[IVXLCDM]+)\b\.?")
    pieces = heading_pattern.split(text_str)
    chapter_list = pieces[1::2]
    print(chapter_list)

    chapter_div = list(zip(chapter_list, pieces[2::2]))

    for chapter in chapter_div:
        print(''.join(chapter))

    if chapter_div:
        print(chapter_div[0])  # the first chapter

    return chapter_div
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@ -0,0 +1,101 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
|
||||
|
||||
<xsl:variable name="book1" select="document('../xml_files/dost_cap_enru_en.xml')/*" />
|
||||
<xsl:variable name="book2" select="document('../xml_files/dost_cap_enru_ru.xml')/*" />
|
||||
|
||||
<!-- Root template: emits the HTML page skeleton, then applies the
     bookInfo and content templates to the input document's book element. -->
<xsl:template match="/">
    <html>
        <head>
            <title>Bi-Text Aligner</title>
            <!-- The style element belongs inside head; it previously sat
                 between the closing head tag and body, which is invalid HTML. -->
            <style>
                table {
                    border-collapse: collapse;
                }
                table, th, td {
                    border: 1px solid black;
                }
                tr:nth-child(odd) {
                    background: #e8eaed;
                }
                tr:nth-child(even) {
                    background: #ffffff;
                }
                table tr th {
                    font-size: 22px;
                }
                table tr td {
                    font-size: 20px;
                }
            </style>
        </head>
        <body>
            <h2 align="center">Parallel Corpus</h2>
            <br />
            <xsl:apply-templates select="book/bookInfo" />
            <xsl:apply-templates select="book/content" />
        </body>
    </html>
</xsl:template>
|
||||
|
||||
<!-- Metadata table: one row per bookInfo field, with the value taken from
     $book1 in the second column and from $book2 in the third. -->
<xsl:template match="bookInfo">
    <!-- Bind both metadata node-sets once so each row stays short. -->
    <xsl:variable name="info1" select="$book1/bookInfo" />
    <xsl:variable name="info2" select="$book2/bookInfo" />
    <table border="1" align="center" width="50%" cellpadding="10">
        <tr>
            <td bgcolor="#cccc99" width="8%"><b>Book Name</b></td>
            <td width="21%"><xsl:value-of select="$info1/title" /></td>
            <td width="21%"><xsl:value-of select="$info2/title" /></td>
        </tr>
        <tr>
            <td bgcolor="#cccc99"><b>Book Language</b></td>
            <td><xsl:value-of select="$info1/lang" /></td>
            <td><xsl:value-of select="$info2/lang" /></td>
        </tr>
        <tr>
            <td bgcolor="#cccc99"><b>Is Translation?</b></td>
            <td><xsl:value-of select="$info1/isTranslation" /></td>
            <td><xsl:value-of select="$info2/isTranslation" /></td>
        </tr>
        <tr>
            <td bgcolor="#cccc99"><b>Total Chapters</b></td>
            <td><xsl:value-of select="$info1/totalChapters" /></td>
            <td><xsl:value-of select="$info2/totalChapters" /></td>
        </tr>
        <tr>
            <td bgcolor="#cccc99"><b>Author</b></td>
            <td><xsl:value-of select="$info1/author" /></td>
            <td><xsl:value-of select="$info2/author" /></td>
        </tr>
        <tr>
            <td bgcolor="#cccc99"><b>Source</b></td>
            <td><xsl:value-of select="$info1/source" /></td>
            <td><xsl:value-of select="$info2/source" /></td>
        </tr>
    </table>
    <br />
    <br />
    <br />
</xsl:template>
|
||||
|
||||
<!-- Chapter tables: for every chapter of the input document, emit a
     two-column table pairing the sentences of $book1 and $book2 that
     share the same chapter and sentence position. -->
<xsl:template match="content">
    <xsl:for-each select="chapter">
        <!-- Capture the chapter index now; position() changes meaning
             inside the nested for-each below. -->
        <xsl:variable name="chapterIndex" select="position()" />
        <xsl:variable name="chapter1" select="$book1/content/chapter[$chapterIndex]" />
        <xsl:variable name="chapter2" select="$book2/content/chapter[$chapterIndex]" />
        <h2 align="center">Chapter - <xsl:value-of select="@num" /></h2>
        <table border="1" align="center" width="80%" cellpadding="10">
            <tr>
                <th bgcolor="#cccc99" width="40%"><xsl:value-of select="$chapter1/@name"/></th>
                <th bgcolor="#cccc99" width="40%"><xsl:value-of select="$chapter2/@name"/></th>
            </tr>
            <!-- The number of rows follows the sentence count of the
                 input document's chapter. -->
            <xsl:for-each select="sentence">
                <xsl:variable name="sentenceIndex" select="position()" />
                <tr>
                    <td><xsl:value-of select="$chapter1/sentence[$sentenceIndex]"/></td>
                    <td><xsl:value-of select="$chapter2/sentence[$sentenceIndex]"/></td>
                </tr>
            </xsl:for-each>
        </table>
        <br />
        <br />
    </xsl:for-each>
</xsl:template>
|
||||
|
||||
</xsl:stylesheet>
|
||||
Loading…
Reference in new issue