diff --git a/aligner/bitext_align.py b/aligner/bitext_align.py
index da6b320..091c5c1 100644
--- a/aligner/bitext_align.py
+++ b/aligner/bitext_align.py
@@ -7,7 +7,9 @@ import numpy as np
 import pandas as pd
 from google.cloud import translate_v2 as translate
 from jellyfish import levenshtein_distance as lev
-from nltk import sent_tokenize
+import nltk
+import utils.constants as const
+nltk.download('punkt')
 
 translate_client = translate.Client()
 
@@ -38,7 +40,7 @@ def frame_from_text(text, source, target, is1=False):
     # cols = [c+str(int(is1)) for c in ['sent','trans','rellen','relpos']]
     #print(cols)
     frame = pd.DataFrame(columns=cols)
-    frame[cols[0]] = sent_tokenize(text)
+    frame[cols[0]] = nltk.sent_tokenize(text, language=const.LANGUAGE_NAME[source])
     frame[cols[1]] = frame[cols[0]].apply(lambda x: translate_client.translate(x, source_language=source, target_language=target, model='nmt')['translatedText'])
     frame[cols[2]] = frame[cols[0]].apply(lambda x: len(x))
     frame[cols[2]] = frame[cols[2]]/frame[cols[2]].max()
diff --git a/run.py b/run.py
index c5a81cf..9860521 100644
--- a/run.py
+++ b/run.py
@@ -5,7 +5,7 @@ import utils.json_utils as json_utils
 import utils.constants as const
 import utils.env_utils as env
 import xml_parser.create_xml as create_xml
-import txt_parser.csv_utils as csv_utils
+import utils.csv_utils as csv_utils
 import fb2_parser.read_fb2 as read_fb2
 import aligner.bitext_align as aligner
 import time
@@ -75,7 +75,7 @@ def read_data_files_and_align_sentences(book_code):
         book2_chapter.update({'sentences': book2_sen})
         book1_chapters[idx] = book1_chapter
         book2_chapters[idx] = book2_chapter
-        time.sleep(60)
+        time.sleep(10)
         if idx == 1:
             break
 
@@ -92,4 +92,4 @@ def create_xml_file(book_content, book_metadata_dict):
 if env.check_env_variables():
     read_data_files_and_align_sentences('dost_cap_ende')
     validate_all_xml_files()
-    save_validated_files_to_db()
+    # save_validated_files_to_db()
diff --git a/txt_parser/__init__.py b/txt_parser/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/txt_parser/test_txt.py b/txt_parser/test_txt.py
deleted file mode 100644
index a9f45a8..0000000
--- a/txt_parser/test_txt.py
+++ /dev/null
@@ -1,10 +0,0 @@
-import txt_parser.csv_utils as read_csv
-import utils.constants as const
-
-books_list = read_csv.read_books_csv_file(const.CSV_FILE)
-
-for book in books_list:
-    print(book)
-
-
-# read_csv.write_books_data_to_csv(const.CSV_FILE, books_list)
diff --git a/txt_parser/txt_cleaner.py b/txt_parser/txt_cleaner.py
deleted file mode 100644
index c8b5b1f..0000000
--- a/txt_parser/txt_cleaner.py
+++ /dev/null
@@ -1,59 +0,0 @@
-
-
-# -*- coding: utf-8 -*-
-"""
-Created on Sun Jan 19 13:45:29 2020
-
-@author: DroidRonin
-"""
-import re
-import pandas as pd
-
-
-def get_text():
-    file = open("C:\\Users\\Nerv\\Text-Technology\\Aligner\\data\\crime_EN.data", 'r')
-    lines = file.readlines()
-    file.close()
-    count = 0
-    star_index = list()
-
-    for line in lines:
-        line = line.strip()
-        count = count + 1
-        if '* * *' in line:
-            print(True)
-            star_index.append(count)
-            print(count) #The index comes out to be 55,1074
-
-    print(lines[star_index[0]:star_index[1]]) #Gives out the text between the two star thingies
-    total_text = lines[star_index[0]:star_index[1]]
-    text_str = ''.join(total_text)
-
-    pattern = re.compile(r"\b((chapter)[\s]+[IVXLCDM]+\b)", re.IGNORECASE) #Regex for finding chapters
-    chapter_list = re.findall(pattern, text_str)
-    print(chapter_list)
-    chapter_list1 = list()
-
-    for chapter in chapter_list:
-        for chap in chapter[0:1]:
-            chapter_list1.append(chap)
-
-    chap_seg = re.split(r'CHAPTER\s[A-Z.]+', text_str)[1:]
-    chapter_div = list(zip(chapter_list1, chap_seg))
-
-    for c in chapter_div:
-        print(''.join(c))
-
-    print(chapter_div[0]) #Will print out the first chapter
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/txt_parser/csv_utils.py b/utils/csv_utils.py
similarity index 95%
rename from txt_parser/csv_utils.py
rename to utils/csv_utils.py
index 7e1353d..e41ec57 100644
--- a/txt_parser/csv_utils.py
+++ b/utils/csv_utils.py
@@ -32,6 +32,6 @@ def write_books_data_to_csv(csv_file_name, books_list):
 def read_data_file(file_name):
     txt_file_path = os.path.dirname(os.path.dirname(__file__)) + const.DATA_FOLDER + file_name
     with open(txt_file_path, 'r') as file:
-        lines = file.readline()
+        lines = file.readlines()
     file.close()
     return lines
\ No newline at end of file
diff --git a/xslt/book_align.xsl b/xslt/book_align.xsl
new file mode 100644
index 0000000..f04dfe0
--- /dev/null
+++ b/xslt/book_align.xsl
@@ -0,0 +1,101 @@
[xslt/book_align.xsl (101 added lines, XML markup not preserved): an XSL stylesheet that renders an HTML page titled "Bi-Text Aligner" with a "Parallel Corpus" heading, a book metadata table (Book Name, Book Language, Is Translation?, Total Chapters, Author, Source), and one "Chapter -" section per chapter containing a table of aligned sentences.]
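
Note on the bitext_align.py change: nltk.sent_tokenize() picks a punkt model by language name, so utils/constants.py is assumed to map the ISO codes passed to the Google Translate client onto NLTK's lowercase language names. A minimal sketch of that assumption (only the LANGUAGE_NAME identifier comes from the diff; the dictionary contents are illustrative):

    # Sketch of the assumed constant in utils/constants.py -- ISO 639-1 codes
    # mapped to the language names NLTK's punkt sentence tokenizer understands.
    LANGUAGE_NAME = {
        'en': 'english',
        'de': 'german',
        'ru': 'russian',
    }

    # Usage mirroring the call in frame_from_text():
    import nltk
    nltk.download('punkt')  # fetch the punkt models that sent_tokenize loads
    print(nltk.sent_tokenize("Er kam. Er sah. Er siegte.", language=LANGUAGE_NAME['de']))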