Committing Chapter Divider code and updated English version of Crime & Punishment

6 years ago · 96ae438e6c
parent f0a718d6b4
commit 96ae438e6c
3 changed files with 891 additions and 156 deletions
--- a/txt/crime_EN.txt
+++ b/txt/crime_EN.txt
--- a/utils/txt_preparation/init.py
+++ b/utils/txt_preparation/init.py
--- a/utils/txt_preparation/txt_cleaner.py
+++ b/utils/txt_preparation/txt_cleaner.py
@@ -0,0 +1,59 @@
 # -*- coding: utf-8 -*-
 """
 Created on Sun Jan 19 13:45:29 2020
@author: DroidRonin
 """
 import re
 import pandas as pd
 # Location of the English text that get_text() reads when no path is given.
 _DEFAULT_TEXT_PATH = "C:\\Users\\Nerv\\Text-Technology\\Aligner\\txt\\crime_EN.txt"

 # A chapter heading: upper-case "CHAPTER", whitespace, a Roman numeral and an
 # optional trailing period.  Only the heading text is captured, so re.split()
 # returns headings and bodies that come from the very same matches.
 _CHAPTER_HEADING = re.compile(r"\b(CHAPTER\s+[IVXLCDM]+)\b\.?")


 def _split_chapters(text):
    """Split *text* into ``(heading, body)`` pairs, one per chapter heading."""
    # With one capturing group, re.split() yields
    # [preamble, heading1, body1, heading2, body2, ...].
    parts = _CHAPTER_HEADING.split(text)
    return list(zip(parts[1::2], parts[2::2]))


 def get_text(path=_DEFAULT_TEXT_PATH, *, encoding="utf-8", verbose=True):
    """Read a book file and divide the part between two markers into chapters.

    The lines strictly between the first two lines containing ``* * *`` are
    joined into one string, which is then cut at every chapter heading
    (``CHAPTER`` followed by a Roman numeral).

    Args:
        path: Text file to read. Defaults to the project's English copy of
            Crime & Punishment.
        encoding: Encoding used to decode the file.
        verbose: When true (the default), print the marker line numbers, the
            extracted lines, the headings, every chapter and the first
            ``(heading, body)`` pair.

    Returns:
        A list of ``(heading, body)`` tuples in document order. The list is
        empty when no chapter heading lies between the markers.

    Raises:
        IndexError: If the file has fewer than two ``* * *`` lines.
        OSError: If the file cannot be opened.
    """
    with open(path, 'r', encoding=encoding) as handle:
        lines = handle.readlines()

    # 1-based line numbers of every marker line.
    marker_lines = [
        number for number, line in enumerate(lines, start=1)
        if '* * *' in line
    ]
    if verbose:
        for number in marker_lines:
            print(number)
    if len(marker_lines) < 2:
        # IndexError is what indexing the short list used to raise; the type
        # is kept so existing handlers still work, but now with a message.
        raise IndexError(
            "expected at least two '* * *' marker lines in %r, found %d"
            % (path, len(marker_lines))
        )

    first, second = marker_lines[:2]
    # As 0-based indices, `first` is the line right after the opening marker
    # and `second - 1` is the closing marker itself, which the slice excludes.
    total_text = lines[first:second - 1]
    text_str = ''.join(total_text)

    chapter_div = _split_chapters(text_str)

    if verbose:
        print(total_text)
        print([heading for heading, _ in chapter_div])
        for chapter in chapter_div:
            print(''.join(chapter))
        if chapter_div:
            print(chapter_div[0])    # The first chapter
    return chapter_div