# bitext-aligner/txt_parser/txt_cleaner.py



# -*- coding: utf-8 -*-
"""
Created on Sun Jan 19 13:45:29 2020

@author: DroidRonin
"""
import re
import pandas as pd


# Source text read when get_text() is called without an explicit path.
_DEFAULT_TEXT_PATH = "C:\\Users\\Nerv\\Text-Technology\\Aligner\\data\\crime_EN.data"

# Lines containing this marker delimit the part of the file to be parsed.
_SECTION_MARKER = '* * *'

# Single source of truth for chapter headings.  The same pattern yields both
# the titles and the split points, so a title can never be paired with the
# wrong body.  It is deliberately case-sensitive: with IGNORECASE the numeral
# class would also match ordinary prose such as "chapter did" or "chapter V".
# A trailing full stop ("CHAPTER I.") is consumed but kept out of the title.
_CHAPTER_HEADING = re.compile(r"\b(CHAPTER\s+[IVXLCDM]+)\b\.?")


def _find_marker_lines(lines, verbose):
    """Return the 1-based numbers of all lines containing the section marker."""
    marker_lines = []
    for line_no, line in enumerate(lines, start=1):
        if _SECTION_MARKER in line:
            marker_lines.append(line_no)
            if verbose:
                # Diagnostic output kept from the original script.
                print(True)
                print(line_no)
    return marker_lines


def _split_chapters(text):
    """Split *text* into ``(heading, body)`` pairs, dropping any preamble."""
    # With one capturing group, re.split returns
    # [preamble, heading1, body1, heading2, body2, ...].
    parts = _CHAPTER_HEADING.split(text)
    return list(zip(parts[1::2], parts[2::2]))


def get_text(path=_DEFAULT_TEXT_PATH, *, encoding=None, verbose=True):
    """Extract the chapters found between the first two ``* * *`` marker lines.

    The file is read line by line, the lines strictly between the first and
    the second marker line are joined, and that text is split on headings of
    the form ``CHAPTER <roman numeral>``.

    Args:
        path: File to read.  Defaults to the path the script originally
            hard-coded.
        encoding: Text encoding passed to ``open``.  ``None`` (the default)
            keeps the platform default, as the original code did.
        verbose: When true (the default), print the same diagnostics the
            original script printed: marker line numbers, the selected lines,
            the chapter titles, every chapter, and the first chapter.

    Returns:
        A list of ``(heading, body)`` tuples in document order; empty when
        the selected text contains no chapter heading.

    Raises:
        IndexError: If the file contains fewer than two marker lines.  The
            type matches what the original code raised in that situation.
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even if reading fails.
    with open(path, 'r', encoding=encoding) as file:
        lines = file.readlines()

    marker_lines = _find_marker_lines(lines, verbose)
    if len(marker_lines) < 2:
        raise IndexError(
            "expected at least two lines containing %r in %s, found %d"
            % (_SECTION_MARKER, path, len(marker_lines))
        )

    first, second = marker_lines[0], marker_lines[1]
    # Marker numbers are 1-based, so `first` is already the 0-based index of
    # the line after the opening marker; `second - 1` is the 0-based index of
    # the closing marker itself, which the slice must exclude.
    total_text = lines[first:second - 1]
    text_str = ''.join(total_text)

    chapter_div = _split_chapters(text_str)

    if verbose:
        print(total_text)
        print([title for title, _ in chapter_div])
        for title, body in chapter_div:
            print(title + body)
        # Guarded: the selected text may contain no chapter heading at all.
        if chapter_div:
            print(chapter_div[0])

    return chapter_div