You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
60 lines
1.3 KiB
60 lines
1.3 KiB
|
|
|
|
# -*- coding: utf-8 -*-
|
|
"""
|
|
Created on Sun Jan 19 13:45:29 2020
|
|
|
|
@author: DroidRonin
|
|
"""
|
|
import re
|
|
import pandas as pd
|
|
|
|
|
|
# Default corpus location, kept so that calling get_text() with no arguments
# still reads the same file as before.
_DEFAULT_DATA_PATH = "C:\\Users\\Nerv\\Text-Technology\\Aligner\\data\\crime_EN.data"

# Substring identifying the lines that delimit the text of interest.
_SECTION_MARKER = '* * *'

# A single pattern is used both to find chapter headings and to split the text,
# so every heading is guaranteed to be paired with the body that follows it.
# It matches the word "chapter", whitespace, and a run of roman-numeral letters
# (all case-insensitively); an optional trailing full stop is consumed but is
# not part of the captured heading.
_CHAPTER_HEADING = re.compile(r"\b(chapter\s+[IVXLCDM]+)\b\.?", re.IGNORECASE)


def _lines_between_markers(lines):
    """Return the lines strictly between the first two marker lines."""
    marker_rows = []
    for row, line in enumerate(lines):
        if _SECTION_MARKER in line:
            # Diagnostic output: flag the hit and report its 1-based line number.
            print(True)
            print(row + 1)
            marker_rows.append(row)

    if len(marker_rows) < 2:
        # IndexError is what indexing the marker list raised before; keep the
        # type but say what actually went wrong.
        raise IndexError(
            "expected at least two lines containing %r, found %d"
            % (_SECTION_MARKER, len(marker_rows))
        )

    opening, closing = marker_rows[0], marker_rows[1]
    # Exclude both marker lines themselves.
    return lines[opening + 1:closing]


def _split_chapters(text):
    """Split *text* into (heading, body) pairs, one per chapter heading."""
    # With one capturing group, split() returns
    # [preamble, heading1, body1, heading2, body2, ...].
    pieces = _CHAPTER_HEADING.split(text)
    return list(zip(pieces[1::2], pieces[2::2]))


def get_text(path=_DEFAULT_DATA_PATH, encoding=None):
    """Read a text file and divide the marked section into chapters.

    The section of interest is the run of lines between the first two lines
    that contain ``* * *``. That section is split at every chapter heading
    (the word "chapter" followed by a roman numeral), and the headings and
    chapter bodies are printed along the way.

    Parameters
    ----------
    path : str or path-like, optional
        File to read. Defaults to the original hard-coded corpus location.
    encoding : str or None, optional
        Text encoding passed to ``open``. ``None`` uses the platform default.

    Returns
    -------
    list of (str, str) tuples
        ``(heading, body)`` for each chapter, in order of appearance. Empty
        if the section contains no chapter heading.

    Raises
    ------
    IndexError
        If the file has fewer than two lines containing ``* * *``.
    OSError
        If the file cannot be opened.
    """
    # The context manager closes the file even if reading fails.
    with open(path, 'r', encoding=encoding) as file:
        lines = file.readlines()

    total_text = _lines_between_markers(lines)
    print(total_text)  # The text between the two star markers

    chapter_div = _split_chapters(''.join(total_text))
    print([heading for heading, _ in chapter_div])

    for heading, body in chapter_div:
        print(heading + body)

    if chapter_div:
        print(chapter_div[0])  # Will print out the first chapter

    return chapter_div
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|