parent
f0a718d6b4
commit
96ae438e6c
File diff suppressed because one or more lines are too long
@ -0,0 +1,59 @@
|
|||||||
|
|
||||||
|
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Created on Sun Jan 19 13:45:29 2020
|
||||||
|
|
||||||
|
@author: DroidRonin
|
||||||
|
"""
|
||||||
|
import re
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
|
||||||
|
def get_text():
|
||||||
|
file = open("C:\\Users\\Nerv\\Text-Technology\\Aligner\\txt\\crime_EN.txt", 'r')
|
||||||
|
lines = file.readlines()
|
||||||
|
file.close()
|
||||||
|
count = 0
|
||||||
|
star_index = list()
|
||||||
|
|
||||||
|
for line in lines:
|
||||||
|
line = line.strip()
|
||||||
|
count = count + 1
|
||||||
|
if '* * *' in line:
|
||||||
|
print(True)
|
||||||
|
star_index.append(count)
|
||||||
|
print(count) #The index comes out to be 55,1074
|
||||||
|
|
||||||
|
print(lines[star_index[0]:star_index[1]]) #Gives out the text between the two star thingies
|
||||||
|
total_text = lines[star_index[0]:star_index[1]]
|
||||||
|
text_str = ''.join(total_text)
|
||||||
|
|
||||||
|
pattern = re.compile(r"\b((chapter)[\s]+[IVXLCDM]+\b)", re.IGNORECASE) #Regex for finding chapters
|
||||||
|
chapter_list = re.findall(pattern, text_str)
|
||||||
|
print(chapter_list)
|
||||||
|
chapter_list1 = list()
|
||||||
|
|
||||||
|
for chapter in chapter_list:
|
||||||
|
for chap in chapter[0:1]:
|
||||||
|
chapter_list1.append(chap)
|
||||||
|
|
||||||
|
chap_seg = re.split(r'CHAPTER\s[A-Z.]+', text_str)[1:]
|
||||||
|
chapter_div = list(zip(chapter_list1, chap_seg))
|
||||||
|
|
||||||
|
for c in chapter_div:
|
||||||
|
print(''.join(c))
|
||||||
|
|
||||||
|
print(chapter_div[0]) #Will print out the first chapter
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Loading…
Reference in new issue