-
Notifications
You must be signed in to change notification settings - Fork 1
/
split_txt.py
71 lines (51 loc) · 2.03 KB
/
split_txt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import math
import os
"""
A super-simple script that splits a text document into a given number of parts.
Note that this script is fairly specific to the properties of the texts in the
Travelogues corpus: splits are placed only where there are two empty lines (i.e.
three consecutive newline characters), since we know these denote page breaks in
the corpus documents.
"""
# TODO commandline-arguments?
# Path of the text file to split; the part files are written alongside it,
# using the same base name.
INPUT_FILE = './156843801.txt'
# Target number of parts; used to work out how many pages go into each part.
NUM_PARTS = 26
def get_end_offset(txt, cursor, n, sep='\n\n\n'):
    """Return the end offset of a clip of up to `n` pages starting at `cursor`.

    Pages are the pieces of `txt` delimited by `sep`. The offset is chosen so
    that ``txt[cursor:offset]`` holds the next `n` pages together with the
    separators between them, but without the separator that follows the last
    page of the clip.

    Args:
        txt: The complete text.
        cursor: Non-negative offset into `txt` at which the clip starts.
        n: Number of pages the clip should include. If fewer pages remain,
            the clip runs to the end of the text; if `n` is zero or negative,
            the clip is empty.
        sep: Page separator; defaults to two empty lines (three newlines).

    Returns:
        The absolute end offset of the clip within `txt`.
    """
    if n <= 0:
        return cursor

    # maxsplit=n stops scanning once n pages have been cut off; whatever is
    # left over ends up in one extra trailing element, which [:n] discards.
    clip_pages = txt[cursor:].split(sep, n)[:n]

    # The separators between the pages are part of the clip too, so their
    # length must be counted or the offset falls short of the last page's end.
    separators_len = len(sep) * (len(clip_pages) - 1)
    return cursor + sum(len(page) for page in clip_pages) + separators_len
# Split INPUT_FILE into part files, cutting only at page boundaries (two empty
# lines). Each part is written next to the input as
# '<input name without extension>.partNN.txt'.
if NUM_PARTS < 1:
    raise ValueError(f'NUM_PARTS must be at least 1, got {NUM_PARTS}')

# Explicit encoding so the result does not depend on the platform default.
# NOTE(review): assumes the corpus documents are UTF-8 encoded -- confirm.
with open(INPUT_FILE, 'r', encoding='utf-8') as infile:
    txt = infile.read()

# Count 'pages'
pages = txt.split('\n\n\n')
total_chars = sum(len(page) for page in pages)
print(f'Read {len(pages)} pages, {total_chars} characters total')

# A part needs at least one page, so there cannot be more parts than pages.
# str.split always returns at least one element, hence num_parts >= 1.
num_parts = min(NUM_PARTS, len(pages))

# Spread the pages as evenly as possible: every part gets `base` pages and the
# first `extra` parts get one more. Rounding pages-per-part up instead would
# use up the pages early and produce fewer parts than requested.
base, extra = divmod(len(pages), num_parts)
pages_per_part = f'{base}-{base + 1}' if extra else str(base)
print(f'Splitting into {num_parts} parts, {pages_per_part} pages per part')

# splitext only strips a real extension of the file name, never a dot that
# belongs to a directory component such as './'.
without_ext = os.path.splitext(INPUT_FILE)[0]

page_cursor = 0
for i in range(1, num_parts + 1):  # i is used only for the filename
    to_page = page_cursor + base + (1 if i <= extra else 0)

    # Construct outfile name (index zero-padded to two digits)
    outfile = f'{without_ext}.part{i:02d}.txt'

    print(f'Writing to file {outfile}: page {page_cursor} to {to_page}')
    with open(outfile, 'w', encoding='utf-8') as out:
        # Every page is followed by the separator, so each part file can be
        # split into pages again in the same way as the input.
        out.writelines(f'{page}\n\n\n' for page in pages[page_cursor:to_page])

    page_cursor = to_page