-
Notifications
You must be signed in to change notification settings - Fork 0
/
read.py
26 lines (20 loc) · 817 Bytes
/
read.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
import PyPDF2
import os
# Batch-convert PDFs to plain text: every file in ``input_directory`` whose
# name ends in ".pdf" is read with PyPDF2 and its page texts, joined by
# newlines, are written to a ".txt" file of the same base name in
# ``output_directory``.
input_directory = "data"
output_directory = "output"
_PDF_SUFFIX = ".pdf"

# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(output_directory, exist_ok=True)

for filename in os.listdir(input_directory):
    # The suffix match is case-sensitive; anything else is skipped.
    if not filename.endswith(_PDF_SUFFIX):
        continue

    pdf_path = os.path.join(input_directory, filename)

    # The context manager closes the PDF handle even when parsing or text
    # extraction raises (strict=True makes the reader raise on problems it
    # would otherwise tolerate). Extraction happens inside the block so the
    # stream is still open while pages are read.
    with open(pdf_path, 'rb') as pdf_obj:
        reader = PyPDF2.PdfReader(pdf_obj, strict=True)
        output_text = [page.extract_text() for page in reader.pages]

    # Strip only the trailing ".pdf": str.replace would also rewrite any
    # ".pdf" occurring earlier in the name (e.g. "a.pdf.v2.pdf").
    base_name = filename[:-len(_PDF_SUFFIX)]
    output_filename = os.path.join(output_directory, base_name + ".txt")
    with open(output_filename, 'w', encoding='utf-8') as f:
        f.write("\n".join(output_text))