-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
1 changed file
with
126 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,126 @@ | ||
from somajo import SoMaJo | ||
import os | ||
import re | ||
from multiprocessing import Pool, cpu_count | ||
|
||
# user settings | ||
INPUT_DIR = "data/dewiki-20220201" | ||
OUTPUT_DIR = "data/output" | ||
LANGUAGE = "de" | ||
# LANGUAGE = "en" | ||
|
||
tokenizer = SoMaJo(f"{LANGUAGE}_CMC") | ||
html_tag_patten = re.compile('<[^<>]+>') | ||
|
||
|
||
# see https://github.com/tsproisl/SoMaJo/issues/17 | ||
def detokenize(tokens): | ||
"""Convert SoMaJo tokens to sentence (str).""" | ||
result_list = [] | ||
for token in tokens: | ||
if token.original_spelling is not None: | ||
result_list.append(token.original_spelling) | ||
else: | ||
result_list.append(token.text) | ||
|
||
if token.space_after: | ||
result_list.append(" ") | ||
result = "".join(result_list) | ||
result = result.strip() | ||
return result | ||
|
||
|
||
def is_doc_start_line(line): | ||
return line.startswith('<doc id=') | ||
|
||
|
||
def is_doc_end_line(line): | ||
return line.startswith('</doc>') | ||
|
||
|
||
def get_data_dirs(root_dir): | ||
return [name for name in os.listdir(root_dir) | ||
if os.path.isdir(os.path.join(root_dir, name))] | ||
|
||
|
||
def get_data_files(root_dir): | ||
return [name for name in os.listdir(root_dir) | ||
if os.path.isfile(os.path.join(root_dir, name))] | ||
|
||
|
||
def process_text_line(line): | ||
# remove HTML taks if still there | ||
line = re.sub(html_tag_patten, ' ', line) | ||
|
||
sentences = tokenizer.tokenize_text([line]) | ||
|
||
result = [] | ||
|
||
for s in sentences: | ||
sentence_string = detokenize(s) | ||
result.append(sentence_string) | ||
|
||
return result | ||
|
||
|
||
def process_directory(map_item): | ||
input_dir, output_file_name = map_item | ||
print("Creating:", output_file_name) | ||
with open(os.path.join(OUTPUT_DIR, output_file_name), 'a') as output_file: | ||
# r_=root, d_=directories, f_=files | ||
data_files = get_data_files(input_dir) | ||
for data_file in data_files: | ||
next_input_file = os.path.join(input_dir, data_file) | ||
print("Reading file:", next_input_file) | ||
|
||
with open(next_input_file, "r") as input_file: | ||
|
||
skip_next_line = False | ||
|
||
for line in input_file: | ||
|
||
# drop line with start tag | ||
if is_doc_start_line(line): | ||
skip_next_line = True | ||
continue | ||
|
||
# drop line with end tag and append blank line | ||
if is_doc_end_line(line): | ||
output_file.write("\n") | ||
continue | ||
|
||
# skip first line to skip headline | ||
if skip_next_line: | ||
skip_next_line = False | ||
continue | ||
|
||
# skip empty lines | ||
if len(line) <= 1: | ||
continue | ||
|
||
sentences = process_text_line(line) | ||
|
||
for sentence in sentences: | ||
# ignore blank lines and make sure that stuff like "\n" is also ignored: | ||
if len(sentence) > 2: | ||
output_file.write(f"{sentence}\n") | ||
|
||
|
||
if __name__ == '__main__': | ||
# get sub directories with data | ||
data_dirs = get_data_dirs(INPUT_DIR) | ||
|
||
# create tasks for parallel execution | ||
task_list = [] | ||
for data_dir in data_dirs: | ||
call_item = (os.path.join(INPUT_DIR, data_dir), data_dir + ".txt") | ||
task_list.append(call_item) | ||
|
||
pool_size = cpu_count() * 4 | ||
print("pool_size:", pool_size) | ||
|
||
# execute tasks in parallel | ||
with Pool(pool_size) as p: | ||
p.map(process_directory, task_list) | ||
|
||
print("Done!") |