-
Notifications
You must be signed in to change notification settings - Fork 4
/
tokenizer.py
87 lines (73 loc) · 2.59 KB
/
tokenizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# -*- coding: utf-8 -*-
from subprocess import Popen, PIPE
class Tokenizer(object):
    """Split a statement into tokens with one of two engines.

    ``engine`` selects the strategy used by :meth:`split`:

    * ``"ngram"`` -- overlapping character n-grams (:meth:`split_ngram`).
    * ``"ma"`` -- tokens read from an external analyzer process
      (:meth:`split_ma`); by default ``php ./webma2.php``.
    """

    # Command line of the external analyzer used by the "ma" engine.
    DEFAULT_MA_COMMAND = ('php', './webma2.php')

    def __init__(self, engine, ma_command=None):
        """Create a tokenizer.

        :param engine: ``"ngram"`` or ``"ma"``.
        :param ma_command: optional argument sequence (program followed by
            its arguments) used to launch the analyzer; defaults to
            ``DEFAULT_MA_COMMAND``.
        """
        self.engine = engine
        if ma_command is None:
            ma_command = self.DEFAULT_MA_COMMAND
        self.ma_command = list(ma_command)

    def split(self, statement, ngram=2):
        """Tokenize ``statement`` with the configured engine.

        :param statement: text to tokenize.
        :param ngram: n-gram size; only used by the ``"ngram"`` engine.
        :return: list of tokens, or ``None`` when ``self.engine`` is neither
            ``"ngram"`` nor ``"ma"``.
        """
        if self.engine == "ngram":
            return self.split_ngram(statement, ngram)
        if self.engine == "ma":
            return self.split_ma(statement)
        # Unknown engine: keep the historical "no result" behaviour.
        return None

    def split_ngram(self, statement, ngram):
        """Return every contiguous slice of ``statement`` of length ``ngram``.

        :return: slices in left-to-right order; an empty list when
            ``statement`` is shorter than ``ngram``.
        :raises ValueError: if ``ngram`` is smaller than 1 (such sizes only
            produce empty or nonsensical slices).
        """
        if ngram < 1:
            raise ValueError("ngram must be >= 1, got %r" % (ngram,))
        # range() is empty when the statement is shorter than ngram.
        return [statement[i:i + ngram]
                for i in range(len(statement) - ngram + 1)]

    def _run_ma(self, statement):
        """Send ``statement`` to the analyzer and return its output lines.

        Returns an empty list when the process exits with a non-zero status.
        Failure to launch the process propagates as the exception raised by
        ``Popen``.
        """
        # Text is sent as UTF-8; byte strings are passed through unchanged.
        if not isinstance(statement, bytes):
            statement = statement.encode('utf-8')
        proc = Popen(self.ma_command, stdin=PIPE, stdout=PIPE)
        # communicate() writes the input, closes stdin and drains stdout.
        # Writing and then calling wait() can deadlock: the child may wait
        # for end-of-input forever, or block on a full stdout pipe.
        output, _ = proc.communicate(statement + b"\n")
        if proc.returncode != 0:
            return []
        # Hand back the native ``str`` type (decodes only where bytes != str).
        if not isinstance(output, str):
            output = output.decode('utf-8')
        return output.split("\n")

    @staticmethod
    def _iter_ma_tokens(lines):
        """Yield ``(original, hiragana)`` for each usable analyzer line.

        A usable line has at least two whitespace-separated fields, and its
        second field is a comma-separated list whose first two items are the
        original token and its hiragana form. Other lines are skipped.
        """
        for line in lines:
            fields = line.split()
            if len(fields) < 2:
                continue
            parts = fields[1].split(",")
            if len(parts) < 2:
                continue
            yield parts[0], parts[1]

    def split_ma(self, statement):
        """Tokenize ``statement`` with the external analyzer.

        :return: flat list with each original token, followed by its
            hiragana form when that differs from the original; an empty
            list if the analyzer exits with a non-zero status.
        """
        result = []
        for orig, hira in self._iter_ma_tokens(self._run_ma(statement)):
            result.append(orig)
            if hira != orig:
                result.append(hira)
        return result

    def split_query(self, statement):
        """Tokenize a query, keeping the position of each token.

        :param statement: query text.
        :return: ``[(tok1, index1), (tok2, index2), ..]``. A token and its
            hiragana form share the same index; an empty list if the
            analyzer exits with a non-zero status.
        """
        # NOTE(review): the index advances once per usable analyzer line;
        # confirm this matches the intended offset semantics.
        result = []
        tokens = self._iter_ma_tokens(self._run_ma(statement))
        for tok_idx, (orig, hira) in enumerate(tokens):
            result.append((orig, tok_idx))
            if hira != orig:
                result.append((hira, tok_idx))
        return result

    def test(self):
        """Self-check of both engines; the second half runs the analyzer."""
        test_str = "aab"
        test_tokenized = ["aa", "ab"]
        # Call the n-gram splitter directly so the check does not depend on
        # which engine this instance was configured with.
        ret = self.split_ngram(test_str, 2)
        assert ret == test_tokenized

        test_str2 = "今日は晴れ"
        test_tokenized2 = ["今日", "は", "晴れ", "きょう", "はれ"]
        ret2 = self.split_ma(test_str2)
        for token in test_tokenized2:
            assert (token in ret2)
        assert len(ret2) == len(test_tokenized2)