grammar.py
from syntactic_node import SyntacticNode


class Grammar:
    def __init__(self, nodes_by_type, vocabulary, rules):
        """
        :param nodes_by_type: a dictionary mapping each node type to a list of SyntacticNode instances
                              representing the available syntactic nodes in the grammar
        :param vocabulary: a list of vocabulary items (strings)
        :param rules: a list of Rule instances
        """
        self.nodes_by_type = nodes_by_type
        self.vocabulary = vocabulary
        self.rules = rules

    def get_encoding_length(self):
        """Return the encoding length of the grammar itself: the total length of the
        vocabulary items plus the encoding length of each rule."""
        score = 0
        for item in self.vocabulary:
            score += len(item)
        for rule in self.rules:
            score += rule.get_encoding_length()
        return score

    def get_encoding_length_of_datum_given_grammar(self, datum):
        """Return the number of 'choice points' needed to derive the datum from this grammar,
        or infinity if the grammar cannot generate it."""
        # Naive and simplified solution assuming:
        # - the datum is composed of one root followed by one suffix
        # - each rule takes one feature node
        # First we ask: is the datum simply a root with no feature nodes?
        if not datum.feature_nodes:
            for root in self.nodes_by_type[SyntacticNode.TYPE_ROOT]:
                if datum.surface_form == root.surface_form:
                    return 0
        # TODO perhaps we also need to support the case where the datum is a stem and ALSO could have been generated by a rule
        # Next we need to find the rules that could have feasibly generated the datum.
        # The first step is to weed out irrelevant rules, i.e. rules whose output affix doesn't match the datum's.
        feasible_rules = []
        root_str = datum.root_node.surface_form
        for rule in self.rules:
            is_rule_relevant = datum.surface_form[:len(root_str)] == root_str \
                and datum.surface_form[len(root_str):] == rule.output_affix \
                and self._is_subset(rule.input_nodes, datum.feature_nodes)
            is_root_in_rule_env = (rule.environment_roots and datum.root_node in rule.environment_roots)
            if is_root_in_rule_env and not is_rule_relevant:
                # The grammar has a rule keyed to the current datum's root but it doesn't apply - automatic disqualification
                return float('inf')
            elif (not rule.environment_roots or is_root_in_rule_env) and is_rule_relevant:
                feasible_rules.append(rule)
        # Now we apply the subset principle, searching for decreasingly 'good' (i.e. specific) rules, in order:
        # 1. Rules whose environment contains the datum's root node
        applicable_rules = []
        for rule in feasible_rules:
            if datum.root_node in rule.environment_roots:
                applicable_rules.append(rule)
        # 2. If no rules were found in the previous step, add rules whose input nodes match the datum's feature nodes
        if not applicable_rules:
            pass  # TODO
        # 3. All other feasible rules
        if not applicable_rules:
            applicable_rules = feasible_rules
        # Now we need to decide how many 'choice points' we have with respect to the datum.
        # For now we assume it's just the number of rules to choose from, but it's probably more complex than that. TODO
        if applicable_rules:
            return len(applicable_rules)
        else:
            return float('inf')

    def _is_subset(self, nodes1, nodes2):
        """Return True iff the first collection of nodes is a subset of the second."""
        for node in nodes1:
            if node not in nodes2:
                return False
        return True

    def __repr__(self):
        result = "<Grammar instance>\n"
        result += "\tRules:\n"
        for rule in self.rules:
            result += "\t\t{}\n".format(rule)
        return result[:-1]  # Remove the trailing newline

    def __eq__(self, other):
        return repr(self) == repr(other)

    def __hash__(self):
        return hash(repr(self))
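

# Illustrative usage sketch (not part of the original module). It builds a tiny
# grammar with minimal stand-in objects that expose only the attributes this file
# actually reads (surface_form, feature_nodes, root_node, input_nodes, output_affix,
# environment_roots, get_encoding_length); the real Rule and SyntacticNode classes
# in this project may look different.
if __name__ == "__main__":

    class _StubNode:
        """Hypothetical node carrying only a surface form."""
        def __init__(self, surface_form):
            self.surface_form = surface_form

    class _StubRule:
        """Hypothetical rule with a toy encoding-length cost."""
        def __init__(self, input_nodes, output_affix, environment_roots=()):
            self.input_nodes = input_nodes
            self.output_affix = output_affix
            self.environment_roots = list(environment_roots)

        def get_encoding_length(self):
            # Toy cost: one unit per affix character plus one per input node.
            return len(self.output_affix) + len(self.input_nodes)

    class _StubDatum:
        """Hypothetical datum: a surface form plus its root and feature nodes."""
        def __init__(self, surface_form, root_node, feature_nodes):
            self.surface_form = surface_form
            self.root_node = root_node
            self.feature_nodes = feature_nodes

    root = _StubNode("walk")
    past = _StubNode("PAST")
    rule = _StubRule(input_nodes=[past], output_affix="ed")
    grammar = Grammar(
        nodes_by_type={SyntacticNode.TYPE_ROOT: [root]},
        vocabulary=["walk", "ed"],
        rules=[rule],
    )
    datum = _StubDatum("walked", root, feature_nodes=[past])

    # MDL-style score of the grammar plus one datum: the grammar's own encoding
    # length plus the encoding length of the datum given the grammar.
    print(grammar.get_encoding_length())                              # 4 + 2 + (2 + 1) = 9
    print(grammar.get_encoding_length_of_datum_given_grammar(datum))  # 1 (one applicable rule)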