grammar.py
from syntactic_node import SyntacticNode


class Grammar:
    def __init__(self, nodes_by_type, vocabulary, rules):
        """
        :param nodes_by_type: a dictionary mapping each node type to a list of SyntacticNode instances
                              representing the available syntactic nodes in the grammar
        :param vocabulary: a list of vocabulary items (strings)
        :param rules: a list of Rule instances
        """
        self.nodes_by_type = nodes_by_type
        self.vocabulary = vocabulary
        self.rules = rules

    def get_encoding_length(self):
        """Return the encoding length of the grammar itself: the total length of the
        vocabulary items plus the encoding length of each rule."""
        score = 0
        for item in self.vocabulary:
            score += len(item)
        for rule in self.rules:
            score += rule.get_encoding_length()
        return score

    def get_encoding_length_of_datum_given_grammar(self, datum):
        """Return the number of 'choice points' needed to derive the datum from this grammar,
        or infinity if the grammar cannot generate it."""
        # Naive and simplified solution assuming:
        # - the datum is composed of one root followed by one suffix
        # - each rule takes one feature node
        # First we ask: is the datum simply a root with no feature nodes?
        if not datum.feature_nodes:
            for root in self.nodes_by_type[SyntacticNode.TYPE_ROOT]:
                if datum.surface_form == root.surface_form:
                    return 0
        # TODO perhaps we also need to support the case where the datum is a stem and ALSO could have been generated by a rule
        # Next we need to find the rules that could have feasibly generated the datum.
        # The first step is to weed out irrelevant rules, i.e. rules whose output affix doesn't match the datum's.
        feasible_rules = []
        root_str = datum.root_node.surface_form
        for rule in self.rules:
            is_rule_relevant = datum.surface_form[:len(root_str)] == root_str \
                and datum.surface_form[len(root_str):] == rule.output_affix \
                and self._is_subset(rule.input_nodes, datum.feature_nodes)
            is_root_in_rule_env = (rule.environment_roots and datum.root_node in rule.environment_roots)
            if is_root_in_rule_env and not is_rule_relevant:
                # The grammar has a rule keyed to the current datum's root but it doesn't apply - automatic disqualification
                return float('inf')
            elif (not rule.environment_roots or is_root_in_rule_env) and is_rule_relevant:
                feasible_rules.append(rule)
        # Now we apply the subset principle, searching for decreasingly 'good' (i.e. specific) rules, in order:
        # 1. Rules whose environment contains the datum's root node
        applicable_rules = []
        for rule in feasible_rules:
            if datum.root_node in rule.environment_roots:
                applicable_rules.append(rule)
        # 2. If no rules were found in the previous step, add rules whose input nodes match the datum's feature nodes
        if not applicable_rules:
            pass  # TODO
        # 3. All other feasible rules
        if not applicable_rules:
            applicable_rules = feasible_rules
        # Now we need to decide how many 'choice points' we have with respect to the datum.
        # For now we assume it's just the number of rules to choose from, but it's probably more complex than that. TODO
        if applicable_rules:
            return len(applicable_rules)
        else:
            return float('inf')

    def _is_subset(self, nodes1, nodes2):
        """Return True iff the first collection of nodes is a subset of the second."""
        for node in nodes1:
            if node not in nodes2:
                return False
        return True

    def __repr__(self):
        result = "<Grammar instance>\n"
        result += "\tRules:\n"
        for rule in self.rules:
            result += "\t\t{}\n".format(rule)
        return result[:-1]  # Remove the trailing newline

    def __eq__(self, other):
        return repr(self) == repr(other)

    def __hash__(self):
        return hash(repr(self))
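

# Illustrative usage sketch (not part of the original module). It builds a tiny
# grammar with minimal stand-in objects that expose only the attributes this file
# actually reads (surface_form, feature_nodes, root_node, input_nodes, output_affix,
# environment_roots, get_encoding_length); the real Rule and SyntacticNode classes
# in this project may look different.
if __name__ == "__main__":

    class _StubNode:
        """Hypothetical node carrying only a surface form."""
        def __init__(self, surface_form):
            self.surface_form = surface_form

    class _StubRule:
        """Hypothetical rule with a toy encoding-length cost."""
        def __init__(self, input_nodes, output_affix, environment_roots=()):
            self.input_nodes = input_nodes
            self.output_affix = output_affix
            self.environment_roots = list(environment_roots)

        def get_encoding_length(self):
            # Toy cost: one unit per affix character plus one per input node.
            return len(self.output_affix) + len(self.input_nodes)

    class _StubDatum:
        """Hypothetical datum: a surface form plus its root and feature nodes."""
        def __init__(self, surface_form, root_node, feature_nodes):
            self.surface_form = surface_form
            self.root_node = root_node
            self.feature_nodes = feature_nodes

    root = _StubNode("walk")
    past = _StubNode("PAST")
    rule = _StubRule(input_nodes=[past], output_affix="ed")
    grammar = Grammar(
        nodes_by_type={SyntacticNode.TYPE_ROOT: [root]},
        vocabulary=["walk", "ed"],
        rules=[rule],
    )
    datum = _StubDatum("walked", root, feature_nodes=[past])

    # MDL-style score of the grammar plus one datum: the grammar's own encoding
    # length plus the encoding length of the datum given the grammar.
    print(grammar.get_encoding_length())                              # 4 + 2 + (2 + 1) = 9
    print(grammar.get_encoding_length_of_datum_given_grammar(datum))  # 1 (one applicable rule)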