Also use matching tokens to score tree similarity.

2017-05-24 22:09:26 +02:00 · 2017-05-24 22:09:26 +02:00 · e6cbb54382
commit e6cbb54382
parent 75d690120b
1 changed file with 11 additions and 6 deletions
--- a/naive-nlu/tree_nlu/parsing.py
+++ b/naive-nlu/tree_nlu/parsing.py
@@ -88,7 +88,7 @@ def integrate_language(knowledge_base, example):

        for position, atom in lower_levels:
            logging.debug("\x1b[1mSelecting\x1b[0m: {}".format(atom))
-            similar = get_similar_tree(knowledge_base, atom)
+            similar = get_similar_tree(knowledge_base, atom, tokens)
            remix, (start_bounds, end_bounds) = build_remix_matrix(knowledge_base, tokens, atom, similar)
            _, matcher, result = make_template(knowledge_base, tokens, atom)
            logging.debug("Tx: {}".format(tokens))
@@ -138,7 +138,7 @@ def apply_remix(tokens, remix):
 def build_remix_matrix(knowledge_base, tokens, atom, similar):
    tokens = list(tokens)
    tokens, matcher, result = make_template(knowledge_base, tokens, atom)
-    similar_matcher, similar_result, similar_result_resolved, _ = similar
+    similar_matcher, similar_result, similar_result_resolved, _, _ = similar

    start_bounds, end_bounds = find_bounds(matcher, similar_matcher)

@@ -219,7 +219,7 @@ def find_bounds(matcher, similar_matcher):
    return start_bounds, end_bounds


-def get_similar_tree(knowledge_base, atom):
+def get_similar_tree(knowledge_base, atom, tokens):
    possibilities = []

    # Find matching possibilities
@@ -243,12 +243,17 @@ def get_similar_tree(knowledge_base, atom):
                    raw))

        # TODO: Probably should take into account the categories of the elements in the "intake" ([0]) element
-        score = sum([resolved[i] == atom[i]
+        atom_score = sum([resolved[i] == atom[i]
                     for i
                     in range(min(len(resolved),
                                  len(atom)))])
-        sorted_possibilities.append((raw, possibility, resolved, score))
-    sorted_possibilities = sorted(sorted_possibilities, key=lambda p: p[3], reverse=True)
+        token_score = sum([similar_token in tokens
+                           for similar_token
+                           in raw])
+
+        sorted_possibilities.append((raw, possibility, resolved, atom_score, token_score))
+
+    sorted_possibilities = sorted(sorted_possibilities, key=lambda p: p[3] * 100 + p[4], reverse=True)
    if len(sorted_possibilities) < 1:
        return None