From 0e192407204f40a230c91a42b403da0e25ad335f Mon Sep 17 00:00:00 2001
From: kenkeiras
Date: Sat, 13 May 2017 20:28:11 +0200
Subject: [PATCH] =?UTF-8?q?Building=20base=20(extremely=20na=C3=AFve)=20su?=
 =?UTF-8?q?bgraph=20remix=20matrix.?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 naive-nlu/knowledge_base.py |   9 +-
 naive-nlu/parsing.py        | 173 ++++++++++++++++++++++++++++++++++--
 2 files changed, 171 insertions(+), 11 deletions(-)

diff --git a/naive-nlu/knowledge_base.py b/naive-nlu/knowledge_base.py
index 3368df8..310d6f7 100644
--- a/naive-nlu/knowledge_base.py
+++ b/naive-nlu/knowledge_base.py
@@ -30,13 +30,10 @@ class KnowledgeBase(object):
                 "parsed": inferred_tree,
             })
             self.act_upon(result)
-            parsed_examples.append((decomposition, inferred_tree))
+            self.examples.append((decomposition, inferred_tree))
 
-        # Reduce values
-        trained = parsing.reprocess_language_knowledge(self, parsed_examples)
-
-        self.examples += parsed_examples
-        self.trained = trained
+        # Reduce values
+        self.trained = parsing.reprocess_language_knowledge(self, self.examples)
 
         knowledge_after = copy.deepcopy(self.knowledge)
         knowledge_diff_getter = lambda: diff_knowledge(knowledge_before,
diff --git a/naive-nlu/parsing.py b/naive-nlu/parsing.py
index f0df874..de57d64 100644
--- a/naive-nlu/parsing.py
+++ b/naive-nlu/parsing.py
@@ -1,10 +1,12 @@
 #!/usr/bin/env python
 
+import knowledge_evaluation
+
+import re
 from functools import reduce
 
-
 def make_template(knowledge_base, text, parsed):
-    tokens = text.split()
+    tokens = re.findall(r'(\w+|[^\s])', text)
     matcher = list(tokens)
     template = list(parsed)
     for i in range(len(matcher)):
@@ -17,16 +19,177 @@ def make_template(knowledge_base, text, parsed):
     return tokens, matcher, template
 
 
+def is_bottom_level(tree):
+    for element in tree:
+        if isinstance(element, list) or isinstance(element, tuple):
+            return False
+    return True
+
+
+def get_lower_levels(parsed):
+    lower = []
+    def aux(subtree, top_level):
+        nonlocal lower
+        deeper = top_level
+        for element in subtree:
+            if isinstance(element, list) or isinstance(element, tuple):
+                aux(element, top_level=False)
+                deeper = True
+
+        if not deeper:
+            lower.append(subtree)
+
+    aux(parsed, top_level=True)
+    return lower
+
+
 def integrate_language(knowledge_base, example):
     text = example["text"].lower()
     parsed = example["parsed"]
+
+    print("P:", parsed)
+    while True:
+        lower_levels = get_lower_levels(parsed)
+        print("Lower:", lower_levels)
+        if len(lower_levels) == 0:
+            break
+
+        for atom in lower_levels:
+            print("\x1b[1mSelecting\x1b[0m:", atom)
+            similar = get_similar_tree(knowledge_base, atom)
+            print("___>", similar)
+            remix, (start_bounds, end_bounds) = build_remix_matrix(knowledge_base, text, atom, similar)
+            tokens, matcher, result = make_template(knowledge_base, text, atom)
+            print("Tx:", tokens)
+            print("Mx:", matcher)
+            print("Rx:", result)
+            print("#########")
+
+        break
+
     tokens, matcher, result = make_template(knowledge_base, text, parsed)
-    print(text)
-    print(parsed)
+    print("T:", tokens)
+    print("M:", matcher)
+    print("R:", result)
     print()
     return tokens, matcher, result
 
 
+def build_remix_matrix(knowledge_base, text, atom, similar):
+    # print("+" * 20)
+
+    tokens, matcher, result = make_template(knowledge_base, text, atom)
+    similar_matcher, similar_result, similar_result_resolved, _ = similar
+
+    # print("NEW:")
+    # print("Tokens:", tokens)
+    # print("Matcher:", matcher)
+    # print("Result:", result)
+    # print()
+    # print("Similar:")
+    # print("Matcher:", similar_matcher)
+    # print("Result:", similar_result)
+
+    start_bounds, end_bounds = find_bounds(matcher, similar_matcher)
+    # print()
+    # print("Bounds:")
+    # print("Start:", start_bounds)
+    # print("End: ", end_bounds)
+
+    for i, element in (end_bounds + start_bounds[::-1]):
+        matcher.pop(i)
+        tokens.pop(i)
+
+    possible_remixes = get_possible_remixes(matcher, similar_matcher)
+    chosen_remix = possible_remixes[0]
+
+    # print("New tokens:", tokens)
+    # print("-" * 20)
+    return chosen_remix, (start_bounds, end_bounds)
+
+
+def get_possible_remixes(matcher, similar_matcher):
+    # print("*" * 20)
+    # print(matcher)
+    # print(similar_matcher)
+
+    matrix = []
+    for element in matcher:
+        assert(element in similar_matcher)
+        indexes = all_indexes(similar_matcher, element)
+        matrix.append(indexes)
+
+    # print(matrix)
+    # print([list(x) for x in list(zip(*matrix))])
+    # TODO: do some scoring to find the most "interesting combination"
+    return [list(x) for x in list(zip(*matrix))]
+
+
+def all_indexes(collection, element):
+    indexes = []
+    base = 0
+
+    for _ in range(collection.count(element)):
+        i = collection.index(element, base)
+        base = i + 1
+        indexes.append(i)
+
+    return indexes
+
+
+def find_bounds(matcher, similar_matcher):
+    start_bounds = []
+    for i, element in enumerate(matcher):
+        if element in similar_matcher:
+            break
+        else:
+            start_bounds.append((i, element))
+
+    end_bounds = []
+    for i, element in enumerate(matcher[::-1]):
+        if element in similar_matcher:
+            break
+        else:
+            end_bounds.append((len(matcher) - (i + 1), element))
+
+    return start_bounds, end_bounds
+
+
+def get_similar_tree(knowledge_base, atom):
+    possibilities = []
+
+    # Find matching possibilities
+    for entry, tree in knowledge_base.trained:
+        if not is_bottom_level(tree):
+            continue
+        if tree[0] == atom[0]:
+            possibilities.append((entry, tree))
+
+    # Sort by more matching elements
+    sorted_possibilities = []
+    for (raw, possibility) in possibilities:
+        resolved = []
+        for element in atom:
+            if isinstance(element, str):
+                resolved.append(element)
+            else:
+                resolved.append(knowledge_evaluation.resolve(
+                    knowledge_base.knowledge,
+                    element,
+                    raw))
+
+        # TODO: Probably should take into account the categories of the elements in the "intake" ([0]) element
+        score = sum([resolved[i] == atom[i]
+                     for i
+                     in range(min(len(resolved),
+                                  len(atom)))])
+        sorted_possibilities.append((raw, possibility, resolved, score))
+    sorted_possibilities = sorted(sorted_possibilities, key=lambda p: p[3], reverse=True)
+    if len(sorted_possibilities) < 1:
+        return None
+
+    return sorted_possibilities[0]
+
 
 def get_matching(sample, other):
     l = len(sample[0])
     other = list(filter(lambda x: len(x[0]) == l, other))
@@ -56,13 +219,13 @@ def reprocess_language_knowledge(knowledge_base, examples):
     pattern_examples = []
     for i, sample in enumerate(examples):
         other = examples[:i] + examples[i + 1:]
-        print(sample)
         match = get_matching(sample, other)
         print("->", match)
        if len(match) > 0:
             sample = (match, sample[1],)
             pattern_examples.append(sample)
         print()
+    print("\x1b[7m--\x1b[0m")
 
     return pattern_examples