From 75174e17368e7312a9930d9bb8cbe289272fe663 Mon Sep 17 00:00:00 2001 From: kenkeiras Date: Mon, 2 Oct 2017 23:37:20 +0200 Subject: [PATCH] Increase exploration, remove unnecessary initial knowledge. --- naive-nlu/tree_nlu/parsing.py | 184 ++++++++++++++++------------ naive-nlu/tree_nlu/tests/gac_100.py | 18 --- 2 files changed, 106 insertions(+), 96 deletions(-) diff --git a/naive-nlu/tree_nlu/parsing.py b/naive-nlu/tree_nlu/parsing.py index d539a28..8081265 100644 --- a/naive-nlu/tree_nlu/parsing.py +++ b/naive-nlu/tree_nlu/parsing.py @@ -28,7 +28,7 @@ def make_template(knowledge_base, tokens, parsed): if word in template: template[template.index(word)] = i matcher[i] = { - 'groups': set(knowledge_base.knowledge[word]['groups']) + 'groups': set(knowledge_base.knowledge.get(word, {}).get('groups', set())), } return tokens, matcher, template @@ -98,8 +98,15 @@ def integrate_language(knowledge_base, example): for position, atom in lower_levels: with session().log("Atom {}".format(atom)): - similar = get_similar_tree(knowledge_base, atom, tokens) - remix, (start_bounds, end_bounds) = build_remix_matrix(knowledge_base, tokens, atom, similar) + similars = get_similar_tree(knowledge_base, atom, tokens) + for similar in similars: + result = build_remix_matrix(knowledge_base, tokens, atom, similar) + if result is not None: + break + + if result is None: + raise Exception("No match found") + remix, (start_bounds, end_bounds) = result after_remix = apply_remix(tokens[len(start_bounds):-len(end_bounds)], remix) session().annotate("--FIND MIX--") @@ -161,38 +168,47 @@ def apply_remix(tokens, remix): def build_remix_matrix(knowledge_base, tokens, atom, similar): tokens = list(tokens) - tokens, matcher, result = make_template(knowledge_base, tokens, atom) - similar_matcher, similar_result, similar_result_resolved, _, _ = similar + with session().log("Remix matrix for {} - {}".format(tokens, atom)): + tokens, matcher, result = make_template(knowledge_base, tokens, atom) + similar_matcher, similar_result, similar_result_resolved, _, _ = similar - start_bounds, end_bounds = find_bounds(knowledge_base, matcher, similar_matcher) + start_bounds, end_bounds = find_bounds(knowledge_base, matcher, similar_matcher) - for i, element in (end_bounds + start_bounds[::-1]): - matcher.pop(i) - tokens.pop(i) + for i, element in (end_bounds + start_bounds[::-1]): + matcher.pop(i) + tokens.pop(i) - possible_remixes = get_possible_remixes(knowledge_base, matcher, similar_matcher) - chosen_remix = possible_remixes[0] + possible_remixes = get_possible_remixes(knowledge_base, matcher, similar_matcher) + session().annotate("Possible remixes: {}".format(possible_remixes)) + if len(possible_remixes) < 1: + return None - return chosen_remix, (start_bounds, end_bounds) + chosen_remix = possible_remixes[0] + + return chosen_remix, (start_bounds, end_bounds) def get_possible_remixes(knowledge_base, matcher, similar_matcher): matrix = [] - for element in matcher: - session().annotate("- {}".format(element)) - session().annotate("+ {}".format(similar_matcher)) - if element in similar_matcher or isinstance(element, dict): - if isinstance(element, dict): - indexes = all_matching_indexes(knowledge_base, similar_matcher, element) - else: - indexes = all_indexes(similar_matcher, element) - matrix.append(indexes) - else: - matrix.append([element]) + with session().log("Possible remixes from matcher: {}".format(matcher)): + for element in matcher: + with session().log("Element `{}`".format(element)): + session().annotate("Similar `{}`".format(similar_matcher)) + if element in similar_matcher or isinstance(element, dict): + if isinstance(element, dict): + indexes = all_matching_indexes(knowledge_base, similar_matcher, element) + session().annotate("Dict element matching: {}".format(indexes)) + else: + indexes = all_indexes(similar_matcher, element) + session().annotate("* element matching: {}".format(indexes)) + matrix.append(indexes) + else: + session().annotate("`else` element matching: [element]") + matrix.append([element]) - # TODO: do some scoring to find the most "interesting combination" - return [list(x) for x in list(zip(*matrix))] + # TODO: do some scoring to find the most "interesting combination" + return [list(x) for x in list(zip(*matrix))] def all_indexes(collection, element): @@ -298,12 +314,14 @@ def get_similar_tree(knowledge_base, atom, tokens): return None for i, possibility in enumerate(sorted_possibilities): - similar_matcher, similar_result, similar_result_resolved, _, _ = possibility + similar_matcher, similar_result, similar_result_resolved, _atom_score, _token_score = possibility with session().log("Like {}".format(similar_matcher)): - session().annotate('Results on: {}'.format(similar_result_resolved)) session().annotate('AST: {}'.format(similar_result)) + session().annotate('Results on: {}'.format(similar_result_resolved)) + session().annotate('Atom score: {}'.format(_atom_score)) + session().annotate('Token score: {}'.format(_token_score)) - return sorted_possibilities[0] + return sorted_possibilities # TODO: unroll this mess @@ -375,14 +393,14 @@ def reverse_remix(tree_section, remix): def get_fit(knowledge, tokens, remaining_recursions=parameters.MAX_RECURSIONS): results = [] for matcher, ast in knowledge.trained: - result = match_fit(knowledge, tokens, matcher, ast, - remaining_recursions) + with session().log("{} <- {}".format(matcher, tokens)): + result = match_fit(knowledge, tokens, matcher, ast, + remaining_recursions) - if result is not None: - results.append(result) - session().annotate("XXX {}".format(result)) + if result is not None: + with session().log("Result: {}".format(result)): + results.append(result) - session().annotate(' - ' + '\n - '.join(map(str, results))) if len(results) > 0: return results[0] @@ -407,19 +425,20 @@ def resolve_fit(knowledge, fit, remaining_recursions): if is_definite_minisegment(element): fitted.append(element) else: - ((result_type, remixer), tokens) = element - remixed_tokens = reverse_remix(tokens, remixer) - if remixed_tokens is None: - return None + with session().log("Resolving fit of `{}`".format(element)): + ((result_type, remixer), tokens) = element + remixed_tokens = reverse_remix(tokens, remixer) + if remixed_tokens is None: + return None - minifit = get_fit(knowledge, remixed_tokens, remaining_recursions - 1) - if minifit is None: - return None + minifit = get_fit(knowledge, remixed_tokens, remaining_recursions - 1) + if minifit is None: + return None - minitokens, miniast = minifit - session().annotate(" AST | {}".format(miniast)) - subproperty = knowledge_evaluation.resolve(knowledge.knowledge, minitokens, miniast) - fitted.append(subproperty) + minitokens, miniast = minifit + session().annotate(" AST | {}".format(miniast)) + subproperty = knowledge_evaluation.resolve(knowledge.knowledge, minitokens, miniast) + fitted.append(subproperty) return fitted @@ -430,33 +449,38 @@ def match_fit(knowledge, tokens, matcher, ast, remaining_recursions): session().annotate(indent + 'T> {}'.format(tokens)) session().annotate(indent + 'M> {}'.format(matcher)) for minisegment in matcher: - possibilities_after_round = [] - session().annotate(indent + "MS {}".format(minisegment)) - for matched_tokens, remaining_tokens in segment_possibilities: - if len(remaining_tokens) < 1: - continue + with session().log("Minisegment `{}`".format(minisegment)): + possibilities_after_round = [] + for matched_tokens, remaining_tokens in segment_possibilities: + if len(remaining_tokens) < 1: + continue - session().annotate(indent + "RT {}".format(remaining_tokens[0])) - session().annotate(indent + "DEF {}".format(is_definite_minisegment(minisegment))) - if is_definite_minisegment(minisegment): - # What if not match -----< - if match_token(knowledge, remaining_tokens[0], minisegment): - possibilities_after_round.append(( - matched_tokens + [remaining_tokens[0]], - remaining_tokens[1:] - )) + session().annotate(indent + "RT {}".format(remaining_tokens[0])) + session().annotate(indent + "DEF {}".format(is_definite_minisegment(minisegment))) + if is_definite_minisegment(minisegment): + # What if not match -----< + if match_token(knowledge, remaining_tokens[0], minisegment): + possibilities_after_round.append(( + matched_tokens + [remaining_tokens[0]], + remaining_tokens[1:] + )) + else: + # What if not match!!!!!!-----< + # TODO: optimize this with a look ahead + for i in range(1, len(tokens)): + possibilities_after_round.append(( + matched_tokens + [(minisegment, remaining_tokens[:i])], + remaining_tokens[i:] + )) + session().annotate(indent + "## PA {}".format(possibilities_after_round)) else: - # What if not match!!!!!!-----< - # TODO: optimize this with a look ahead - for i in range(1, len(tokens)): - possibilities_after_round.append(( - matched_tokens + [(minisegment, remaining_tokens[:i])], - remaining_tokens[i:] - )) - session().annotate(indent + "## PA {}".format(possibilities_after_round)) - else: - segment_possibilities = possibilities_after_round - session().annotate(">>>> {}".format(len(segment_possibilities))) + segment_possibilities = possibilities_after_round + for possibility in segment_possibilities: + with session().log("Possibility: `{}`".format(possibility)): + pass + if len(segment_possibilities) < 1: + with session().log("NO POSSIBLE"): + pass fully_matched_segments = [(matched, remaining) for (matched, remaining) @@ -464,15 +488,19 @@ def match_fit(knowledge, tokens, matcher, ast, remaining_recursions): if len(remaining) == 0] resolved_fits = [] - for fit, _ in fully_matched_segments: - session().annotate(indent + "::: {}".format(fit)) # REMIXES HAVE TO BE APPLIED BEFORE!!! + with session().log("Full matches"): + for fit, _ in fully_matched_segments: + with session().log(fit): # REMIXES HAVE TO BE APPLIED BEFORE!!! + pass - session().annotate(indent + '*' * 20) - for fit, _ in fully_matched_segments: - session().annotate(indent + "::: {}".format(fit)) # REMIXES HAVE TO BE APPLIED BEFORE!!! - resolved_fit = resolve_fit(knowledge, fit, remaining_recursions) - if resolved_fit is not None: - resolved_fits.append(resolved_fit) + with session().log("Resolutions"): + for fit, _ in fully_matched_segments: + with session().log("Resolving {}".format(fit)): # REMIXES HAVE TO BE APPLIED BEFORE!!! + resolved_fit = resolve_fit(knowledge, fit, remaining_recursions) + if resolved_fit is not None: + resolved_fits.append(resolved_fit) + else: + session().annotate("Not resolved") if len(resolved_fits) == 0: return None diff --git a/naive-nlu/tree_nlu/tests/gac_100.py b/naive-nlu/tree_nlu/tests/gac_100.py index acfe23e..5c57766 100644 --- a/naive-nlu/tree_nlu/tests/gac_100.py +++ b/naive-nlu/tree_nlu/tests/gac_100.py @@ -655,24 +655,9 @@ examples = [ ] base_knowledge = { - 'icecream': { - "groups": {'noun', 'object', 'comestible', 'sweet'}, - }, - 'hot': { - "groups": {'property', 'temperature'}, - }, 'summer': { "groups": {'epoch'}, }, - 'planet': { - "groups": {'noun', 'group'}, - }, - 'green': { - "groups": {'noun', 'color', 'concept'}, - }, - 'milk': { - "groups": {'noun'}, - }, 'fly': { "groups": {'verb'}, }, @@ -682,9 +667,6 @@ base_knowledge = { 'electricity': { "groups": {'power'}, }, - 'french': { - "groups": {'language'}, - } } def main():