From 998a183fd2bdcf8b89f1f0e18c22f64ca878af8f Mon Sep 17 00:00:00 2001 From: kenkeiras Date: Sun, 15 Apr 2018 17:47:04 +0200 Subject: [PATCH] Dig deeper in cut-by-token approach. --- naive-nlu/tree_nlu/knowledge_base.py | 3 +- naive-nlu/tree_nlu/parsing.py | 91 ++++++++++++++++++++---- naive-nlu/tree_nlu/test.py | 6 +- naive-nlu/tree_nlu/tests/tokenization.py | 2 + 4 files changed, 86 insertions(+), 16 deletions(-) diff --git a/naive-nlu/tree_nlu/knowledge_base.py b/naive-nlu/tree_nlu/knowledge_base.py index b796d43..3e09ec6 100644 --- a/naive-nlu/tree_nlu/knowledge_base.py +++ b/naive-nlu/tree_nlu/knowledge_base.py @@ -75,7 +75,7 @@ class KnowledgeBase(object): session().log("Results:\n{}".format('\n'.join(map(str, options)))) if return_one: - chosen = parsing.pick_one_tokenization(options) + chosen = parsing.pick_one_tokenization(options, self) session().log("Chosen: “{}”".format(chosen)) return chosen return options @@ -92,6 +92,7 @@ class KnowledgeBase(object): knowledge_before = copy.deepcopy(self.knowledge) with session().log("Process: {}".format(row)): tokens = self.tokenize(row) + print(tokens) fit = parsing.get_fit(self, tokens) if fit is None: diff --git a/naive-nlu/tree_nlu/parsing.py b/naive-nlu/tree_nlu/parsing.py index 1450636..5683943 100644 --- a/naive-nlu/tree_nlu/parsing.py +++ b/naive-nlu/tree_nlu/parsing.py @@ -13,6 +13,29 @@ from .modifiable_property import ModifiableProperty from . import parameters from .atoms import Atom, a +def lookahead_for_tokens_or_strucutral_elements(knowledge_base, remaining): + for se in knowledge_base.structural_elements: + found_position = remaining.find(se) + found = found_position >= 0 + session().annotate('Looking for structure with “{}”, found? {}'.format(se, found)) + if found: + return [ + (remaining[:found_position], se, remaining[found_position + len(se):]) + ] + + for token in knowledge_base.knowledge.keys(): + found_position = remaining.find(token) + found = found_position >= 0 + session().annotate('Looking for token “{}”, found? {}'.format(token, found)) + if found: + return [ + (remaining[:found_position], token, remaining[found_position + len(token):]) + ] + + return None + + + def to_tokens(knowledge_base, text, acc=None): # TODO This is an extra-naïve implementation found = 0 @@ -33,10 +56,29 @@ def to_tokens(knowledge_base, text, acc=None): remaining = remaining[len(thing):] possibility.append(thing) else: - if i + 1 >= len(tokenization): # Last element - session().annotate("Token not found, considering it all of “{}”".format(remaining)) - possibility.append(remaining) - remaining = "" + if i + 1 >= len(tokenization): # Last element, lookahead for tokens/structural elements + with session().log("Token not found, looking ahead for splits on “{}”".format(remaining)): + # If we start with remaining[0:] it's not a real lookahead + # ... and it can get us trapped on infinite recursion + splits = lookahead_for_tokens_or_strucutral_elements(knowledge_base, remaining[1:]) + + if splits is None: + session().log("No splits found, keeping remaining as token “{}”".format(remaining)) + + possibility.append(remaining) + remaining = "" + + else: + # Consider we only have one possibility + assert len(splits) == 1 + + before_split, pivot, after_split = splits[0] + before_split = remaining[0] + before_split + + session().log("1 split found, cutting on token “{}”, keeping “{}”".format(found, before_split)) + + possibility.append(before_split) + remaining = pivot + after_split else: # Not las element, use the next one as cutter # Try with (HYPERSIMPLISTIC!) backtracking @@ -54,15 +96,17 @@ def to_tokens(knowledge_base, text, acc=None): if remaining.find(token) < 0: # Not inmediately after! break remaining = remaining[len(token):] - session().annotate("OK, remaining: {}".format(remaining)) + session().annotate("OK, remaining: “{}” with {} items".format(remaining, len(tokenization) - (i + 1))) else: # Tokenization applicable found += 1 if remaining == '': + session().log("Concluded possibility “{}”".format(possibility)) yield possibility else: - for consecuent in to_tokens(knowledge_base, remaining, possibility): - yield list(filter(lambda x: x != '', possibility + consecuent)) + with session().log("Continuing with “{}”".format(remaining)): + for consecuent in to_tokens(knowledge_base, remaining, possibility): + yield list(filter(lambda x: x != '', possibility + consecuent)) if found == 0: raise Exception('No tokenization found') @@ -108,15 +152,38 @@ def integrate_token_to_text_matching(knowledge_base, text, tokens): knowledge_base.add_tokenization(tuple(elements)) -def pick_one_tokenization(options): +def pick_one_tokenization(options, knowledge_base): ''' Heuristic function to pick the most probable tokenization. Just pick the one with more results. ''' - return sorted(options, - key=lambda tokenization: len(tokenization), - reverse=True)[0] + with session().log("Picking among: {} options\n{}".format(len(options), '\n'.join(map(str, options)))): + return pick_by_score(options, + [ + # First by number of splits + lambda tokenization: len(tokenization), + + # Among them, by number of splits without structuring elements + lambda tokenization: sum(map( + lambda split: -sum(map( + lambda se: se in split, knowledge_base.structural_elements + )), tokenization)) + ]) + +def pick_by_score(options, heuristics): + for heuristic in heuristics: + assert(len(options) > 0) + options = list(map(lambda opt: (heuristic(opt), opt), options)) + sorted_options = sorted(options, key=lambda x: x[0], reverse=True) + + heuristic_cutoff = sorted_options[0][0] + pass_heuristic = [opt for (score, opt) in sorted_options if score <= heuristic_cutoff] + options = pass_heuristic + + session().log("{} finalists: \n{}".format(len(options), '\n'.join(map(str, options)))) + return options[0] + def make_template(knowledge_base, tokens, parsed): matcher = list(tokens) @@ -188,7 +255,7 @@ def integrate_language(knowledge_base, example): parsed = example["parsed"] resolved_parsed = copy.deepcopy(parsed) - tokens = list(pick_one_tokenization(to_tokens(knowledge_base, text))) + tokens = list(pick_one_tokenization(to_tokens(knowledge_base, text), knowledge_base)) while True: session().annotate("P: {}".format(resolved_parsed)) diff --git a/naive-nlu/tree_nlu/test.py b/naive-nlu/tree_nlu/test.py index 683f85e..11cd561 100644 --- a/naive-nlu/tree_nlu/test.py +++ b/naive-nlu/tree_nlu/test.py @@ -11,9 +11,9 @@ logging.getLogger().setLevel(logging.ERROR) tests = ( ("tokenization", tokenization), - ("basic", basic), - ("gac 100", gac_100), - ("gac+", gac_extension), + # ("basic", basic), + # ("gac 100", gac_100), + # ("gac+", gac_extension), ) diff --git a/naive-nlu/tree_nlu/tests/tokenization.py b/naive-nlu/tree_nlu/tests/tokenization.py index 0bc1a80..4b91dae 100644 --- a/naive-nlu/tree_nlu/tests/tokenization.py +++ b/naive-nlu/tree_nlu/tests/tokenization.py @@ -65,6 +65,8 @@ def main(): with session().log(example['text']): tokens = list(knowledge.tokenize(example['text'])) + print(tokens) + print(example['tokens']) assert example['tokens'] == tokens else: