diff --git a/naive-nlu/tree_nlu/knowledge_base.py b/naive-nlu/tree_nlu/knowledge_base.py index b34efe7..b796d43 100644 --- a/naive-nlu/tree_nlu/knowledge_base.py +++ b/naive-nlu/tree_nlu/knowledge_base.py @@ -71,9 +71,13 @@ class KnowledgeBase(object): def tokenize(self, row, return_one=True): row = row.lower() with session().log("Tokenize: {}".format(row)): - options = parsing.to_tokens(self, row) + options = list(parsing.to_tokens(self, row)) + session().log("Results:\n{}".format('\n'.join(map(str, options)))) + if return_one: - return parsing.pick_one_tokenization(options) + chosen = parsing.pick_one_tokenization(options) + session().log("Chosen: “{}”".format(chosen)) + return chosen return options def add_tokenization(self, tokenization): diff --git a/naive-nlu/tree_nlu/parsing.py b/naive-nlu/tree_nlu/parsing.py index 198bda2..1450636 100644 --- a/naive-nlu/tree_nlu/parsing.py +++ b/naive-nlu/tree_nlu/parsing.py @@ -18,25 +18,32 @@ def to_tokens(knowledge_base, text, acc=None): found = 0 for tokenization in knowledge_base.tokenization: + with session().log("Tokenization {}".format(tokenization)): remaining = text possibility = [] + # Apply tokenization to all elements for i, token in enumerate(tokenization): + with session().log("T “{}” over “{}”".format(token, remaining)): if token == Atom('token'): for thing in knowledge_base.knowledge.keys(): + session().annotate("Testing with “{}”".format(thing)) if remaining.startswith(thing): # TODO We should also branch here, probably :\ remaining = remaining[len(thing):] possibility.append(thing) else: - if i + 1 >= len(tokenization): + if i + 1 >= len(tokenization): # Last element + session().annotate("Token not found, considering it all of “{}”".format(remaining)) possibility.append(remaining) remaining = "" - else: + else: # Not last element, use the next one as cutter # Try with (HYPERSIMPLISTIC!) backtracking # Cut using the next token we should use more!!! 
next_token = tokenization[i + 1] + session().annotate("Trying to cut for next token on “{}”".format(next_token)) + cutoff = remaining.find(next_token) if cutoff < 0: break @@ -47,7 +54,7 @@ def to_tokens(knowledge_base, text, acc=None): if remaining.find(token) < 0: # Not inmediately after! break remaining = remaining[len(token):] - + session().annotate("OK, remaining: {}".format(remaining)) else: # Tokenization applicable found += 1 diff --git a/naive-nlu/tree_nlu/tests/tokenization.py b/naive-nlu/tree_nlu/tests/tokenization.py index 5a62def..0bc1a80 100644 --- a/naive-nlu/tree_nlu/tests/tokenization.py +++ b/naive-nlu/tree_nlu/tests/tokenization.py @@ -26,7 +26,10 @@ EXAMPLES = [ "text": 'text separated by spaces', "tokens": ['text', 'separated', 'by', 'spaces'], }), - + ('example', { + "text": 'is earth a planet?', + "tokens": ['is', 'earth', 'a', 'planet', '?'], + }), ('test', { "text": 'plane', "tokens": ['plane'], @@ -39,6 +42,10 @@ EXAMPLES = [ ('test', { "text": 'some other text', "tokens": ['some', 'other', 'text'], + }), + ('test', { + "text": 'is the sun a star?', + "tokens": ['is', 'the', 'sun', 'a', 'star', '?'], }) ]