From d63781a0d2f4cad67860262eccd2c756d5cb00f2 Mon Sep 17 00:00:00 2001 From: kenkeiras Date: Sun, 15 Apr 2018 20:45:24 +0200 Subject: [PATCH] Learn from tokenizations inferred. --- naive-nlu/tree_nlu/knowledge_base.py | 1 + naive-nlu/tree_nlu/tests/tokenization.py | 16 ++++++++++------ naive-nlu/tree_nlu/utils/tokenization.py | 5 +++++ 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/naive-nlu/tree_nlu/knowledge_base.py b/naive-nlu/tree_nlu/knowledge_base.py index 218b09a..8e12f5e 100644 --- a/naive-nlu/tree_nlu/knowledge_base.py +++ b/naive-nlu/tree_nlu/knowledge_base.py @@ -121,6 +121,7 @@ class KnowledgeBase(object): if return_one: chosen = parsing.pick_one_tokenization(options, self) session().log("Chosen: “{}”".format(chosen)) + self.train_tokenizer({'text': row, 'tokens': chosen}) return chosen return options diff --git a/naive-nlu/tree_nlu/tests/tokenization.py b/naive-nlu/tree_nlu/tests/tokenization.py index 7e93d59..6b61fc4 100644 --- a/naive-nlu/tree_nlu/tests/tokenization.py +++ b/naive-nlu/tree_nlu/tests/tokenization.py @@ -34,11 +34,11 @@ EXAMPLES = [ "text": 'plane', "tokens": ['plane'], }), - ('test', { - "text": 'planes', - "tokens": ['planes'], - "meaning": { 'planes': ('add-modifier', 'plane', 'plural') }, - }), + # ('test', { + # "text": 'planes', + # "tokens": ['planes'], + # "meaning": { 'planes': ('add-modifier', 'plane', 'plural') }, + # }), ('test', { "text": 'some other text', "tokens": ['some', 'other', 'text'], @@ -46,6 +46,10 @@ EXAMPLES = [ ('test', { "text": 'is the sun a star?', "tokens": ['is', 'the', 'sun', 'a', 'star', '?'], + }), + ('test', { + "text": 'sometextnotseparatedbyspaces', + "tokens": ['some', 'text', 'not', 'separated', 'by', 'spaces'], }) ] @@ -66,7 +70,7 @@ def main(): tokens = list(knowledge.tokenize(example['text'])) session().log('Expected “{}”, found “{}”' - .format(tokens, example['tokens'])) + .format(example['tokens'], tokens)) assert example['tokens'] == tokens else: diff --git a/naive-nlu/tree_nlu/utils/tokenization.py b/naive-nlu/tree_nlu/utils/tokenization.py index b763584..4664923 100644 --- a/naive-nlu/tree_nlu/utils/tokenization.py +++ b/naive-nlu/tree_nlu/utils/tokenization.py @@ -7,6 +7,11 @@ BASIC_TOKENIZATION_EXAMPLES = ( "text": 'cat', "tokens": ['cat'], }), + ({ + "text": 'cats', + "tokens": ['cats'], + "meaning": { 'cats': ('add-modifier', 'cat', 'plural') }, + }), ({ "text": 'text separated by spaces', "tokens": ['text', 'separated', 'by', 'spaces'],