From d63781a0d2f4cad67860262eccd2c756d5cb00f2 Mon Sep 17 00:00:00 2001
From: kenkeiras <kenkeiras@codigoparallevar.com>
Date: Sun, 15 Apr 2018 20:45:24 +0200
Subject: [PATCH] Learn from inferred tokenizations.

---
 naive-nlu/tree_nlu/knowledge_base.py     |  1 +
 naive-nlu/tree_nlu/tests/tokenization.py | 16 ++++++++++------
 naive-nlu/tree_nlu/utils/tokenization.py |  5 +++++
 3 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/naive-nlu/tree_nlu/knowledge_base.py b/naive-nlu/tree_nlu/knowledge_base.py
index 218b09a..8e12f5e 100644
--- a/naive-nlu/tree_nlu/knowledge_base.py
+++ b/naive-nlu/tree_nlu/knowledge_base.py
@@ -121,6 +121,7 @@ class KnowledgeBase(object):
             if return_one:
                 chosen = parsing.pick_one_tokenization(options, self)
                 session().log("Chosen: “{}”".format(chosen))
+                self.train_tokenizer({'text': row, 'tokens': chosen})
                 return chosen
             return options
 
diff --git a/naive-nlu/tree_nlu/tests/tokenization.py b/naive-nlu/tree_nlu/tests/tokenization.py
index 7e93d59..6b61fc4 100644
--- a/naive-nlu/tree_nlu/tests/tokenization.py
+++ b/naive-nlu/tree_nlu/tests/tokenization.py
@@ -34,11 +34,11 @@ EXAMPLES = [
         "text": 'plane',
         "tokens": ['plane'],
     }),
-    ('test', {
-        "text": 'planes',
-        "tokens": ['planes'],
-        "meaning": { 'planes': ('add-modifier', 'plane', 'plural') },
-    }),
+    # ('test', {
+    #     "text": 'planes',
+    #     "tokens": ['planes'],
+    #     "meaning": { 'planes': ('add-modifier', 'plane', 'plural') },
+    # }),
     ('test', {
         "text": 'some other text',
         "tokens": ['some', 'other', 'text'],
@@ -46,6 +46,10 @@ EXAMPLES = [
     ('test', {
         "text": 'is the sun a star?',
         "tokens": ['is', 'the', 'sun', 'a', 'star', '?'],
+    }),
+    ('test', {
+        "text": 'sometextnotseparatedbyspaces',
+        "tokens": ['some', 'text', 'not', 'separated', 'by', 'spaces'],
     })
 ]
 
@@ -66,7 +70,7 @@ def main():
                 tokens = list(knowledge.tokenize(example['text']))
 
                 session().log('Expected “{}”, found “{}”'
-                            .format(tokens, example['tokens']))
+                            .format(example['tokens'], tokens))
                 assert example['tokens'] == tokens
 
         else:
diff --git a/naive-nlu/tree_nlu/utils/tokenization.py b/naive-nlu/tree_nlu/utils/tokenization.py
index b763584..4664923 100644
--- a/naive-nlu/tree_nlu/utils/tokenization.py
+++ b/naive-nlu/tree_nlu/utils/tokenization.py
@@ -7,6 +7,11 @@ BASIC_TOKENIZATION_EXAMPLES = (
         "text": 'cat',
         "tokens": ['cat'],
     }),
+    ({
+        "text": 'cats',
+        "tokens": ['cats'],
+        "meaning": { 'cats': ('add-modifier', 'cat', 'plural') },
+    }),
     ({
         "text": 'text separated by spaces',
         "tokens": ['text', 'separated', 'by', 'spaces'],