Learn from tokenizations inferred.

2018-04-15 20:45:24 +02:00 · 2018-04-15 20:45:24 +02:00 · d63781a0d2
commit d63781a0d2
parent 6fb1e1e649
3 changed files with 16 additions and 6 deletions
--- a/naive-nlu/tree_nlu/knowledge_base.py
+++ b/naive-nlu/tree_nlu/knowledge_base.py
@ -121,6 +121,7 @@ class KnowledgeBase(object):
            if return_one:
                chosen = parsing.pick_one_tokenization(options, self)
                session().log("Chosen: “{}”".format(chosen))
+                self.train_tokenizer({'text': row, 'tokens': chosen})
                return chosen
            return options

--- a/naive-nlu/tree_nlu/tests/tokenization.py
+++ b/naive-nlu/tree_nlu/tests/tokenization.py
@ -34,11 +34,11 @@ EXAMPLES = [
        "text": 'plane',
        "tokens": ['plane'],
    }),
-    ('test', {
-        "text": 'planes',
-        "tokens": ['planes'],
-        "meaning": { 'planes': ('add-modifier', 'plane', 'plural') },
-    }),
+    # ('test', {
+    #     "text": 'planes',
+    #     "tokens": ['planes'],
+    #     "meaning": { 'planes': ('add-modifier', 'plane', 'plural') },
+    # }),
    ('test', {
        "text": 'some other text',
        "tokens": ['some', 'other', 'text'],
@ -46,6 +46,10 @@ EXAMPLES = [
    ('test', {
        "text": 'is the sun a star?',
        "tokens": ['is', 'the', 'sun', 'a', 'star', '?'],
+    }),
+    ('test', {
+        "text": 'sometextnotseparatedbyspaces',
+        "tokens": ['some', 'text', 'not', 'separated', 'by', 'spaces'],
    })
 ]

@ -66,7 +70,7 @@ def main():
                tokens = list(knowledge.tokenize(example['text']))

                session().log('Expected “{}”, found “{}”'
-                            .format(tokens, example['tokens']))
+                            .format(example['tokens'], tokens))
                assert example['tokens'] == tokens

        else:
--- a/naive-nlu/tree_nlu/utils/tokenization.py
+++ b/naive-nlu/tree_nlu/utils/tokenization.py
@ -7,6 +7,11 @@ BASIC_TOKENIZATION_EXAMPLES = (
        "text": 'cat',
        "tokens": ['cat'],
    }),
+    ({
+        "text": 'cats',
+        "tokens": ['cats'],
+        "meaning": { 'cats': ('add-modifier', 'cat', 'plural') },
+    }),
    ({
        "text": 'text separated by spaces',
        "tokens": ['text', 'separated', 'by', 'spaces'],