Add (non-passing) tokenization.

kenkeiras 2018-04-01 20:24:09 +02:00
parent 75174e1736
commit fc37450565
7 changed files with 229 additions and 11 deletions

@@ -0,0 +1,19 @@
BASIC_TOKENIZATION_EXAMPLES = (
    ({
        "text": 'cat',
        "tokens": ['cat'],
    }),
    ({
        "text": 'text separated by spaces',
        "tokens": ['text', 'separated', 'by', 'spaces'],
    }),
    ({
        "text": 'is earth a planet?',
        "tokens": ['is', 'earth', 'a', 'planet', '?'],
    }),
)

def train_basic_tokenization(knowledge_base):
    for example in BASIC_TOKENIZATION_EXAMPLES:
        knowledge_base.train_tokenizer(example)
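
The commit message flags this tokenization as non-passing, so the examples above are training data for behavior that does not work yet. As a minimal sketch of how they might be verified, assuming a hypothetical knowledge-base object that exposes the train_tokenizer method seen in this diff plus a tokenize method (the latter is an assumption, not shown in the commit):

def check_basic_tokenization(knowledge_base):
    # Train on every example, then tokenize each text and compare
    # against the expected token list.
    train_basic_tokenization(knowledge_base)
    for example in BASIC_TOKENIZATION_EXAMPLES:
        result = knowledge_base.tokenize(example["text"])  # assumed API
        # Given the "(non-passing)" commit message, these assertions
        # would be expected to fail at this point in the history.
        assert result == example["tokens"], \
            "expected {!r}, got {!r}".format(example["tokens"], result)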