Increase logging, add failing tokenization tests.

kenkeiras 2018-04-15 17:08:01 +02:00
parent 40b63128af
commit d601ae3f83
3 changed files with 24 additions and 6 deletions


@@ -71,9 +71,13 @@ class KnowledgeBase(object):
     def tokenize(self, row, return_one=True):
         row = row.lower()
         with session().log("Tokenize: {}".format(row)):
-            options = parsing.to_tokens(self, row)
+            options = list(parsing.to_tokens(self, row))
+            session().log("Results:\n{}".format('\n'.join(map(str, options))))
             if return_one:
-                return parsing.pick_one_tokenization(options)
+                chosen = parsing.pick_one_tokenization(options)
+                session().log("Chosen: “{}”".format(chosen))
+                return chosen
             return options

     def add_tokenization(self, tokenization):
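
Note on the tokenize() change: parsing.to_tokens() is now consumed twice (once for the "Results" log line, once for pick_one_tokenization), so it is materialized with list() first, which suggests it yields its candidates lazily. A minimal usage sketch, assuming a plain KnowledgeBase constructor and a pattern of literals mixed with Atom('token') wildcards (both assumptions, not shown in this diff):

    # Assumes this project's KnowledgeBase and Atom are importable.
    kb = KnowledgeBase()
    # Hypothetical pattern: literal words around Atom('token') wildcards.
    kb.add_tokenization(['is', Atom('token'), 'a', Atom('token'), '?'])

    # Default: pick one tokenization; logged as "Chosen: ...".
    tokens = kb.tokenize('is earth a planet?')
    # return_one=False: return every candidate tokenization instead.
    options = kb.tokenize('is earth a planet?', return_one=False)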


@@ -18,25 +18,32 @@ def to_tokens(knowledge_base, text, acc=None):
     found = 0
     for tokenization in knowledge_base.tokenization:
+        with session().log("Tokenization {}".format(tokenization)):
             remaining = text
             possibility = []

             # Apply tokenization to all elements
             for i, token in enumerate(tokenization):
+                with session().log("T “{}” over “{}”".format(token, remaining)):
                     if token == Atom('token'):
                         for thing in knowledge_base.knowledge.keys():
+                            session().annotate("Testing with “{}”".format(thing))
                             if remaining.startswith(thing):
                                 # TODO We should also branch here, probably :\
                                 remaining = remaining[len(thing):]
                                 possibility.append(thing)
                     else:
-                        if i + 1 >= len(tokenization):
+                        if i + 1 >= len(tokenization):  # Last element
+                            session().annotate("Token not found, considering it all of “{}”".format(remaining))
                             possibility.append(remaining)
                             remaining = ""
-                        else:
+                        else:  # Not last element, use the next one as cutter
                             # Try with (HYPERSIMPLISTIC!) backtracking
                             # Cut using the next token we should use more!!!
                             next_token = tokenization[i + 1]
+                            session().annotate("Trying to cut for next token on “{}”".format(next_token))
                             cutoff = remaining.find(next_token)
                             if cutoff < 0:
                                 break
@@ -47,7 +54,7 @@ def to_tokens(knowledge_base, text, acc=None):
                         if remaining.find(token) < 0:  # Not immediately after!
                             break
                         remaining = remaining[len(token):]
+                        session().annotate("OK, remaining: {}".format(remaining))
             else:
                 # Tokenization applicable
                 found += 1
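
The "use the next one as cutter" branch is the core of the self-described hypersimplistic backtracking: when a wildcard is not the last pattern element, the next literal token in the pattern decides where the wildcard's match ends. A standalone sketch of just that cut, with a made-up function name and none of the wildcard branching:

    def cut_for_next_token(remaining, next_token):
        # Everything before the next literal token is consumed by the wildcard.
        cutoff = remaining.find(next_token)
        if cutoff < 0:
            return None  # next token absent: this tokenization cannot apply
        return remaining[:cutoff], remaining[cutoff:]

    assert cut_for_next_token('earth a planet?', ' a ') == ('earth', ' a planet?')

The bare else on the inner for loop above relies on Python's for/else semantics: it runs only when the loop finished without a break, i.e. when every pattern element was applied, so the tokenization counts as applicable.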


@@ -26,7 +26,10 @@ EXAMPLES = [
         "text": 'text separated by spaces',
         "tokens": ['text', 'separated', 'by', 'spaces'],
     }),
+    ('example', {
+        "text": 'is earth a planet?',
+        "tokens": ['is', 'earth', 'a', 'planet', '?'],
+    }),
     ('test', {
         "text": 'plane',
         "tokens": ['plane'],
@@ -39,6 +42,10 @@ EXAMPLES = [
     ('test', {
         "text": 'some other text',
         "tokens": ['some', 'other', 'text'],
     }),
+    ('test', {
+        "text": 'is the sun a star?',
+        "tokens": ['is', 'the', 'sun', 'a', 'star', '?'],
+    })
 ]
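
The two added examples exercise the Atom('token') wildcard path (known words around unknown ones, plus a trailing '?'), which the commit message flags as currently failing. A hypothetical driver for this table, assuming a kb fixture with the relevant vocabulary already trained; the repo's real runner may differ:

    def run_tokenization_examples(kb):
        for kind, example in EXAMPLES:
            tokens = kb.tokenize(example['text'])
            assert tokens == example['tokens'], \
                "{} case {!r}: got {!r}, expected {!r}".format(
                    kind, example['text'], tokens, example['tokens'])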