Increase logging, add failing tokenization tests.
This commit is contained in:
parent
40b63128af
commit
d601ae3f83
3 changed files with 24 additions and 6 deletions
|
@ -18,25 +18,32 @@ def to_tokens(knowledge_base, text, acc=None):
|
|||
found = 0
|
||||
|
||||
for tokenization in knowledge_base.tokenization:
|
||||
with session().log("Tokenization {}".format(tokenization)):
|
||||
remaining = text
|
||||
possibility = []
|
||||
|
||||
# Apply tokenization to all elements
|
||||
for i, token in enumerate(tokenization):
|
||||
with session().log("T “{}” over “{}”".format(token, remaining)):
|
||||
if token == Atom('token'):
|
||||
for thing in knowledge_base.knowledge.keys():
|
||||
session().annotate("Testing with “{}”".format(thing))
|
||||
if remaining.startswith(thing):
|
||||
# TODO We should also branch here, probably :\
|
||||
remaining = remaining[len(thing):]
|
||||
possibility.append(thing)
|
||||
else:
|
||||
if i + 1 >= len(tokenization):
|
||||
if i + 1 >= len(tokenization): # Last element
|
||||
session().annotate("Token not found, considering it all of “{}”".format(remaining))
|
||||
possibility.append(remaining)
|
||||
remaining = ""
|
||||
|
||||
else:
|
||||
else: # Not last element, use the next one as cutter
|
||||
# Try with (HYPERSIMPLISTIC!) backtracking
|
||||
# Cut using the next token we should use more!!!
|
||||
next_token = tokenization[i + 1]
|
||||
session().annotate("Trying to cut for next token on “{}”".format(next_token))
|
||||
|
||||
cutoff = remaining.find(next_token)
|
||||
if cutoff < 0:
|
||||
break
|
||||
|
@ -47,7 +54,7 @@ def to_tokens(knowledge_base, text, acc=None):
|
|||
if remaining.find(token) < 0: # Not immediately after!
|
||||
break
|
||||
remaining = remaining[len(token):]
|
||||
|
||||
session().annotate("OK, remaining: {}".format(remaining))
|
||||
else:
|
||||
# Tokenization applicable
|
||||
found += 1
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue