Increase logging, add failing tokenization tests.

This commit is contained in:
kenkeiras 2018-04-15 17:08:01 +02:00
parent 40b63128af
commit d601ae3f83
3 changed files with 24 additions and 6 deletions

View file

@ -18,25 +18,32 @@ def to_tokens(knowledge_base, text, acc=None):
found = 0
for tokenization in knowledge_base.tokenization:
with session().log("Tokenization {}".format(tokenization)):
remaining = text
possibility = []
# Apply tokenization to all elements
for i, token in enumerate(tokenization):
with session().log("T “{}” over “{}".format(token, remaining)):
if token == Atom('token'):
for thing in knowledge_base.knowledge.keys():
session().annotate("Testing with “{}".format(thing))
if remaining.startswith(thing):
# TODO We should also branch here, probably :\
remaining = remaining[len(thing):]
possibility.append(thing)
else:
if i + 1 >= len(tokenization):
if i + 1 >= len(tokenization): # Last element
session().annotate("Token not found, considering it all of “{}".format(remaining))
possibility.append(remaining)
remaining = ""
else:
else: # Not last element, use the next one as cutter
# Try with (HYPERSIMPLISTIC!) backtracking
# Cut using the next token we should use more!!!
next_token = tokenization[i + 1]
session().annotate("Trying to cut for next token on “{}".format(next_token))
cutoff = remaining.find(next_token)
if cutoff < 0:
break
@ -47,7 +54,7 @@ def to_tokens(knowledge_base, text, acc=None):
if remaining.find(token) < 0: # Not immediately after!
break
remaining = remaining[len(token):]
session().annotate("OK, remaining: {}".format(remaining))
else:
# Tokenization applicable
found += 1