Increase logging, add failing tokenization tests.
This commit is contained in:
parent
40b63128af
commit
d601ae3f83
3 changed files with 24 additions and 6 deletions
|
@ -18,25 +18,32 @@ def to_tokens(knowledge_base, text, acc=None):
|
|||
found = 0
|
||||
|
||||
for tokenization in knowledge_base.tokenization:
|
||||
with session().log("Tokenization {}".format(tokenization)):
|
||||
remaining = text
|
||||
possibility = []
|
||||
|
||||
# Apply tokenization to all elements
|
||||
for i, token in enumerate(tokenization):
|
||||
with session().log("T “{}” over “{}”".format(token, remaining)):
|
||||
if token == Atom('token'):
|
||||
for thing in knowledge_base.knowledge.keys():
|
||||
session().annotate("Testing with “{}”".format(thing))
|
||||
if remaining.startswith(thing):
|
||||
# TODO We should also branch here, probably :\
|
||||
remaining = remaining[len(thing):]
|
||||
possibility.append(thing)
|
||||
else:
|
||||
if i + 1 >= len(tokenization):
|
||||
if i + 1 >= len(tokenization): # Last element
|
||||
session().annotate("Token not found, considering it all of “{}”".format(remaining))
|
||||
possibility.append(remaining)
|
||||
remaining = ""
|
||||
|
||||
else:
|
||||
else: # Not last element, use the next one as cutter
|
||||
# Try with (HYPERSIMPLISTIC!) backtracking
|
||||
# Cut using the next token we should use more!!!
|
||||
next_token = tokenization[i + 1]
|
||||
session().annotate("Trying to cut for next token on “{}”".format(next_token))
|
||||
|
||||
cutoff = remaining.find(next_token)
|
||||
if cutoff < 0:
|
||||
break
|
||||
|
@ -47,7 +54,7 @@ def to_tokens(knowledge_base, text, acc=None):
|
|||
if remaining.find(token) < 0: # Not immediately after!
|
||||
break
|
||||
remaining = remaining[len(token):]
|
||||
|
||||
session().annotate("OK, remaining: {}".format(remaining))
|
||||
else:
|
||||
# Tokenization applicable
|
||||
found += 1
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue