Increase logging, add failing tokenization tests.

kenkeiras 2018-04-15 17:08:01 +02:00
parent 40b63128af
commit d601ae3f83
3 changed files with 24 additions and 6 deletions


@@ -71,9 +71,13 @@ class KnowledgeBase(object):
     def tokenize(self, row, return_one=True):
         row = row.lower()
         with session().log("Tokenize: {}".format(row)):
-            options = parsing.to_tokens(self, row)
+            options = list(parsing.to_tokens(self, row))
+            session().log("Results:\n{}".format('\n'.join(map(str, options))))
             if return_one:
-                return parsing.pick_one_tokenization(options)
+                chosen = parsing.pick_one_tokenization(options)
+                session().log("Chosen: “{}”".format(chosen))
+                return chosen
             return options

     def add_tokenization(self, tokenization):
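
Note on the tokenize() change: parsing.to_tokens() is now consumed twice (once for the "Results" log line, once for pick_one_tokenization), so it is materialized with list() first, which suggests it yields its candidates lazily. A minimal usage sketch, assuming a plain KnowledgeBase constructor and a pattern of literals mixed with Atom('token') wildcards (both assumptions, not shown in this diff):

    # Assumes this project's KnowledgeBase and Atom are importable.
    kb = KnowledgeBase()
    # Hypothetical pattern: literal words around Atom('token') wildcards.
    kb.add_tokenization(['is', Atom('token'), 'a', Atom('token'), '?'])

    # Default: pick one tokenization; logged as "Chosen: ...".
    tokens = kb.tokenize('is earth a planet?')
    # return_one=False: return every candidate tokenization instead.
    options = kb.tokenize('is earth a planet?', return_one=False)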


@@ -18,25 +18,32 @@ def to_tokens(knowledge_base, text, acc=None):
     found = 0
     for tokenization in knowledge_base.tokenization:
+        with session().log("Tokenization {}".format(tokenization)):
             remaining = text
             possibility = []

             # Apply tokenization to all elements
             for i, token in enumerate(tokenization):
+                with session().log("T “{}” over “{}”".format(token, remaining)):
                     if token == Atom('token'):
                         for thing in knowledge_base.knowledge.keys():
+                            session().annotate("Testing with “{}”".format(thing))
                             if remaining.startswith(thing):
                                 # TODO We should also branch here, probably :\
                                 remaining = remaining[len(thing):]
                                 possibility.append(thing)
                     else:
-                        if i + 1 >= len(tokenization):
+                        if i + 1 >= len(tokenization):  # Last element
+                            session().annotate("Token not found, considering it all of “{}”".format(remaining))
                             possibility.append(remaining)
                             remaining = ""
-                        else:
+                        else:  # Not last element, use the next one as cutter
                             # Try with (HYPERSIMPLISTIC!) backtracking
                             # Cut using the next token we should use more!!!
                             next_token = tokenization[i + 1]
+                            session().annotate("Trying to cut for next token on “{}”".format(next_token))
                             cutoff = remaining.find(next_token)
                             if cutoff < 0:
                                 break
@@ -47,7 +54,7 @@ def to_tokens(knowledge_base, text, acc=None):
                         if remaining.find(token) < 0:  # Not immediately after!
                             break
                         remaining = remaining[len(token):]
+                        session().annotate("OK, remaining: {}".format(remaining))
             else:
                 # Tokenization applicable
                 found += 1
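
The "use the next one as cutter" branch is the core of the self-described hypersimplistic backtracking: when a wildcard is not the last pattern element, the next literal token in the pattern decides where the wildcard's match ends. A standalone sketch of just that cut, with a made-up function name and none of the wildcard branching:

    def cut_for_next_token(remaining, next_token):
        # Everything before the next literal token is consumed by the wildcard.
        cutoff = remaining.find(next_token)
        if cutoff < 0:
            return None  # next token absent: this tokenization cannot apply
        return remaining[:cutoff], remaining[cutoff:]

    assert cut_for_next_token('earth a planet?', ' a ') == ('earth', ' a planet?')

The bare else on the inner for loop above relies on Python's for/else semantics: it runs only when the loop finished without a break, i.e. when every pattern element was applied, so the tokenization counts as applicable.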


@@ -26,7 +26,10 @@ EXAMPLES = [
         "text": 'text separated by spaces',
         "tokens": ['text', 'separated', 'by', 'spaces'],
     }),
+    ('example', {
+        "text": 'is earth a planet?',
+        "tokens": ['is', 'earth', 'a', 'planet', '?'],
+    }),
     ('test', {
         "text": 'plane',
         "tokens": ['plane'],
@@ -39,6 +42,10 @@ EXAMPLES = [
     ('test', {
         "text": 'some other text',
         "tokens": ['some', 'other', 'text'],
     }),
+    ('test', {
+        "text": 'is the sun a star?',
+        "tokens": ['is', 'the', 'sun', 'a', 'star', '?'],
+    })
 ]
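
The two added examples exercise the Atom('token') wildcard path (known words around unknown ones, plus a trailing '?'), which the commit message flags as currently failing. A hypothetical driver for this table, assuming a kb fixture with the relevant vocabulary already trained; the repo's real runner may differ:

    def run_tokenization_examples(kb):
        for kind, example in EXAMPLES:
            tokens = kb.tokenize(example['text'])
            assert tokens == example['tokens'], \
                "{} case {!r}: got {!r}, expected {!r}".format(
                    kind, example['text'], tokens, example['tokens'])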