Increase logging, add failing tokenization tests.
This commit is contained in:
parent
40b63128af
commit
d601ae3f83
@ -71,9 +71,13 @@ class KnowledgeBase(object):
|
||||
def tokenize(self, row, return_one=True):
|
||||
row = row.lower()
|
||||
with session().log("Tokenize: {}".format(row)):
|
||||
options = parsing.to_tokens(self, row)
|
||||
options = list(parsing.to_tokens(self, row))
|
||||
session().log("Results:\n{}".format('\n'.join(map(str, options))))
|
||||
|
||||
if return_one:
|
||||
return parsing.pick_one_tokenization(options)
|
||||
chosen = parsing.pick_one_tokenization(options)
|
||||
session().log("Chosen: “{}”".format(chosen))
|
||||
return chosen
|
||||
return options
|
||||
|
||||
def add_tokenization(self, tokenization):
|
||||
|
@ -18,25 +18,32 @@ def to_tokens(knowledge_base, text, acc=None):
|
||||
found = 0
|
||||
|
||||
for tokenization in knowledge_base.tokenization:
|
||||
with session().log("Tokenization {}".format(tokenization)):
|
||||
remaining = text
|
||||
possibility = []
|
||||
|
||||
# Apply tokenization to all elements
|
||||
for i, token in enumerate(tokenization):
|
||||
with session().log("T “{}” over “{}”".format(token, remaining)):
|
||||
if token == Atom('token'):
|
||||
for thing in knowledge_base.knowledge.keys():
|
||||
session().annotate("Testing with “{}”".format(thing))
|
||||
if remaining.startswith(thing):
|
||||
# TODO We should also branch here, probably :\
|
||||
remaining = remaining[len(thing):]
|
||||
possibility.append(thing)
|
||||
else:
|
||||
if i + 1 >= len(tokenization):
|
||||
if i + 1 >= len(tokenization): # Last element
|
||||
session().annotate("Token not found, considering it all of “{}”".format(remaining))
|
||||
possibility.append(remaining)
|
||||
remaining = ""
|
||||
|
||||
else:
|
||||
else: # Not last element, use the next one as cutter
|
||||
# Try with (HYPERSIMPLISTIC!) backtracking
|
||||
# Cut using the next token we should use more!!!
|
||||
next_token = tokenization[i + 1]
|
||||
session().annotate("Trying to cut for next token on “{}”".format(next_token))
|
||||
|
||||
cutoff = remaining.find(next_token)
|
||||
if cutoff < 0:
|
||||
break
|
||||
@ -47,7 +54,7 @@ def to_tokens(knowledge_base, text, acc=None):
|
||||
if remaining.find(token) < 0: # Not immediately after!
|
||||
break
|
||||
remaining = remaining[len(token):]
|
||||
|
||||
session().annotate("OK, remaining: {}".format(remaining))
|
||||
else:
|
||||
# Tokenization applicable
|
||||
found += 1
|
||||
|
@ -26,7 +26,10 @@ EXAMPLES = [
|
||||
"text": 'text separated by spaces',
|
||||
"tokens": ['text', 'separated', 'by', 'spaces'],
|
||||
}),
|
||||
|
||||
('example', {
|
||||
"text": 'is earth a planet?',
|
||||
"tokens": ['is', 'earth', 'a', 'planet', '?'],
|
||||
}),
|
||||
('test', {
|
||||
"text": 'plane',
|
||||
"tokens": ['plane'],
|
||||
@ -39,6 +42,10 @@ EXAMPLES = [
|
||||
('test', {
|
||||
"text": 'some other text',
|
||||
"tokens": ['some', 'other', 'text'],
|
||||
}),
|
||||
('test', {
|
||||
"text": 'is the sun a star?',
|
||||
"tokens": ['is', 'the', 'sun', 'a', 'star', '?'],
|
||||
})
|
||||
]
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user