Increase logging, add failing tokenization tests.
This commit is contained in:
parent
40b63128af
commit
d601ae3f83
@ -71,9 +71,13 @@ class KnowledgeBase(object):
|
|||||||
def tokenize(self, row, return_one=True):
|
def tokenize(self, row, return_one=True):
|
||||||
row = row.lower()
|
row = row.lower()
|
||||||
with session().log("Tokenize: {}".format(row)):
|
with session().log("Tokenize: {}".format(row)):
|
||||||
options = parsing.to_tokens(self, row)
|
options = list(parsing.to_tokens(self, row))
|
||||||
|
session().log("Results:\n{}".format('\n'.join(map(str, options))))
|
||||||
|
|
||||||
if return_one:
|
if return_one:
|
||||||
return parsing.pick_one_tokenization(options)
|
chosen = parsing.pick_one_tokenization(options)
|
||||||
|
session().log("Chosen: “{}”".format(chosen))
|
||||||
|
return chosen
|
||||||
return options
|
return options
|
||||||
|
|
||||||
def add_tokenization(self, tokenization):
|
def add_tokenization(self, tokenization):
|
||||||
|
@ -18,25 +18,32 @@ def to_tokens(knowledge_base, text, acc=None):
|
|||||||
found = 0
|
found = 0
|
||||||
|
|
||||||
for tokenization in knowledge_base.tokenization:
|
for tokenization in knowledge_base.tokenization:
|
||||||
|
with session().log("Tokenization {}".format(tokenization)):
|
||||||
remaining = text
|
remaining = text
|
||||||
possibility = []
|
possibility = []
|
||||||
|
|
||||||
|
# Apply tokenization to all elements
|
||||||
for i, token in enumerate(tokenization):
|
for i, token in enumerate(tokenization):
|
||||||
|
with session().log("T “{}” over “{}”".format(token, remaining)):
|
||||||
if token == Atom('token'):
|
if token == Atom('token'):
|
||||||
for thing in knowledge_base.knowledge.keys():
|
for thing in knowledge_base.knowledge.keys():
|
||||||
|
session().annotate("Testing with “{}”".format(thing))
|
||||||
if remaining.startswith(thing):
|
if remaining.startswith(thing):
|
||||||
# TODO We should also branch here, probably :\
|
# TODO We should also branch here, probably :\
|
||||||
remaining = remaining[len(thing):]
|
remaining = remaining[len(thing):]
|
||||||
possibility.append(thing)
|
possibility.append(thing)
|
||||||
else:
|
else:
|
||||||
if i + 1 >= len(tokenization):
|
if i + 1 >= len(tokenization): # Last element
|
||||||
|
session().annotate("Token not found, considering it all of “{}”".format(remaining))
|
||||||
possibility.append(remaining)
|
possibility.append(remaining)
|
||||||
remaining = ""
|
remaining = ""
|
||||||
|
|
||||||
else:
|
else: # Not last element, use the next one as cutter
|
||||||
# Try with (HYPERSIMPLISTIC!) backtracking
|
# Try with (HYPERSIMPLISTIC!) backtracking
|
||||||
# Cut using the next token we should use more!!!
|
# Cut using the next token we should use more!!!
|
||||||
next_token = tokenization[i + 1]
|
next_token = tokenization[i + 1]
|
||||||
|
session().annotate("Trying to cut for next token on “{}”".format(next_token))
|
||||||
|
|
||||||
cutoff = remaining.find(next_token)
|
cutoff = remaining.find(next_token)
|
||||||
if cutoff < 0:
|
if cutoff < 0:
|
||||||
break
|
break
|
||||||
@ -47,7 +54,7 @@ def to_tokens(knowledge_base, text, acc=None):
|
|||||||
if remaining.find(token) < 0: # Not immediately after!
|
if remaining.find(token) < 0: # Not immediately after!
|
||||||
break
|
break
|
||||||
remaining = remaining[len(token):]
|
remaining = remaining[len(token):]
|
||||||
|
session().annotate("OK, remaining: {}".format(remaining))
|
||||||
else:
|
else:
|
||||||
# Tokenization applicable
|
# Tokenization applicable
|
||||||
found += 1
|
found += 1
|
||||||
|
@ -26,7 +26,10 @@ EXAMPLES = [
|
|||||||
"text": 'text separated by spaces',
|
"text": 'text separated by spaces',
|
||||||
"tokens": ['text', 'separated', 'by', 'spaces'],
|
"tokens": ['text', 'separated', 'by', 'spaces'],
|
||||||
}),
|
}),
|
||||||
|
('example', {
|
||||||
|
"text": 'is earth a planet?',
|
||||||
|
"tokens": ['is', 'earth', 'a', 'planet', '?'],
|
||||||
|
}),
|
||||||
('test', {
|
('test', {
|
||||||
"text": 'plane',
|
"text": 'plane',
|
||||||
"tokens": ['plane'],
|
"tokens": ['plane'],
|
||||||
@ -39,6 +42,10 @@ EXAMPLES = [
|
|||||||
('test', {
|
('test', {
|
||||||
"text": 'some other text',
|
"text": 'some other text',
|
||||||
"tokens": ['some', 'other', 'text'],
|
"tokens": ['some', 'other', 'text'],
|
||||||
|
}),
|
||||||
|
('test', {
|
||||||
|
"text": 'is the sun a star?',
|
||||||
|
"tokens": ['is', 'the', 'sun', 'a', 'star', '?'],
|
||||||
})
|
})
|
||||||
]
|
]
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user