Learn from inferred tokenizations.

This commit is contained in:
kenkeiras 2018-04-15 20:45:24 +02:00
parent 6fb1e1e649
commit d63781a0d2
3 changed files with 16 additions and 6 deletions

View File

@ -121,6 +121,7 @@ class KnowledgeBase(object):
if return_one:
chosen = parsing.pick_one_tokenization(options, self)
session().log("Chosen: “{}".format(chosen))
self.train_tokenizer({'text': row, 'tokens': chosen})
return chosen
return options

View File

@ -34,11 +34,11 @@ EXAMPLES = [
"text": 'plane',
"tokens": ['plane'],
}),
('test', {
"text": 'planes',
"tokens": ['planes'],
"meaning": { 'planes': ('add-modifier', 'plane', 'plural') },
}),
# ('test', {
# "text": 'planes',
# "tokens": ['planes'],
# "meaning": { 'planes': ('add-modifier', 'plane', 'plural') },
# }),
('test', {
"text": 'some other text',
"tokens": ['some', 'other', 'text'],
@ -46,6 +46,10 @@ EXAMPLES = [
('test', {
"text": 'is the sun a star?',
"tokens": ['is', 'the', 'sun', 'a', 'star', '?'],
}),
('test', {
"text": 'sometextnotseparatedbyspaces',
"tokens": ['some', 'text', 'not', 'separated', 'by', 'spaces'],
})
]
@ -66,7 +70,7 @@ def main():
tokens = list(knowledge.tokenize(example['text']))
session().log('Expected “{}”, found “{}'
.format(tokens, example['tokens']))
.format(example['tokens'], tokens))
assert example['tokens'] == tokens
else:

View File

@ -7,6 +7,11 @@ BASIC_TOKENIZATION_EXAMPLES = (
"text": 'cat',
"tokens": ['cat'],
}),
({
"text": 'cats',
"tokens": ['cats'],
"meaning": { 'cats': ('add-modifier', 'cat', 'plural') },
}),
({
"text": 'text separated by spaces',
"tokens": ['text', 'separated', 'by', 'spaces'],