Add (non-passing) tokenization.

This commit is contained in:
kenkeiras 2018-04-01 20:24:09 +02:00
parent 75174e1736
commit fc37450565
7 changed files with 229 additions and 11 deletions

View file

@ -3,6 +3,7 @@ import json
from ..knowledge_base import KnowledgeBase
from ..modifiable_property import is_modifiable_property
from ..utils.tokenization import train_basic_tokenization
examples = [
{
@ -107,6 +108,9 @@ base_knowledge = {
'swim': {
"groups": {'verb'},
},
'planet': {
'groups': {'noun'}
}
}
def test_assumption(expectedResponse, knowledge, query):
@ -125,6 +129,8 @@ def main():
knowledge=base_knowledge,
)
train_basic_tokenization(knowledge)
for example in examples:
with session().log(example['text']):
differences = knowledge.train([example])

View file

@ -0,0 +1,67 @@
from ..session.org_mode import global_session as session
from ..knowledge_base import KnowledgeBase
from ..utils.visuals import show_progbar
from ..visualization import show_knowledge
def _assert(args):
assert(args)
def _assert_msg(args, msg):
assert args, msg
# Tokenization corpus. Each entry is a (case_type, data) pair:
#   - 'example' entries are fed to the tokenizer trainer,
#   - 'test' entries are tokenized and checked against their "tokens" list.
# Keys of the data dict:
#   "text"    — raw input string,
#   "tokens"  — expected token list for that text,
#   "meaning" — optional map from a token to its semantic decomposition
#               (e.g. a plural form decomposed into base word + modifier).
EXAMPLES = [
    ('example', {
        "text": 'cat',
        "tokens": ['cat'],
    }),
    ('example', {
        "text": 'cats',
        "tokens": ['cats'],
        "meaning": { 'cats': ('add-modifier', 'cat', 'plural') },
    }),
    ('example', {
        "text": 'text separated by spaces',
        "tokens": ['text', 'separated', 'by', 'spaces'],
    }),
    ('test', {
        "text": 'plane',
        "tokens": ['plane'],
    }),
    ('test', {
        "text": 'planes',
        "tokens": ['planes'],
        "meaning": { 'planes': ('add-modifier', 'plane', 'plural') },
    }),
    ('test', {
        "text": 'some other text',
        "tokens": ['some', 'other', 'text'],
    })
]
def main():
    """Train a tokenizer on the 'example' cases and verify it on the 'test' cases.

    Iterates EXAMPLES in order: 'example' entries are used to train the
    tokenizer, 'test' entries are tokenized and compared against their
    expected token list. Shows a progress bar while running.

    Returns:
        The trained KnowledgeBase, so callers can inspect or reuse it.

    Raises:
        AssertionError: when a 'test' entry tokenizes to an unexpected result.
        Exception: when an entry has an unknown case type.
    """
    knowledge = KnowledgeBase()

    total = len(EXAMPLES)
    for i, (case_type, example) in enumerate(EXAMPLES):
        show_progbar(i, total, example['text'])

        if case_type == 'example':
            with session().log(example['text']):
                knowledge.train_tokenizer(example)

        elif case_type == 'test':
            with session().log(example['text']):
                tokens = list(knowledge.tokenize(example['text']))
                # Use the module's assertion helper so a mismatch reports
                # both the expected and the actual token lists instead of
                # a bare AssertionError.
                _assert_msg(example['tokens'] == tokens,
                            'Expected tokens {} but got {}'.format(
                                example['tokens'], tokens))

        else:
            raise Exception('Not implemented case {}'.format(case_type))

    # Clear the progress-bar line before handing the terminal back.
    print("\r\x1b[K", end='')
    return knowledge