# Source: lang-model/naive-nlu/tree_nlu/tests/tokenization.py (81 lines, 2.1 KiB, Python)

from ..session.org_mode import global_session as session
from ..knowledge_base import KnowledgeBase
from ..utils.visuals import show_progbar
from ..visualization import show_knowledge
def _assert(args):
assert(args)
def _assert_msg(args, msg):
assert args, msg
# Ordered (case_type, data) pairs consumed by main():
#   * 'example' entries are passed to the tokenization layer's train().
#   * 'test' entries have their "text" tokenized and the result compared
#     against "tokens".
# Order matters: main() walks the list front to back, so every 'example'
# entry is processed before the 'test' entries that follow it.
EXAMPLES = [
    # --- Training cases -------------------------------------------------
    (
        'example',
        {
            "text": 'cat',
            "tokens": ['cat'],
        },
    ),
    (
        'example',
        {
            "text": 'cats',
            "tokens": ['cats'],
            # NOTE(review): "meaning" is not read anywhere in this file;
            # presumably consumed by train() -- confirm.
            "meaning": {'cats': ('add-modifier', 'cat', 'plural')},
        },
    ),
    (
        'example',
        {
            "text": 'text separated by spaces',
            "tokens": ['text', 'separated', 'by', 'spaces'],
        },
    ),
    (
        'example',
        {
            "text": 'is earth a planet?',
            "tokens": ['is', 'earth', 'a', 'planet', '?'],
        },
    ),

    # --- Checked cases --------------------------------------------------
    (
        'test',
        {
            "text": 'plane',
            "tokens": ['plane'],
        },
    ),
    # Disabled case, kept as found:
    # ('test', {
    #     "text": 'planes',
    #     "tokens": ['planes'],
    #     "meaning": { 'planes': ('add-modifier', 'plane', 'plural') },
    # }),
    (
        'test',
        {
            "text": 'some other text',
            "tokens": ['some', 'other', 'text'],
        },
    ),
    (
        'test',
        {
            "text": 'is the sun a star?',
            "tokens": ['is', 'the', 'sun', 'a', 'star', '?'],
        },
    ),
    (
        'test',
        {
            # Input without spaces; the expected split is into known words.
            "text": 'sometextnotseparatedbyspaces',
            "tokens": ['some', 'text', 'not', 'separated', 'by', 'spaces'],
        },
    ),
]
def main():
    """Train the tokenization layer on EXAMPLES and verify its 'test' cases.

    Walks EXAMPLES in order. 'example' entries are handed to
    ``knowledge.layers.tokenization.train``; 'test' entries have their text
    tokenized and the result compared with the expected token list. Each
    case runs inside a ``session().log(...)`` context keyed by its text.

    Returns:
        The KnowledgeBase instance created and trained by this run.

    Raises:
        AssertionError: if a 'test' entry tokenizes differently than expected.
        Exception: if an entry's case type is neither 'example' nor 'test'.
    """
    knowledge = KnowledgeBase()
    total = len(EXAMPLES)

    for i, (case_type, example) in enumerate(EXAMPLES):
        show_progbar(i, total, example['text'])

        if case_type == 'example':
            with session().log(example['text']):
                knowledge.layers.tokenization.train(example)

        elif case_type == 'test':
            with session().log(example['text']):
                # Materialize the result so it can be logged and compared.
                tokens = list(
                    knowledge.layers.tokenization.tokenize(example['text']))

                # The closing quote after the second placeholder was missing
                # in the original message.
                message = 'Expected “{}”, found “{}”'.format(
                    example['tokens'], tokens)
                session().log(message)
                # Carry the same message so a failure shows both token lists.
                assert example['tokens'] == tokens, message

        else:
            raise Exception('Not implemented case {}'.format(case_type))

    # Carriage return + ANSI "erase to end of line"; presumably wipes the
    # progress bar drawn by show_progbar -- confirm.
    print("\r\x1b[K", end='')
    return knowledge