30 lines
740 B
Python
30 lines
740 B
Python
from ..session.org_mode import (
|
|
global_session as session,
|
|
)
|
|
|
|
# Training examples consumed by train_basic_tokenization(), which passes each
# entry unchanged to the tokenization layer's train() method.
#
# Every entry is a dict with:
#   "text":    the raw input string.
#   "tokens":  the token list paired with that text (presumably the expected
#              tokenization -- confirm against the tokenization layer).
#   "meaning": optional; maps a token to a tuple such as
#              ('add-modifier', 'cat', 'plural').
#              NOTE(review): the exact semantics of this tuple are defined by
#              the tokenization layer, not here -- confirm before extending.
BASIC_TOKENIZATION_EXAMPLES = (
    {
        "text": 'cat',
        "tokens": ['cat'],
    },
    {
        "text": 'cats',
        "tokens": ['cats'],
        "meaning": {'cats': ('add-modifier', 'cat', 'plural')},
    },
    {
        "text": 'text separated by spaces',
        "tokens": ['text', 'separated', 'by', 'spaces'],
    },
    {
        "text": 'is earth a planet?',
        "tokens": ['is', 'earth', 'a', 'planet', '?'],
    },
)
|
|
|
|
|
|
def train_basic_tokenization(knowledge_base):
    """Train the tokenization layer on the basic built-in examples.

    Opens a session log context labelled 'Training basic tokenization' and,
    inside it, passes every entry of BASIC_TOKENIZATION_EXAMPLES, in order,
    to ``knowledge_base.layers.tokenization.train``.

    Args:
        knowledge_base: Object exposing ``layers.tokenization.train(example)``.

    Returns:
        None. Any value returned by ``train`` is discarded.
    """
    with session().log('Training basic tokenization'):
        for example in BASIC_TOKENIZATION_EXAMPLES:
            knowledge_base.layers.tokenization.train(example)