Merge branch 'learn-tokenization' into naive-nlu
commit c18c9b8cb1

naive-nlu/tree_nlu/atoms.py (new file, 23 lines)
@@ -0,0 +1,23 @@
+'''
+Analogous to erlang ones.
+
+"An atom is a literal, a constant with name."
+'''
+
+from collections import namedtuple
+
+Atom = namedtuple('Atom', field_names='name')
+
+def is_atom(element, name=None):
+    '''Check if an element is an atom with a specific name.'''
+    if not isinstance(element, Atom):
+        return False
+
+    if name is None:
+        return True
+
+    return element.name == name
+
+def a(name):
+    '''Build an atom with a given name.'''
+    return Atom(name)
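
A minimal usage sketch for these helpers (not part of the diff; the absolute import path tree_nlu.atoms is an assumption, inside the package the module is imported relatively as .atoms):

# Usage sketch, not part of the commit. Assumes the package root is importable
# as tree_nlu; within the package this would be `from .atoms import a, is_atom`.
from tree_nlu.atoms import Atom, a, is_atom

token_marker = a('token')              # Atom(name='token')
assert is_atom(token_marker)           # any Atom matches when no name is given
assert is_atom(token_marker, 'token')  # name check
assert not is_atom('token')            # plain strings are not atoms
assert not is_atom(token_marker, 'word')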
@@ -3,22 +3,74 @@ import logging
 
 from .session.org_mode import global_session as session
+from .atoms import Atom
 from . import parsing
+from . import tokenization
 from . import knowledge_evaluation
 from .modifiable_property import is_modifiable_property
+import random
 
 
 def diff_knowledge(before, after):
     import jsondiff
     return jsondiff.diff(before, after)
 
 
+def randomized_weighted_list(elements):
+    # Randomized
+    randomized = list(elements)
+    random.shuffle(randomized)
+
+    # And return only once
+    already_returned = set()
+    for e in randomized:
+        if e in already_returned:
+            continue
+
+        yield e
+        already_returned.add(e)
+
+
 
 class KnowledgeBase(object):
-    def __init__(self, knowledge, examples=[], trained=[]):
+    def __init__(self, knowledge={}, examples=[], trained=[]):
         self.knowledge = copy.copy(knowledge)
         self.originals = []
         self.examples = copy.copy(examples)
         self.trained = copy.copy(trained)
+        self.structural_elements = set()
+        self.token_chains = {}
+        self.tokens = set()
+
+    def add_token_pair(self, precedent, consequent):
+        self.add_token(precedent)
+        self.add_token(consequent)
+
+        if precedent not in self.token_chains:
+            self.token_chains[precedent] = []
+        self.token_chains[precedent].append(consequent)
+
+    def add_token(self, token):
+        self.tokens.add(token)
+        if (not isinstance(token, Atom)) and (token not in self.structural_elements):
+            session().annotate('Found new structural element “{}”'.format(token))
+            self.structural_elements.add(token)
+
+    def expected_token_after_precedent(self, precedent=None):
+        if precedent not in self.token_chains:  # If there's no known precedent, just return all tokens
+            return randomized_weighted_list(self.tokens)
+
+        return randomized_weighted_list(self.token_chains[precedent])
+
+    def train_tokenizer(self, example):
+        with session().log('Training tokenizer'):
+            session().annotate("Example: {}".format(example))
+            tokens = tokenization.integrate_tokenization(self, example)
+
+            # Integrate knowledge of concept
+            for token in tokens:
+                if not token in self.knowledge:
+                    self.knowledge[token] = {}
+
 
     def train(self, examples):
         knowledge_before = copy.deepcopy(self.knowledge)
@@ -26,7 +78,7 @@ class KnowledgeBase(object):
         # Parse everything
         for example in examples:
             # If there's parsed data, leverage it ASAP
-            if 'parsed' in example:
+            if 'parsed' in example and isinstance(example['parsed'], tuple):
                 with session().log('parsed information integration'):
                     result = knowledge_evaluation.integrate_information(self.knowledge, {
                         "parsed": example['parsed'],
@@ -35,7 +87,8 @@ class KnowledgeBase(object):
 
             with session().log("language integration"):
                 tokens, decomposition, inferred_tree = parsing.integrate_language(self, example)
-                session().annotate(tokens)
+                session().annotate("Tokens: {}".format(tokens))
+                session().annotate("Inferred tree: {}".format(inferred_tree))
 
             with session().log("full information integration"):
                 result = knowledge_evaluation.integrate_information(self.knowledge, {
@@ -60,11 +113,24 @@ class KnowledgeBase(object):
 
         return knowledge_diff_getter
 
-    def process(self, row):
+    def tokenize(self, row, return_one=True):
         row = row.lower()
+        with session().log("Tokenize: {}".format(row)):
+            options = list(tokenization.to_tokens(self, row))
+            session().log("Results:\n{}".format('\n'.join(map(str, options))))
+
+            if return_one:
+                chosen = tokenization.pick_one_tokenization(options, self)
+                session().log("Chosen: “{}”".format(chosen))
+                self.train_tokenizer({'text': row, 'tokens': chosen})
+                return chosen
+            return options
+
+    def process(self, row):
         knowledge_before = copy.deepcopy(self.knowledge)
         with session().log("Process: {}".format(row)):
-            tokens = parsing.to_tokens(row)
+            tokens = self.tokenize(row)
+
             fit = parsing.get_fit(self, tokens)
             if fit is None:
                 return None
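
A rough usage sketch of the new tokenizer entry point (not part of the diff; the import path and the training example are illustrative, and it assumes the org-mode logging session has been set up the way the test runner does):

# Usage sketch, not part of the commit. The path tree_nlu.knowledge_base and
# the example data are assumptions; a global org-mode session is assumed to
# exist, as set up by the test runner.
from tree_nlu.knowledge_base import KnowledgeBase

kb = KnowledgeBase()
kb.train_tokenizer({'text': 'is earth a planet?',
                    'tokens': ['is', 'earth', 'a', 'planet', '?']})

# tokenize() lowercases the row, enumerates candidate tokenizations with
# tokenization.to_tokens(), and with return_one=True (the default) picks one,
# feeds it back through train_tokenizer() and returns it.
tokens = kb.tokenize('Is earth a planet?')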
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 
 from . import knowledge_evaluation
+from . import tokenization
 
 from . import depth_meter
 from .session.org_mode import global_session as session
@@ -11,11 +12,7 @@ from functools import reduce
 from typing import List, Dict
 from .modifiable_property import ModifiableProperty
 from . import parameters
+from .atoms import Atom, a, is_atom
 
-# TODO: more flexible tokenization
-def to_tokens(text):
-    return re.findall(r'(\w+|[^\s])', text)
-
 
 def make_template(knowledge_base, tokens, parsed):
     matcher = list(tokens)
@@ -87,7 +84,7 @@ def integrate_language(knowledge_base, example):
     parsed = example["parsed"]
 
     resolved_parsed = copy.deepcopy(parsed)
-    tokens = to_tokens(text)
+    tokens = list(tokenization.pick_one_tokenization(tokenization.to_tokens(knowledge_base, text), knowledge_base))
 
     while True:
         session().annotate("P: {}".format(resolved_parsed))
@@ -226,12 +223,22 @@ def all_indexes(collection, element):
 def all_matching_indexes(knowledge_base, collection, element):
     indexes = []
 
+    with session().log('Matching “{}”'.format(element)):
         assert("groups" in element)
         element = element["groups"]
         for i, instance in enumerate(collection):
+            session().log('Checking “{}”'.format(instance))
+
             if isinstance(instance, dict):
                 instance = instance["groups"]
             elif instance in knowledge_base.knowledge:
+                session().log('Knowledge about “{}”: ”{}”'.format(instance, knowledge_base.knowledge[instance]))
+
+                if "groups" not in knowledge_base.knowledge[instance]:
+                    # This means that is only known as token
+                    # so we should try to avoid using it
+                    continue
+
                 instance = knowledge_base.knowledge[instance]["groups"]
 
             intersection = set(instance) & set(element)
@@ -242,8 +249,9 @@ def all_matching_indexes(knowledge_base, collection, element):
 
 
 def element_matches_groups(knowledge, element: Dict, groups):
+    with session().log("Checking if e “{}” matches groups “{}”".format(element, groups)):
         if isinstance(groups, str) and groups in knowledge:
-            return len(knowledge[element].get("groups", set()) & element['groups']) > 0
+            return len(knowledge[groups].get("groups", set()) & element['groups']) > 0
         elif isinstance(groups, dict):
            return len(element.get("groups", set()) & element['groups']) > 0
         return False
@@ -1,7 +1,8 @@
 import traceback
 import logging
-import datetime
 
 from .session import org_mode
 
+from .tests import tokenization
 from .tests import basic
 from .tests import gac_100
 from .tests import gac_extension
@@ -9,6 +10,7 @@ from .tests import gac_extension
 logging.getLogger().setLevel(logging.ERROR)
 
 tests = (
+    ("tokenization", tokenization),
     ("basic", basic),
     ("gac 100", gac_100),
     ("gac+", gac_extension),
@@ -24,12 +26,14 @@ def main():
     failed = False
     for test_name, test_module in tests:
         try:
+            with org_mode.global_session().log(test_name):
                 test_module.main()
             print(" \x1b[1;32m✓\x1b[0m {}".format(test_name))
         except AssertionError as ae:
             print(" \x1b[1;31m✗\x1b[0m {}{}".format(test_name,
                   ('\n [Assertion] {}'.format(ae.args[0])) if len(ae.args) > 0
                   else ''))
+            traceback.print_exc()
             failed = True
 
         except Exception as e:
@@ -3,6 +3,7 @@ import json
 
 from ..knowledge_base import KnowledgeBase
 from ..modifiable_property import is_modifiable_property
+from ..utils.tokenization import train_basic_tokenization
 
 examples = [
     {
@@ -107,6 +108,9 @@ base_knowledge = {
     'swim': {
         "groups": {'verb'},
     },
+    'planet': {
+        'groups': {'noun'}
+    }
 }
 
 def test_assumption(expectedResponse, knowledge, query):
@@ -125,6 +129,8 @@ def main():
         knowledge=base_knowledge,
     )
 
+    train_basic_tokenization(knowledge)
+
     for example in examples:
         with session().log(example['text']):
             differences = knowledge.train([example])
@@ -2,6 +2,7 @@ from ..session.org_mode import global_session as session
 from ..knowledge_base import KnowledgeBase
 from ..utils.visuals import show_progbar
 from ..visualization import show_knowledge
+from ..utils.tokenization import train_basic_tokenization
 
 def _assert(args):
     assert(args)
@@ -667,6 +668,10 @@ base_knowledge = {
     'electricity': {
         "groups": {'power'},
     },
+    'airplanes': {},
+    'white': {
+        'groups': {'property'},
+    }
 }
 
 def main():
@@ -674,6 +679,8 @@ def main():
         knowledge=base_knowledge,
    )
 
+    train_basic_tokenization(knowledge)
+
     total = len(examples)
 
     for i, (example_type, data) in enumerate(examples):
@@ -22,4 +22,5 @@ def ask_then_learn_test(knowledge: KnowledgeBase):
 def main():
     knowledge = gac_100.main()
 
+    knowledge.knowledge['blue'] = {'groups': {'property'}}
     knowledge = ask_then_learn_test(knowledge)
naive-nlu/tree_nlu/tests/tokenization.py (new file, 80 lines)

@@ -0,0 +1,80 @@
+from ..session.org_mode import global_session as session
+from ..knowledge_base import KnowledgeBase
+from ..utils.visuals import show_progbar
+from ..visualization import show_knowledge
+
+
+def _assert(args):
+    assert(args)
+
+
+def _assert_msg(args, msg):
+    assert args, msg
+
+
+EXAMPLES = [
+    ('example', {
+        "text": 'cat',
+        "tokens": ['cat'],
+    }),
+    ('example', {
+        "text": 'cats',
+        "tokens": ['cats'],
+        "meaning": { 'cats': ('add-modifier', 'cat', 'plural') },
+    }),
+    ('example', {
+        "text": 'text separated by spaces',
+        "tokens": ['text', 'separated', 'by', 'spaces'],
+    }),
+    ('example', {
+        "text": 'is earth a planet?',
+        "tokens": ['is', 'earth', 'a', 'planet', '?'],
+    }),
+    ('test', {
+        "text": 'plane',
+        "tokens": ['plane'],
+    }),
+    # ('test', {
+    #     "text": 'planes',
+    #     "tokens": ['planes'],
+    #     "meaning": { 'planes': ('add-modifier', 'plane', 'plural') },
+    # }),
+    ('test', {
+        "text": 'some other text',
+        "tokens": ['some', 'other', 'text'],
+    }),
+    ('test', {
+        "text": 'is the sun a star?',
+        "tokens": ['is', 'the', 'sun', 'a', 'star', '?'],
+    }),
+    ('test', {
+        "text": 'sometextnotseparatedbyspaces',
+        "tokens": ['some', 'text', 'not', 'separated', 'by', 'spaces'],
+    })
+]
+
+
+def main():
+    knowledge = KnowledgeBase()
+
+    total = len(EXAMPLES)
+
+    for i, (case_type, example) in enumerate(EXAMPLES):
+        show_progbar(i, total, example['text'])
+        if case_type == 'example':
+            with session().log(example['text']):
+                knowledge.train_tokenizer(example)
+
+        elif case_type == 'test':
+            with session().log(example['text']):
+                tokens = list(knowledge.tokenize(example['text']))
+
+                session().log('Expected “{}”, found “{}”'
+                              .format(example['tokens'], tokens))
+                assert example['tokens'] == tokens
+
+        else:
+            raise Exception('Not implemented case {}'.format(case_type))
+
+    print("\r\x1b[K", end='')
+    return knowledge
naive-nlu/tree_nlu/tokenization.py (new file, 186 lines)

@@ -0,0 +1,186 @@
+from .session.org_mode import global_session as session
+from .atoms import Atom, a, is_atom
+
+
+def lookahead_for_tokens_or_strucutral_elements(knowledge_base, remaining):
+    for se in knowledge_base.structural_elements:
+        found_position = remaining.find(se)
+        found = found_position >= 0
+        session().annotate('Looking for structure with “{}”, found? {}'.format(se, found))
+        if found:
+            return [
+                (remaining[:found_position], se, remaining[found_position + len(se):])
+            ]
+
+    for token in knowledge_base.knowledge.keys():
+        found_position = remaining.find(token)
+        found = found_position >= 0
+        session().annotate('Looking for token “{}”, found? {}'.format(token, found))
+        if found:
+            return [
+                (remaining[:found_position], token, remaining[found_position + len(token):])
+            ]
+
+    return None
+
+
+def to_tokens(knowledge_base, text, precedent=None):
+    if len(text) == 0:
+        session().annotate("No text remaining")
+        yield ['']
+        return
+
+    with session().log("Tokenizing {}".format(text)):
+        for option in knowledge_base.expected_token_after_precedent(precedent):
+            with session().log("Next: “{}”".format(option)):
+                with session().log("Matching “{}” on “{}”".format(option, text)):
+                    for token_match in tokenization_match(option, text, knowledge_base):
+                        if token_match is None:
+                            session().annotate("No match")
+
+                        match, remaining = token_match
+                        if len(remaining) == len(text):
+                            raise Exception('No text consumed in match')
+
+                        session().annotate('Match: “{}”'.format(match))
+                        with session().log('Remaining “{}”'.format(remaining)):
+                            for sublevel in to_tokens(knowledge_base, remaining, match):
+                                candidate = list(filter(lambda x: x != '', [match] + sublevel))
+                                session().annotate('Yielding candidate “{}”'.format(candidate))
+                                yield candidate
+
+
+def tokenization_match(element, text, knowledge_base):
+    # Constant/structural string matching
+    if isinstance(element, str):
+        if text.find(element) == 0:
+            # This match comes from a structuring element
+            # It doesn't appear on the tokenization
+            # So we should return it as an empty string
+            yield ('', text[len(element):])
+            return
+        else:
+            # No match found
+            return
+
+    elif is_atom(element, 'token'):
+        yield from match_single_token(text, knowledge_base)
+        return
+    raise NotImplementedError()
+
+
+def match_single_token(text, knowledge_base):
+    found_token = False
+    for token in knowledge_base.knowledge.keys():
+        if text.find(token) == 0:
+            yield token, text[len(token):]
+            found_token = True
+
+    if found_token:
+        return
+
+    session().annotate('No token found at the start of ”{}”'.format(text))
+    session().annotate('using structural elements to infer it')
+    # TODO: review this when multiple structural elements are available
+    for se in knowledge_base.structural_elements:
+        session().annotate('Looking for se “{}” in “{}”'.format(se, text))
+        position = text.find(se, 0)
+        found = position > 0  # 0 is not considered a valid position for this kind of split
+        if found:
+            session().annotate('Found ”{}”, inferring “{}”'.format(se, text[:position]))
+            yield text[:position], text[position:]
+
+    session().annotate('No structural element or token found, inferring only token remaining')
+    yield text, ''
+
+    # Using other tokens for cutoff
+    for token in knowledge_base.knowledge.keys():
+        session().annotate('Looking for token “{}” in “{}”'.format(token, text))
+        position = text.find(token)
+        found = position >= 0
+        if found:
+            session().annotate('Found ”{}”, in position ”{}”'.format(token, position))
+            yield text[:position], text[position:]
+
+
+def integrate_tokenization(knowledge_base, example):
+    text = example['text']
+    tokens = example['tokens']
+    meaning = example.get('meaning')
+
+    return integrate_token_to_text_matching(knowledge_base, text, tokens)
+
+
+def integrate_token_to_text_matching(knowledge_base, text, tokens):
+    texts = [text]
+
+    # Convert to tokens
+    for token_id, token in enumerate(tokens):
+        # Look for token in texts
+        for i, text in enumerate(texts):
+            if isinstance(text, int):
+                continue
+
+            if token in text:
+                before, after = text.split(token, maxsplit=1)
+                texts = (texts[:i] + [before]
+                         + [a('token')]
+                         + [after] + texts[i + 1:])
+                break
+        else:
+            raise Exception('Token not found')
+
+    # Remove leftovers from splits
+    texts = list(filter(lambda x: x != '', texts))
+    session().log("Tokenized as {} over {}".format(texts, tokens))
+
+    for i, element in enumerate(texts[:-1]):
+        learn_token_pair(element, texts[i + 1], knowledge_base)
+
+    return tokens
+
+
+def learn_token_pair(precedent, consequent, knowledge_base):
+    knowledge_base.add_token_pair(precedent, consequent)
+
+
+def pick_one_tokenization(options, knowledge_base):
+    '''
+    Heuristic function to pick the most probable tokenization.
+
+    Just pick the one with more results.
+    '''
+    options = list(options)
+    with session().log("Picking among: {} options".format(len(options))):
+        session().log("Options: \n{}".format('\n'.join(map(str, options))))
+        return pick_by_score(options,
+                             [
+                                 # By number of splits without structuring elements
+                                 lambda tokenization: sum(map(
+                                     lambda split: sum(map(
+                                         lambda se: se in split, knowledge_base.structural_elements
+                                     )), tokenization)),
+
+                                 # By number of unknown tokens
+                                 lambda tokenization: len(list(filter(lambda token:
+                                                                      (token not in knowledge_base.knowledge.keys()) and
+                                                                      (token not in knowledge_base.structural_elements),
+                                                                      tokenization))),
+
+                                 # By number of splits
+                                 lambda tokenization: -len(tokenization),
+                             ])
+
+
+def pick_by_score(options, heuristics):
+    for heuristic in heuristics:
+        assert(len(options) > 0)
+        options = list(map(lambda opt: (heuristic(opt), opt), options))
+        sorted_options = sorted(options, key=lambda x: x[0], reverse=False)
+
+        heuristic_cutoff = sorted_options[0][0]
+        session().annotate(sorted_options)
+        pass_heuristic = [opt for (score, opt) in sorted_options if score <= heuristic_cutoff]
+        options = pass_heuristic
+
+        session().log("{} finalists: \n{}".format(len(options), '\n'.join(map(str, options))))
+    return options[0]
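
The selection logic behind pick_by_score, as a stand-alone sketch (not part of the diff; the session logging is stripped so it can run on its own):

# Sketch, not part of the commit: the idea behind pick_by_score with the
# logging removed. Each heuristic keeps only the lowest-scoring options;
# later heuristics only break the remaining ties, and the last one (-len)
# favours the tokenization with more, smaller tokens.
def pick_by_score_sketch(options, heuristics):
    for heuristic in heuristics:
        scored = sorted(((heuristic(opt), opt) for opt in options), key=lambda x: x[0])
        cutoff = scored[0][0]
        options = [opt for score, opt in scored if score <= cutoff]
    return options[0]

print(pick_by_score_sketch(
    [['someother', 'text'], ['some', 'other', 'text']],
    [lambda tokenization: -len(tokenization)]))
# -> ['some', 'other', 'text']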
naive-nlu/tree_nlu/utils/tokenization.py (new file, 29 lines)

@@ -0,0 +1,29 @@
+from ..session.org_mode import (
+    global_session as session,
+)
+
+BASIC_TOKENIZATION_EXAMPLES = (
+    ({
+        "text": 'cat',
+        "tokens": ['cat'],
+    }),
+    ({
+        "text": 'cats',
+        "tokens": ['cats'],
+        "meaning": { 'cats': ('add-modifier', 'cat', 'plural') },
+    }),
+    ({
+        "text": 'text separated by spaces',
+        "tokens": ['text', 'separated', 'by', 'spaces'],
+    }),
+    ({
+        "text": 'is earth a planet?',
+        "tokens": ['is', 'earth', 'a', 'planet', '?'],
+    }),
+)
+
+
+def train_basic_tokenization(knowledge_base):
+    with session().log('Training basic tokenization'):
+        for example in BASIC_TOKENIZATION_EXAMPLES:
+            knowledge_base.train_tokenizer(example)
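
How this helper is wired in (a sketch, not part of the diff; it mirrors the calls added to tests/basic.py and tests/gac_100.py, with an assumed absolute import path):

# Sketch, not part of the commit: mirrors how tests/basic.py and tests/gac_100.py
# call the helper right after building their KnowledgeBase, so sentence examples
# can be tokenized before training starts.
from tree_nlu.knowledge_base import KnowledgeBase
from tree_nlu.utils.tokenization import train_basic_tokenization

knowledge = KnowledgeBase(knowledge=dict())   # the tests pass their base_knowledge here
train_basic_tokenization(knowledge)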