Add (non-passing) tokenization.

parent 75174e1736
commit fc37450565
naive-nlu/tree_nlu/atoms.py (new file, 14 lines)
@@ -0,0 +1,14 @@
+'''
+Analogous to erlang ones.
+
+"An atom is a literal, a constant with name."
+'''
+
+from collections import namedtuple
+
+Atom = namedtuple('Atom', field_names='name')
+
+
+def a(name):
+    '''Build an atom with a given name.'''
+    return Atom(name)
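For orientation, a small usage sketch (not part of the commit; the import path is an assumption, since the package directory is tree_nlu/): atoms are plain namedtuples, so they compare by value, which is what the tokenizer below relies on when it matches stored patterns against Atom('token').

    from tree_nlu.atoms import Atom, a   # assumed import path

    token_marker = a('token')
    assert token_marker == Atom('token')   # namedtuple equality is by value
    assert token_marker.name == 'token'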
naive-nlu/tree_nlu/knowledge_base.py
@@ -14,11 +14,16 @@ def diff_knowledge(before, after):
 
 
 class KnowledgeBase(object):
-    def __init__(self, knowledge, examples=[], trained=[]):
+    def __init__(self, knowledge={}, examples=[], trained=[]):
         self.knowledge = copy.copy(knowledge)
         self.originals = []
         self.examples = copy.copy(examples)
         self.trained = copy.copy(trained)
+        self.tokenization = set()
+
+    def train_tokenizer(self, example):
+        with session().log('Train'):
+            parsing.integrate_tokenization(self, example)
 
     def train(self, examples):
         knowledge_before = copy.deepcopy(self.knowledge)
@@ -26,7 +31,7 @@ class KnowledgeBase(object):
         # Parse everything
         for example in examples:
             # If there's parsed data, leverage it ASAP
-            if 'parsed' in example:
+            if 'parsed' in example and isinstance(example['parsed'], tuple):
                 with session().log('parsed information integration'):
                     result = knowledge_evaluation.integrate_information(self.knowledge, {
                         "parsed": example['parsed'],
@@ -35,7 +40,8 @@ class KnowledgeBase(object):
 
             with session().log("language integration"):
                 tokens, decomposition, inferred_tree = parsing.integrate_language(self, example)
-                session().annotate(tokens)
+                session().annotate("Tokens: {}".format(tokens))
+                session().annotate("Inferred tree: {}".format(inferred_tree))
 
             with session().log("full information integration"):
                 result = knowledge_evaluation.integrate_information(self.knowledge, {
@@ -60,11 +66,19 @@ class KnowledgeBase(object):
 
         return knowledge_diff_getter
 
-    def process(self, row):
+    def tokenize(self, row, return_one=True):
+        row = row.lower()
+        with session().log("Tokenize: {}".format(row)):
+            options = parsing.to_tokens(self, row)
+            if return_one:
+                return parsing.pick_one_tokenization(options)
+            return options
+
+    def process(self, row):
         knowledge_before = copy.deepcopy(self.knowledge)
         with session().log("Process: {}".format(row)):
-            tokens = parsing.to_tokens(row)
+            tokens = self.tokenize(row)
 
             fit = parsing.get_fit(self, tokens)
             if fit is None:
                 return None
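In rough terms, the surface this adds to KnowledgeBase can be exercised as in the sketch below (illustrative only, not part of the commit; it assumes kb is a KnowledgeBase instance and that an org_mode logging session is active, since both methods call session()):

    # Teach the tokenizer from an example of the same shape the tests use.
    kb.train_tokenizer({'text': 'text separated by spaces',
                        'tokens': ['text', 'separated', 'by', 'spaces']})

    # tokenize() lowercases the row and defers to parsing.to_tokens();
    # by default it collapses the candidates via pick_one_tokenization().
    best = kb.tokenize('Some other text')                        # a single token list
    options = kb.tokenize('Some other text', return_one=False)   # generator of candidates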
naive-nlu/tree_nlu/parsing.py
@@ -11,11 +11,105 @@ from functools import reduce
 from typing import List, Dict
 from .modifiable_property import ModifiableProperty
 from . import parameters
+from .atoms import Atom, a
 
 # TODO: more flexible tokenization
-def to_tokens(text):
-    return re.findall(r'(\w+|[^\s])', text)
+def to_tokens(knowledge_base, text, acc=None):
+    # TODO This is an extra-naïve implementation
+    found = 0
+
+    for tokenization in knowledge_base.tokenization:
+        remaining = text
+        possibility = []
+
+        for i, token in enumerate(tokenization):
+            if token == Atom('token'):
+                for thing in knowledge_base.knowledge.keys():
+                    if remaining.startswith(thing):
+                        # TODO We should also branch here, probably :\
+                        remaining = remaining[len(thing):]
+                        possibility.append(thing)
+                else:
+                    if i + 1 >= len(tokenization):
+                        possibility.append(remaining)
+                        remaining = ""
+
+                    else:
+                        # Try with (HYPERSIMPLISTIC!) backtracking
+                        # Cut using the next token we should use more!!!
+                        next_token = tokenization[i + 1]
+                        cutoff = remaining.find(next_token)
+                        if cutoff < 0:
+                            break
+
+                        possibility.append(remaining[:cutoff])
+                        remaining = remaining[cutoff:]
+            else:
+                if remaining.find(token) < 0:  # Not immediately after!
+                    break
+                remaining = remaining[len(token):]
+
+        else:
+            # Tokenization applicable
+            found += 1
+            if remaining == '':
+                yield possibility
+            else:
+                for consecuent in to_tokens(knowledge_base, remaining, possibility):
+                    yield list(filter(lambda x: x != '', possibility + consecuent))
+    if found == 0:
+        raise Exception('No tokenization found')
+
+
+def integrate_tokenization(knowledge_base, example):
+    text = example['text']
+    tokens = example['tokens']
+    meaning = example.get('meaning')
+
+    return integrate_token_to_text_matching(knowledge_base, text, tokens)
+
+
+def integrate_token_to_text_matching(knowledge_base, text, tokens):
+    texts = [text]
+
+    # Convert to tokens
+    for token_id, token in enumerate(tokens):
+        # Look for token in texts
+        for i, text in enumerate(texts):
+            if isinstance(text, int):
+                continue
+
+            if token in text:
+                before, after = text.split(token, maxsplit=1)
+                texts = (texts[:i] + [before]
+                         + [token_id]
+                         + [after] + texts[i + 1:])
+                break
+        else:
+            raise Exception('Token not found')
+
+    # Remove leftovers from splits
+    texts = list(filter(lambda x: x != '', texts))
+
+    for token_id, _token in enumerate(tokens):
+        # Find all elements between current token and next token
+        i = texts.index(token_id)
+        elements = [a('token')]
+
+        i += 1
+        while i < len(texts) and not isinstance(texts[i], int):
+            elements.append(texts[i])
+            i += 1
+
+        knowledge_base.tokenization.add(tuple(elements))
+
+
+def pick_one_tokenization(options):
+    '''
+    Heuristic function to pick the most probable tokenization.
+
+    Just pick the one with the most tokens.
+    '''
+    return sorted(options,
+                  key=lambda tokenization: len(tokenization),
+                  reverse=True)[0]
 
 
 def make_template(knowledge_base, tokens, parsed):
     matcher = list(tokens)
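A worked trace may help here (a sketch under assumptions, not part of the commit: the import paths presume the package imports as tree_nlu, and the values follow the code shown above):

    from tree_nlu.knowledge_base import KnowledgeBase              # assumed import path
    from tree_nlu.parsing import integrate_token_to_text_matching  # assumed import path

    kb = KnowledgeBase()
    integrate_token_to_text_matching(kb, 'text separated by spaces',
                                     ['text', 'separated', 'by', 'spaces'])

    # The split-and-replace pass leaves texts == [0, ' ', 1, ' ', 2, ' ', 3], so the
    # patterns registered in kb.tokenization should be:
    #   (Atom('token'), ' ')   for 'text', 'separated' and 'by' (token plus following separator)
    #   (Atom('token'),)       for the trailing 'spaces' (nothing follows it)
    # to_tokens() later replays these patterns: each Atom('token') slot is filled with a
    # known knowledge entry or, failing that, with the substring up to the next literal.
    print(kb.tokenization)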
@@ -87,7 +181,7 @@ def integrate_language(knowledge_base, example):
     parsed = example["parsed"]
 
     resolved_parsed = copy.deepcopy(parsed)
-    tokens = to_tokens(text)
+    tokens = list(pick_one_tokenization(to_tokens(knowledge_base, text)))
 
     while True:
         session().annotate("P: {}".format(resolved_parsed))
@@ -1,7 +1,8 @@
 import traceback
 import logging
 import datetime
 from .session import org_mode
 
+from .tests import tokenization
 from .tests import basic
 from .tests import gac_100
 from .tests import gac_extension
@@ -9,6 +10,7 @@ from .tests import gac_extension
 logging.getLogger().setLevel(logging.ERROR)
 
 tests = (
+    ("tokenization", tokenization),
     ("basic", basic),
     ("gac 100", gac_100),
     ("gac+", gac_extension),
@@ -24,12 +26,14 @@ def main():
     failed = False
     for test_name, test_module in tests:
         try:
-            test_module.main()
+            with org_mode.global_session().log(test_name):
+                test_module.main()
+            print(" \x1b[1;32m✓\x1b[0m {}".format(test_name))
         except AssertionError as ae:
             print(" \x1b[1;31m✗\x1b[0m {}{}".format(test_name,
                   ('\n [Assertion] {}'.format(ae.args[0])) if len(ae.args) > 0
                   else ''))
             traceback.print_exc()
             failed = True
 
         except Exception as e:
@@ -3,6 +3,7 @@ import json
 
 from ..knowledge_base import KnowledgeBase
 from ..modifiable_property import is_modifiable_property
+from ..utils.tokenization import train_basic_tokenization
 
 examples = [
     {
@@ -107,6 +108,9 @@ base_knowledge = {
     'swim': {
         "groups": {'verb'},
     },
+    'planet': {
+        'groups': {'noun'}
+    }
 }
 
 def test_assumption(expectedResponse, knowledge, query):
@@ -125,6 +129,8 @@ def main():
         knowledge=base_knowledge,
     )
 
+    train_basic_tokenization(knowledge)
+
     for example in examples:
         with session().log(example['text']):
             differences = knowledge.train([example])
naive-nlu/tree_nlu/tests/tokenization.py (new file, 67 lines)
@@ -0,0 +1,67 @@
+from ..session.org_mode import global_session as session
+from ..knowledge_base import KnowledgeBase
+from ..utils.visuals import show_progbar
+from ..visualization import show_knowledge
+
+
+def _assert(args):
+    assert(args)
+
+
+def _assert_msg(args, msg):
+    assert args, msg
+
+
+EXAMPLES = [
+    ('example', {
+        "text": 'cat',
+        "tokens": ['cat'],
+    }),
+    ('example', {
+        "text": 'cats',
+        "tokens": ['cats'],
+        "meaning": { 'cats': ('add-modifier', 'cat', 'plural') },
+    }),
+    ('example', {
+        "text": 'text separated by spaces',
+        "tokens": ['text', 'separated', 'by', 'spaces'],
+    }),
+
+    ('test', {
+        "text": 'plane',
+        "tokens": ['plane'],
+    }),
+    ('test', {
+        "text": 'planes',
+        "tokens": ['planes'],
+        "meaning": { 'planes': ('add-modifier', 'plane', 'plural') },
+    }),
+    ('test', {
+        "text": 'some other text',
+        "tokens": ['some', 'other', 'text'],
+    })
+]
+
+
+def main():
+    knowledge = KnowledgeBase()
+
+    total = len(EXAMPLES)
+
+    for i, (case_type, example) in enumerate(EXAMPLES):
+        show_progbar(i, total, example['text'])
+        if case_type == 'example':
+            with session().log(example['text']):
+                knowledge.train_tokenizer(example)
+
+        elif case_type == 'test':
+            with session().log(example['text']):
+                tokens = list(knowledge.tokenize(example['text']))
+
+            assert example['tokens'] == tokens
+
+        else:
+            raise Exception('Not implemented case {}'.format(case_type))
+
+    print("\r\x1b[K", end='')
+    return knowledge
naive-nlu/tree_nlu/utils/tokenization.py (new file, 19 lines)
@@ -0,0 +1,19 @@
+BASIC_TOKENIZATION_EXAMPLES = (
+    ({
+        "text": 'cat',
+        "tokens": ['cat'],
+    }),
+    ({
+        "text": 'text separated by spaces',
+        "tokens": ['text', 'separated', 'by', 'spaces'],
+    }),
+    ({
+        "text": 'is earth a planet?',
+        "tokens": ['is', 'earth', 'a', 'planet', '?'],
+    }),
+)
+
+
+def train_basic_tokenization(knowledge_base):
+    for example in BASIC_TOKENIZATION_EXAMPLES:
+        knowledge_base.train_tokenizer(example)
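Tying the pieces together, an end-to-end sketch (illustrative, not from the commit; it assumes the package imports as tree_nlu and that an org_mode session is active, since train_tokenizer() and tokenize() log through session()):

    from tree_nlu.knowledge_base import KnowledgeBase                # assumed import path
    from tree_nlu.utils.tokenization import train_basic_tokenization # assumed import path

    kb = KnowledgeBase()
    train_basic_tokenization(kb)
    print(kb.tokenize('is earth a planet?'))
    # Intended output: ['is', 'earth', 'a', 'planet', '?']
    # (the commit message warns that this tokenization pass does not succeed yet)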