From fc374505657efdcc28ec79a6cfa7a9521bda722d Mon Sep 17 00:00:00 2001
From: kenkeiras
Date: Sun, 1 Apr 2018 20:24:09 +0200
Subject: [PATCH] Add (non-passing) tokenization.

---
 naive-nlu/tree_nlu/atoms.py              |  14 ++++
 naive-nlu/tree_nlu/knowledge_base.py     |  24 ++++--
 naive-nlu/tree_nlu/parsing.py            | 102 ++++++++++++++++++++++-
 naive-nlu/tree_nlu/test.py               |   8 +-
 naive-nlu/tree_nlu/tests/basic.py        |   6 ++
 naive-nlu/tree_nlu/tests/tokenization.py |  67 +++++++++++++++
 naive-nlu/tree_nlu/utils/tokenization.py |  19 +++++
 7 files changed, 229 insertions(+), 11 deletions(-)
 create mode 100644 naive-nlu/tree_nlu/atoms.py
 create mode 100644 naive-nlu/tree_nlu/tests/tokenization.py
 create mode 100644 naive-nlu/tree_nlu/utils/tokenization.py

diff --git a/naive-nlu/tree_nlu/atoms.py b/naive-nlu/tree_nlu/atoms.py
new file mode 100644
index 0000000..a0028e5
--- /dev/null
+++ b/naive-nlu/tree_nlu/atoms.py
@@ -0,0 +1,14 @@
+'''
+Analogous to erlang ones.
+
+"An atom is a literal, a constant with name."
+'''
+
+from collections import namedtuple
+
+Atom = namedtuple('Atom', field_names='name')
+
+
+def a(name):
+    '''Build an atom with a given name.'''
+    return Atom(name)
diff --git a/naive-nlu/tree_nlu/knowledge_base.py b/naive-nlu/tree_nlu/knowledge_base.py
index 931801f..830a6f3 100644
--- a/naive-nlu/tree_nlu/knowledge_base.py
+++ b/naive-nlu/tree_nlu/knowledge_base.py
@@ -14,11 +14,16 @@ def diff_knowledge(before, after):
 
 class KnowledgeBase(object):
-    def __init__(self, knowledge, examples=[], trained=[]):
+    def __init__(self, knowledge={}, examples=[], trained=[]):
         self.knowledge = copy.copy(knowledge)
         self.originals = []
         self.examples = copy.copy(examples)
         self.trained = copy.copy(trained)
+        self.tokenization = set()
+
+    def train_tokenizer(self, example):
+        with session().log('Train'):
+            parsing.integrate_tokenization(self, example)
 
     def train(self, examples):
         knowledge_before = copy.deepcopy(self.knowledge)
@@ -26,7 +31,7 @@ class KnowledgeBase(object):
         # Parse everything
         for example in examples:
             # If there's parsed data, leverage it ASAP
-            if 'parsed' in example:
+            if 'parsed' in example and isinstance(example['parsed'], tuple):
                 with session().log('parsed information integration'):
                     result = knowledge_evaluation.integrate_information(self.knowledge, {
                         "parsed": example['parsed'],
@@ -35,7 +40,8 @@ class KnowledgeBase(object):
 
             with session().log("language integration"):
                 tokens, decomposition, inferred_tree = parsing.integrate_language(self, example)
-                session().annotate(tokens)
+                session().annotate("Tokens: {}".format(tokens))
+                session().annotate("Inferred tree: {}".format(inferred_tree))
 
             with session().log("full information integration"):
                 result = knowledge_evaluation.integrate_information(self.knowledge, {
@@ -60,11 +66,19 @@ class KnowledgeBase(object):
 
         return knowledge_diff_getter
 
-    def process(self, row):
+    def tokenize(self, row, return_one=True):
         row = row.lower()
+        with session().log("Tokenize: {}".format(row)):
+            options = parsing.to_tokens(self, row)
+            if return_one:
+                return parsing.pick_one_tokenization(options)
+            return options
+
+    def process(self, row):
         knowledge_before = copy.deepcopy(self.knowledge)
         with session().log("Process: {}".format(row)):
-            tokens = parsing.to_tokens(row)
+            tokens = self.tokenize(row)
+
             fit = parsing.get_fit(self, tokens)
             if fit is None:
                 return None
diff --git a/naive-nlu/tree_nlu/parsing.py b/naive-nlu/tree_nlu/parsing.py
index 8081265..6cae405 100644
--- a/naive-nlu/tree_nlu/parsing.py
+++ b/naive-nlu/tree_nlu/parsing.py
@@ -11,11 +11,105 @@
 from functools import reduce
 from typing import List, Dict
 
 from .modifiable_property import ModifiableProperty
 from . import parameters
+from .atoms import Atom, a
 
 
-# TODO: more flexible tokenization
-def to_tokens(text):
-    return re.findall(r'(\w+|[^\s])', text)
+def to_tokens(knowledge_base, text, acc=None):
+    # TODO This is an extra-naïve implementation
+    found = 0
+    for tokenization in knowledge_base.tokenization:
+        remaining = text
+        possibility = []
+
+        for i, token in enumerate(tokenization):
+            if token == Atom('token'):
+                for thing in knowledge_base.knowledge.keys():
+                    if remaining.startswith(thing):
+                        # TODO We should also branch here, probably :\
+                        remaining = remaining[len(thing):]
+                        possibility.append(thing)
+                else:
+                    if i + 1 >= len(tokenization):
+                        possibility.append(remaining)
+                        remaining = ""
+
+                    else:
+                        # Try with (HYPERSIMPLISTIC!) backtracking
+                        # Cut using the next token we should use more!!!
+                        next_token = tokenization[i + 1]
+                        cutoff = remaining.find(next_token)
+                        if cutoff < 0:
+                            break
+
+                        possibility.append(remaining[:cutoff])
+                        remaining = remaining[cutoff:]
+            else:
+                if remaining.find(token) < 0:  # Not inmediately after!
+                    break
+                remaining = remaining[len(token):]
+
+        else:
+            # Tokenization applicable
+            found += 1
+            if remaining == '':
+                yield possibility
+            else:
+                for consecuent in to_tokens(knowledge_base, remaining, possibility):
+                    yield list(filter(lambda x: x != '', possibility + consecuent))
+    if found == 0:
+        raise Exception('No tokenization found')
+
+def integrate_tokenization(knowledge_base, example):
+    text = example['text']
+    tokens = example['tokens']
+    meaning = example.get('meaning')
+
+    return integrate_token_to_text_matching(knowledge_base, text, tokens)
+
+
+def integrate_token_to_text_matching(knowledge_base, text, tokens):
+    texts = [text]
+
+    # Convert to tokens
+    for token_id, token in enumerate(tokens):
+        # Look for token in texts
+        for i, text in enumerate(texts):
+            if isinstance(text, int):
+                continue
+
+            if token in text:
+                before, after = text.split(token, maxsplit=1)
+                texts = (texts[:i] + [before]
+                         + [token_id]
+                         + [after] + texts[i + 1:])
+                break
+        else:
+            raise Exception('Token not found')
+
+    # Remove leftovers from splits
+    texts = list(filter(lambda x: x != '', texts))
+
+    for token_id, _token in enumerate(tokens):
+        # Find all elements between current token and next token
+        i = texts.index(token_id)
+        elements = [a('token')]
+
+        i += 1
+        while i < len(texts) and not isinstance(texts[i], int):
+            elements.append(texts[i])
+            i += 1
+
+        knowledge_base.tokenization.add(tuple(elements))
+
+def pick_one_tokenization(options):
+    '''
+    Heuristic function to pick the most probable tokenization.
+
+    Just pick the one with more results.
+    '''
+    return sorted(options,
+                  key=lambda tokenization: len(tokenization),
+                  reverse=True)[0]
 
 
 def make_template(knowledge_base, tokens, parsed):
     matcher = list(tokens)
@@ -87,7 +181,7 @@ def integrate_language(knowledge_base, example):
     parsed = example["parsed"]
 
     resolved_parsed = copy.deepcopy(parsed)
-    tokens = to_tokens(text)
+    tokens = list(pick_one_tokenization(to_tokens(knowledge_base, text)))
 
     while True:
         session().annotate("P: {}".format(resolved_parsed))
diff --git a/naive-nlu/tree_nlu/test.py b/naive-nlu/tree_nlu/test.py
index 1cdfe11..683f85e 100644
--- a/naive-nlu/tree_nlu/test.py
+++ b/naive-nlu/tree_nlu/test.py
@@ -1,7 +1,8 @@
 import traceback
 import logging
-import datetime
 from .session import org_mode
+
+from .tests import tokenization
 from .tests import basic
 from .tests import gac_100
 from .tests import gac_extension
@@ -9,6 +10,7 @@ from .tests import gac_extension
 logging.getLogger().setLevel(logging.ERROR)
 
 tests = (
+    ("tokenization", tokenization),
     ("basic", basic),
     ("gac 100", gac_100),
     ("gac+", gac_extension),
 )
@@ -24,12 +26,14 @@ def main():
     failed = False
     for test_name, test_module in tests:
         try:
-            test_module.main()
+            with org_mode.global_session().log(test_name):
+                test_module.main()
             print(" \x1b[1;32m✓\x1b[0m {}".format(test_name))
         except AssertionError as ae:
             print(" \x1b[1;31m✗\x1b[0m {}{}".format(test_name,
                                                     ('\n [Assertion] {}'.format(ae.args[0]))
                                                     if len(ae.args) > 0
                                                     else ''))
+            traceback.print_exc()
             failed = True
         except Exception as e:
diff --git a/naive-nlu/tree_nlu/tests/basic.py b/naive-nlu/tree_nlu/tests/basic.py
index 4038bc6..bda8261 100644
--- a/naive-nlu/tree_nlu/tests/basic.py
+++ b/naive-nlu/tree_nlu/tests/basic.py
@@ -3,6 +3,7 @@ import json
 
 from ..knowledge_base import KnowledgeBase
 from ..modifiable_property import is_modifiable_property
+from ..utils.tokenization import train_basic_tokenization
 
 examples = [
     {
@@ -107,6 +108,9 @@ base_knowledge = {
     'swim': {
         "groups": {'verb'},
     },
+    'planet': {
+        'groups': {'noun'}
+    }
 }
 
 def test_assumption(expectedResponse, knowledge, query):
@@ -125,6 +129,8 @@ def main():
         knowledge=base_knowledge,
     )
 
+    train_basic_tokenization(knowledge)
+
     for example in examples:
         with session().log(example['text']):
             differences = knowledge.train([example])
diff --git a/naive-nlu/tree_nlu/tests/tokenization.py b/naive-nlu/tree_nlu/tests/tokenization.py
new file mode 100644
index 0000000..5a62def
--- /dev/null
+++ b/naive-nlu/tree_nlu/tests/tokenization.py
@@ -0,0 +1,67 @@
+from ..session.org_mode import global_session as session
+from ..knowledge_base import KnowledgeBase
+from ..utils.visuals import show_progbar
+from ..visualization import show_knowledge
+
+
+def _assert(args):
+    assert(args)
+
+
+def _assert_msg(args, msg):
+    assert args, msg
+
+
+EXAMPLES = [
+    ('example', {
+        "text": 'cat',
+        "tokens": ['cat'],
+    }),
+    ('example', {
+        "text": 'cats',
+        "tokens": ['cats'],
+        "meaning": { 'cats': ('add-modifier', 'cat', 'plural') },
+    }),
+    ('example', {
+        "text": 'text separated by spaces',
+        "tokens": ['text', 'separated', 'by', 'spaces'],
+    }),
+
+    ('test', {
+        "text": 'plane',
+        "tokens": ['plane'],
+    }),
+    ('test', {
+        "text": 'planes',
+        "tokens": ['planes'],
+        "meaning": { 'planes': ('add-modifier', 'plane', 'plural') },
+    }),
+    ('test', {
+        "text": 'some other text',
+        "tokens": ['some', 'other', 'text'],
+    })
+]
+
+
+def main():
+    knowledge = KnowledgeBase()
+
+    total = len(EXAMPLES)
+
+    for i, (case_type, example) in enumerate(EXAMPLES):
+        show_progbar(i, total, example['text'])
+        if case_type == 'example':
+            with session().log(example['text']):
+                knowledge.train_tokenizer(example)
+
+        elif case_type == 'test':
+            with session().log(example['text']):
+                tokens = list(knowledge.tokenize(example['text']))
+
+                assert example['tokens'] == tokens
+
+        else:
+            raise Exception('Not implemented case {}'.format(case_type))
+
+    print("\r\x1b[K", end='')
+    return knowledge
diff --git a/naive-nlu/tree_nlu/utils/tokenization.py b/naive-nlu/tree_nlu/utils/tokenization.py
new file mode 100644
index 0000000..9b9ee11
--- /dev/null
+++ b/naive-nlu/tree_nlu/utils/tokenization.py
@@ -0,0 +1,19 @@
+BASIC_TOKENIZATION_EXAMPLES = (
+    ({
+        "text": 'cat',
+        "tokens": ['cat'],
+    }),
+    ({
+        "text": 'text separated by spaces',
+        "tokens": ['text', 'separated', 'by', 'spaces'],
+    }),
+    ({
+        "text": 'is earth a planet?',
+        "tokens": ['is', 'earth', 'a', 'planet', '?'],
+    }),
+)
+
+
+def train_basic_tokenization(knowledge_base):
+    for example in BASIC_TOKENIZATION_EXAMPLES:
+        knowledge_base.train_tokenizer(example)
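
Usage sketch (reviewer note, not part of the patch): the snippet below mirrors the flow exercised by tests/tokenization.py and utils/tokenization.py, assuming the tree_nlu package is importable (for example, running from the naive-nlu directory). Since the subject line marks the tokenization as non-passing, this only illustrates the intended API, not verified behaviour.

    # Hypothetical driver; the import path is an assumption, not part of the patch.
    from tree_nlu.knowledge_base import KnowledgeBase

    kb = KnowledgeBase()  # knowledge now defaults to {}

    # Teach the tokenizer from text/tokens pairs; each call goes through
    # parsing.integrate_tokenization and records a tuple of Atom('token')
    # slots and literal separators in kb.tokenization.
    kb.train_tokenizer({
        "text": 'text separated by spaces',
        "tokens": ['text', 'separated', 'by', 'spaces'],
    })

    # By default tokenize() returns one option via parsing.pick_one_tokenization...
    tokens = list(kb.tokenize('some other text'))

    # ...or ask for every candidate tokenization instead.
    options = kb.tokenize('some other text', return_one=False)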