Add (non-passing) tokenization.

2018-04-01 20:24:09 +02:00 · 2018-04-01 20:24:09 +02:00 · fc37450565
commit fc37450565
parent 75174e1736
7 changed files with 229 additions and 11 deletions
--- a/naive-nlu/tree_nlu/atoms.py
+++ b/naive-nlu/tree_nlu/atoms.py
@ -0,0 +1,14 @@
 '''
 Analogous to Erlang atoms.
 "An atom is a literal, a constant with name."
 '''
 from collections import namedtuple
 # An atom is a one-field named tuple, so atoms with the same name are equal.
 Atom = namedtuple('Atom', ['name'])
 def a(name):
    '''Shorthand constructor: return the atom called `name`.'''
    return Atom(name=name)
--- a/naive-nlu/tree_nlu/knowledge_base.py
+++ b/naive-nlu/tree_nlu/knowledge_base.py
@ -14,11 +14,16 @@ def diff_knowledge(before, after):
 class KnowledgeBase(object):
-    def __init__(self, knowledge, examples=[], trained=[]):
+    def __init__(self, knowledge={}, examples=[], trained=[]):
        self.knowledge = copy.copy(knowledge)
        self.originals = []
        self.examples = copy.copy(examples)
        self.trained = copy.copy(trained)
        self.tokenization = set()
    def train_tokenizer(self, example):
        '''
        Learn tokenization patterns from a single example.

        The example is passed, together with this knowledge base, to
        `parsing.integrate_tokenization`; the call runs inside a 'Train'
        session log entry.
        '''
        train_log = session().log('Train')
        with train_log:
            parsing.integrate_tokenization(self, example)
    def train(self, examples):
        knowledge_before = copy.deepcopy(self.knowledge)
@ -26,7 +31,7 @@ class KnowledgeBase(object):
            # Parse everything
            for example in examples:
                # If there's parsed data, leverage it ASAP
-                if 'parsed' in example:
+                if 'parsed' in example and isinstance(example['parsed'], tuple):
                    with session().log('parsed information integration'):
                        result = knowledge_evaluation.integrate_information(self.knowledge, {
                            "parsed": example['parsed'],
@ -35,7 +40,8 @@ class KnowledgeBase(object):
                with session().log("language integration"):
                    tokens, decomposition, inferred_tree = parsing.integrate_language(self, example)
-                    session().annotate(tokens)
+                    session().annotate("Tokens: {}".format(tokens))
                    session().annotate("Inferred tree: {}".format(inferred_tree))
                with session().log("full information integration"):
                    result = knowledge_evaluation.integrate_information(self.knowledge, {
@ -60,11 +66,19 @@ class KnowledgeBase(object):
            return knowledge_diff_getter
-    def process(self, row):
+    def tokenize(self, row, return_one=True):
        row = row.lower()
        with session().log("Tokenize: {}".format(row)):
            options = parsing.to_tokens(self, row)
            if return_one:
                return parsing.pick_one_tokenization(options)
            return options
    def process(self, row):
        knowledge_before = copy.deepcopy(self.knowledge)
        with session().log("Process: {}".format(row)):
-            tokens = parsing.to_tokens(row)
+            tokens = self.tokenize(row)
            fit = parsing.get_fit(self, tokens)
            if fit is None:
                return None
--- a/naive-nlu/tree_nlu/parsing.py
+++ b/naive-nlu/tree_nlu/parsing.py
@ -11,11 +11,105 @@ from functools import reduce
 from typing import List, Dict
 from .modifiable_property import ModifiableProperty
 from . import parameters
 from .atoms import Atom, a
-# TODO: more flexible tokenization
def to_tokens(knowledge_base, text, acc=None):
    '''
    Generate the candidate tokenizations of `text`.

    Every pattern in `knowledge_base.tokenization` is tried against the
    beginning of `text`.  A pattern is a sequence whose elements are either
    `Atom('token')` (a slot for one token) or a literal string that must
    come right after the preceding token.  When a pattern applies but leaves
    text unconsumed, the rest is tokenized recursively and each continuation
    is appended to the tokens found so far.

    Parameters:
      knowledge_base: object exposing `tokenization` (iterable of patterns)
          and `knowledge` (its keys are tried as already-known words).
      text: the string to split.
      acc: unused; kept so existing call sites keep working.

    Yields one list of string tokens per candidate.

    Raises Exception('No tokenization found') when no pattern applies.  As
    this is a generator, the error surfaces while the result is iterated.
    '''
    # TODO This is an extra-naïve implementation
    found = 0
    for tokenization in knowledge_base.tokenization:
        remaining = text
        possibility = []
        for i, token in enumerate(tokenization):
            if token == Atom('token'):
                for thing in knowledge_base.knowledge:
                    # Empty keys are skipped: they would "match" without
                    # consuming any text.
                    if thing and remaining.startswith(thing):
                        # TODO We should also branch here, probably :\
                        remaining = remaining[len(thing):]
                        possibility.append(thing)
                        # Stop at the first known word; this `break` is what
                        # keeps the `else` clause below from running after a
                        # successful match.
                        break
                else:
                    # No known word starts here: take raw text instead.
                    if i + 1 >= len(tokenization):
                        # Last element of the pattern: the token is all
                        # that is left.
                        possibility.append(remaining)
                        remaining = ""
                    else:
                        # Try with (HYPERSIMPLISTIC!) backtracking
                        # Cut using the next token we should use more!!!
                        next_token = tokenization[i + 1]
                        cutoff = remaining.find(next_token)
                        if cutoff < 0:
                            # Leaves the pattern loop: pattern not applicable.
                            break
                        possibility.append(remaining[:cutoff])
                        remaining = remaining[cutoff:]
            elif remaining.startswith(token):
                # The literal has to come immediately after the token.
                remaining = remaining[len(token):]
            else:
                break
        else:
            # Tokenization applicable
            found += 1
            if remaining == '':
                yield possibility
            else:
                for consecuent in to_tokens(knowledge_base, remaining, possibility):
                    yield [piece
                           for piece in possibility + consecuent
                           if piece != '']
    if found == 0:
        raise Exception('No tokenization found')
 def integrate_tokenization(knowledge_base, example):
    '''
    Learn tokenization patterns from one training example.

    `example` must provide 'text' (the raw string) and 'tokens' (the tokens
    it splits into); both are passed on to
    `integrate_token_to_text_matching`, whose result is returned.
    '''
    # An optional 'meaning' entry may be present in the example; it is not
    # used for tokenization, so it is no longer read here.
    return integrate_token_to_text_matching(knowledge_base,
                                            example['text'],
                                            example['tokens'])
 def integrate_token_to_text_matching(knowledge_base, text, tokens):
    '''
    Derive tokenization patterns from a text and its expected tokens.

    Each token is located in the text and swapped for its index, which
    leaves a list mixing token indices with the text found around them.
    For every token a pattern is then stored in
    `knowledge_base.tokenization`: the atom `a('token')` followed by the
    text standing between that token and the next one.

    Raises Exception('Token not found') when a token is in no text piece.
    '''
    fragments = [text]
    # Swap the first occurrence of every token for the token's index
    for token_index, token in enumerate(tokens):
        for position, fragment in enumerate(fragments):
            # Indices mark tokens that were already placed
            if isinstance(fragment, int) or token not in fragment:
                continue
            before, after = fragment.split(token, maxsplit=1)
            fragments = (fragments[:position]
                         + [before, token_index, after]
                         + fragments[position + 1:])
            break
        else:
            raise Exception('Token not found')
    # Splitting leaves empty strings behind; drop them
    fragments = [piece for piece in fragments if piece != '']
    for token_index, _ in enumerate(tokens):
        # Gather the text pieces up to the next token index
        start = fragments.index(token_index) + 1
        end = start
        while end < len(fragments) and not isinstance(fragments[end], int):
            end += 1
        pattern = [a('token')] + fragments[start:end]
        knowledge_base.tokenization.add(tuple(pattern))
 def pick_one_tokenization(options):
    '''
    Heuristic function to pick the most probable tokenization.

    The candidate with the most tokens wins; on a tie the one that comes
    first in `options` is kept.  `options` may be any iterable (a generator
    included) of sized candidates.

    Raises IndexError when `options` is empty.
    '''
    candidates = list(options)
    if not candidates:
        raise IndexError('no tokenization options to pick from')
    # `max` returns the first maximal element, so no full sort is needed.
    return max(candidates, key=len)
 def make_template(knowledge_base, tokens, parsed):
    matcher = list(tokens)
@ -87,7 +181,7 @@ def integrate_language(knowledge_base, example):
    parsed = example["parsed"]
    resolved_parsed = copy.deepcopy(parsed)
-    tokens = to_tokens(text)
+    tokens = list(pick_one_tokenization(to_tokens(knowledge_base, text)))
    while True:
        session().annotate("P: {}".format(resolved_parsed))
--- a/naive-nlu/tree_nlu/test.py
+++ b/naive-nlu/tree_nlu/test.py
@ -1,7 +1,8 @@
 import traceback
 import logging
 import datetime
 from .session import org_mode
 from .tests import tokenization
 from .tests import basic
 from .tests import gac_100
 from .tests import gac_extension
@ -9,6 +10,7 @@ from .tests import gac_extension
 logging.getLogger().setLevel(logging.ERROR)
 tests = (
    ("tokenization", tokenization),
    ("basic", basic),
    ("gac 100", gac_100),
    ("gac+", gac_extension),
@ -24,12 +26,14 @@ def main():
    failed = False
    for test_name, test_module in tests:
        try:
            with org_mode.global_session().log(test_name):
                test_module.main()
            print(" \x1b[1;32m✓\x1b[0m {}".format(test_name))
        except AssertionError as ae:
            print(" \x1b[1;31m✗\x1b[0m {}{}".format(test_name,
                                                  ('\n [Assertion] {}'.format(ae.args[0])) if len(ae.args) > 0
                                                   else ''))
            traceback.print_exc()
            failed = True
        except Exception as e:
--- a/naive-nlu/tree_nlu/tests/basic.py
+++ b/naive-nlu/tree_nlu/tests/basic.py
@ -3,6 +3,7 @@ import json
 from ..knowledge_base import KnowledgeBase
 from ..modifiable_property import is_modifiable_property
 from ..utils.tokenization import train_basic_tokenization
 examples = [
    {
@ -107,6 +108,9 @@ base_knowledge = {
    'swim': {
        "groups": {'verb'},
    },
    'planet': {
        'groups': {'noun'}
    }
 }
 def test_assumption(expectedResponse, knowledge, query):
@ -125,6 +129,8 @@ def main():
        knowledge=base_knowledge,
    )
    train_basic_tokenization(knowledge)
    for example in examples:
        with session().log(example['text']):
            differences = knowledge.train([example])
--- a/naive-nlu/tree_nlu/tests/tokenization.py
+++ b/naive-nlu/tree_nlu/tests/tokenization.py
@ -0,0 +1,67 @@
 from ..session.org_mode import global_session as session
 from ..knowledge_base import KnowledgeBase
 from ..utils.visuals import show_progbar
 from ..visualization import show_knowledge
 def _assert(args):
    '''Fail with an AssertionError unless `args` is truthy.'''
    assert args
 def _assert_msg(args, msg):
    '''Fail with an AssertionError carrying `msg` unless `args` is truthy.'''
    assert args, msg
 # Cases for the tokenization test, as (case_type, example) pairs.
 # main() passes 'example' entries to KnowledgeBase.train_tokenizer and, for
 # 'test' entries, compares KnowledgeBase.tokenize(text) with the 'tokens'
 # list.  Some entries also carry a 'meaning' mapping, which main() itself
 # does not read.
 EXAMPLES = [
    ('example', {
        "text": 'cat',
        "tokens": ['cat'],
    }),
    ('example', {
        "text": 'cats',
        "tokens": ['cats'],
        "meaning": { 'cats': ('add-modifier', 'cat', 'plural') },
    }),
    ('example', {
        "text": 'text separated by spaces',
        "tokens": ['text', 'separated', 'by', 'spaces'],
    }),
    ('test', {
        "text": 'plane',
        "tokens": ['plane'],
    }),
    ('test', {
        "text": 'planes',
        "tokens": ['planes'],
        "meaning": { 'planes': ('add-modifier', 'plane', 'plural') },
    }),
    ('test', {
        "text": 'some other text',
        "tokens": ['some', 'other', 'text'],
    })
 ]
 def main():
    '''
    Run the tokenization cases listed in EXAMPLES, in order.

    'example' cases train the tokenizer of a fresh KnowledgeBase; 'test'
    cases tokenize their text and assert the result equals the expected
    tokens.  Any other case type raises an Exception.  Returns the
    knowledge base once every case has run.
    '''
    knowledge = KnowledgeBase()
    total = len(EXAMPLES)
    for index, (case_type, example) in enumerate(EXAMPLES):
        text = example['text']
        show_progbar(index, total, text)
        # Reject unknown case types before opening a log entry for them
        if case_type not in ('example', 'test'):
            raise Exception('Not implemented case {}'.format(case_type))
        with session().log(text):
            if case_type == 'example':
                knowledge.train_tokenizer(example)
            else:
                tokens = list(knowledge.tokenize(text))
                assert example['tokens'] == tokens
    # Carriage return + "erase line": wipes the progress bar
    print("\r\x1b[K", end='')
    return knowledge
--- a/naive-nlu/tree_nlu/utils/tokenization.py
+++ b/naive-nlu/tree_nlu/utils/tokenization.py
@ -0,0 +1,19 @@
 # Training examples fed to KnowledgeBase.train_tokenizer by
 # train_basic_tokenization().  Each entry is a dict holding a 'text' and
 # the 'tokens' it should be split into (the parentheses around each dict
 # are redundant: the entries are dicts, not tuples).
 BASIC_TOKENIZATION_EXAMPLES = (
    ({
        "text": 'cat',
        "tokens": ['cat'],
    }),
    ({
        "text": 'text separated by spaces',
        "tokens": ['text', 'separated', 'by', 'spaces'],
    }),
    ({
        "text": 'is earth a planet?',
        "tokens": ['is', 'earth', 'a', 'planet', '?'],
    }),
 )
 def train_basic_tokenization(knowledge_base):
    '''Train `knowledge_base`'s tokenizer on every basic example, in order.'''
    for sample in BASIC_TOKENIZATION_EXAMPLES:
        knowledge_base.train_tokenizer(sample)