Add (non-passing) tokenization.

kenkeiras 2018-04-01 20:24:09 +02:00
parent 75174e1736
commit fc37450565
7 changed files with 229 additions and 11 deletions
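In short: KnowledgeBase gains a trainable tokenizer with two entry points, train_tokenizer and tokenize. A minimal usage sketch, mirroring tests/tokenization.py below (illustrative only; the flat import path is an assumption, in-tree the imports are relative):

from knowledge_base import KnowledgeBase  # import path assumed

kb = KnowledgeBase()
# Teach the tokenizer by example: a text plus its expected token list.
kb.train_tokenizer({
    'text': 'text separated by spaces',
    'tokens': ['text', 'separated', 'by', 'spaces'],
})
# Then tokenize new text; the option with the most tokens is returned.
tokens = kb.tokenize('some other text')  # intended: ['some', 'other', 'text']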

atoms.py (new file)

@@ -0,0 +1,14 @@
+'''
+Analogous to Erlang atoms.
+
+"An atom is a literal, a constant with name."
+'''
+from collections import namedtuple
+
+Atom = namedtuple('Atom', field_names='name')
+
+
+def a(name):
+    '''Build an atom with a given name.'''
+    return Atom(name)
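Since Atom is a namedtuple, atoms compare and hash by value, which is what lets parsing.to_tokens below check token == Atom('token') and store patterns in a set. A quick illustration (not part of the commit):

from collections import namedtuple

Atom = namedtuple('Atom', field_names='name')

assert Atom('token') == Atom(name='token')  # value equality, like Erlang atoms
assert Atom('token') in {Atom('token')}     # hashable, so patterns fit in sets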

knowledge_base.py

@@ -14,11 +14,16 @@ def diff_knowledge(before, after):

 class KnowledgeBase(object):
-    def __init__(self, knowledge, examples=[], trained=[]):
+    def __init__(self, knowledge={}, examples=[], trained=[]):
         self.knowledge = copy.copy(knowledge)
         self.originals = []
         self.examples = copy.copy(examples)
         self.trained = copy.copy(trained)
+        self.tokenization = set()
+
+    def train_tokenizer(self, example):
+        with session().log('Train'):
+            parsing.integrate_tokenization(self, example)

     def train(self, examples):
         knowledge_before = copy.deepcopy(self.knowledge)
@@ -26,7 +31,7 @@ class KnowledgeBase(object):
         # Parse everything
         for example in examples:
             # If there's parsed data, leverage it ASAP
-            if 'parsed' in example:
+            if 'parsed' in example and isinstance(example['parsed'], tuple):
                 with session().log('parsed information integration'):
                     result = knowledge_evaluation.integrate_information(self.knowledge, {
                         "parsed": example['parsed'],
@@ -35,7 +40,8 @@ class KnowledgeBase(object):
             with session().log("language integration"):
                 tokens, decomposition, inferred_tree = parsing.integrate_language(self, example)
-                session().annotate(tokens)
+                session().annotate("Tokens: {}".format(tokens))
+                session().annotate("Inferred tree: {}".format(inferred_tree))

             with session().log("full information integration"):
                 result = knowledge_evaluation.integrate_information(self.knowledge, {
@@ -60,11 +66,19 @@ class KnowledgeBase(object):

         return knowledge_diff_getter

-    def process(self, row):
+    def tokenize(self, row, return_one=True):
+        row = row.lower()
+        with session().log("Tokenize: {}".format(row)):
+            options = parsing.to_tokens(self, row)
+            if return_one:
+                return parsing.pick_one_tokenization(options)
+            return options
+
+    def process(self, row):
         knowledge_before = copy.deepcopy(self.knowledge)
         with session().log("Process: {}".format(row)):
-            tokens = parsing.to_tokens(row)
+            tokens = self.tokenize(row)
             fit = parsing.get_fit(self, tokens)
             if fit is None:
                 return None
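A side note on the new knowledge={} default: mutable default arguments are shared across calls in Python. The copy.copy() calls in __init__ already keep instances from aliasing the shared defaults, so this is not a bug here; the more defensive idiom would be (sketch only, same behavior):

def __init__(self, knowledge=None, examples=None, trained=None):
    self.knowledge = copy.copy(knowledge) if knowledge is not None else {}
    self.originals = []
    self.examples = copy.copy(examples) if examples is not None else []
    self.trained = copy.copy(trained) if trained is not None else []
    self.tokenization = set()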

parsing.py

@@ -11,11 +11,105 @@ from functools import reduce
 from typing import List, Dict

 from .modifiable_property import ModifiableProperty
 from . import parameters
+from .atoms import Atom, a

-# TODO: more flexible tokenization
-def to_tokens(text):
-    return re.findall(r'(\w+|[^\s])', text)
+def to_tokens(knowledge_base, text, acc=None):
+    # TODO: This is an extra-naïve implementation
+    found = 0
+
+    for tokenization in knowledge_base.tokenization:
+        remaining = text
+        possibility = []
+
+        for i, token in enumerate(tokenization):
+            if token == Atom('token'):
+                for thing in knowledge_base.knowledge.keys():
+                    if remaining.startswith(thing):
+                        # TODO: We should also branch here, probably :\
+                        remaining = remaining[len(thing):]
+                        possibility.append(thing)
+                else:
+                    if i + 1 >= len(tokenization):  # Last token in the pattern
+                        possibility.append(remaining)
+                        remaining = ""
+                    else:
+                        # Try with (HYPERSIMPLISTIC!) backtracking
+                        # Cut using the next token we should use more!!!
+                        next_token = tokenization[i + 1]
+                        cutoff = remaining.find(next_token)
+                        if cutoff < 0:
+                            break
+
+                        possibility.append(remaining[:cutoff])
+                        remaining = remaining[cutoff:]
+            else:
+                if remaining.find(token) < 0:  # Not immediately after!
+                    break
+                remaining = remaining[len(token):]
+        else:
+            # Tokenization applicable
+            found += 1
+            if remaining == '':
+                yield possibility
+            else:
+                for consecuent in to_tokens(knowledge_base, remaining, possibility):
+                    yield list(filter(lambda x: x != '', possibility + consecuent))
+
+    if found == 0:
+        raise Exception('No tokenization found')
+
+
+def integrate_tokenization(knowledge_base, example):
+    text = example['text']
+    tokens = example['tokens']
+    meaning = example.get('meaning')
+
+    return integrate_token_to_text_matching(knowledge_base, text, tokens)
+
+
+def integrate_token_to_text_matching(knowledge_base, text, tokens):
+    texts = [text]
+
+    # Convert to tokens
+    for token_id, token in enumerate(tokens):
+        # Look for token in texts
+        for i, text in enumerate(texts):
+            if isinstance(text, int):
+                continue
+
+            if token in text:
+                before, after = text.split(token, maxsplit=1)
+                texts = (texts[:i] + [before]
+                         + [token_id]
+                         + [after] + texts[i + 1:])
+                break
+        else:
+            raise Exception('Token not found')
+
+    # Remove leftovers from splits
+    texts = list(filter(lambda x: x != '', texts))
+
+    for token_id, _token in enumerate(tokens):
+        # Find all elements between the current token and the next one
+        i = texts.index(token_id)
+        elements = [a('token')]
+
+        i += 1
+        while i < len(texts) and not isinstance(texts[i], int):
+            elements.append(texts[i])
+            i += 1
+
+        knowledge_base.tokenization.add(tuple(elements))
+
+
+def pick_one_tokenization(options):
+    '''
+    Heuristic function to pick the most probable tokenization.
+
+    Just pick the one with the most results.
+    '''
+    return sorted(options,
+                  key=lambda tokenization: len(tokenization),
+                  reverse=True)[0]
+
+
 def make_template(knowledge_base, tokens, parsed):
     matcher = list(tokens)
@@ -87,7 +181,7 @@ def integrate_language(knowledge_base, example):
     parsed = example["parsed"]

     resolved_parsed = copy.deepcopy(parsed)
-    tokens = to_tokens(text)
+    tokens = list(pick_one_tokenization(to_tokens(knowledge_base, text)))

     while True:
         session().annotate("P: {}".format(resolved_parsed))
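To make the new machinery concrete, here is a worked trace (not part of the commit) of what integrate_token_to_text_matching learns from one of the training examples used below, and how to_tokens is meant to replay it:

# text   = 'is earth a planet?'
# tokens = ['is', 'earth', 'a', 'planet', '?']
#
# Splitting the text around each token (token ids 0..4) and filtering the
# empty leftovers gives:
#     [0, ' ', 1, ' ', 2, ' ', 3, 4]
# i.e. tokens 0-2 are each followed by a space, while 3 and 4 are adjacent.
# Two patterns therefore end up in knowledge_base.tokenization:
#     (a('token'), ' ')   # a token delimited by a following space
#     (a('token'),)       # a bare token, nothing before the next one
#
# to_tokens() replays these patterns on new text: (a('token'), ' ') cuts at
# the next space and recurses on the remainder, so 'some other text' is
# intended to yield ['some', 'other', 'text'].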

test.py

@@ -1,7 +1,8 @@
 import traceback
 import logging
 import datetime

 from .session import org_mode
+from .tests import tokenization
 from .tests import basic
 from .tests import gac_100
 from .tests import gac_extension
@@ -9,6 +10,7 @@ from .tests import gac_extension
 logging.getLogger().setLevel(logging.ERROR)

 tests = (
+    ("tokenization", tokenization),
     ("basic", basic),
     ("gac 100", gac_100),
     ("gac+", gac_extension),
@@ -24,12 +26,14 @@ def main():
     failed = False
     for test_name, test_module in tests:
         try:
-            test_module.main()
+            with org_mode.global_session().log(test_name):
+                test_module.main()
             print(" \x1b[1;32m✓\x1b[0m {}".format(test_name))
         except AssertionError as ae:
             print(" \x1b[1;31m✗\x1b[0m {}{}".format(test_name,
                   ('\n [Assertion] {}'.format(ae.args[0])) if len(ae.args) > 0
                   else ''))
+            traceback.print_exc()
             failed = True
         except Exception as e:

tests/basic.py

@@ -3,6 +3,7 @@ import json

 from ..knowledge_base import KnowledgeBase
 from ..modifiable_property import is_modifiable_property
+from ..utils.tokenization import train_basic_tokenization

 examples = [
     {
@@ -107,6 +108,9 @@ base_knowledge = {
     'swim': {
         "groups": {'verb'},
     },
+    'planet': {
+        'groups': {'noun'}
+    }
 }

 def test_assumption(expectedResponse, knowledge, query):
@@ -125,6 +129,8 @@ def main():
         knowledge=base_knowledge,
     )

+    train_basic_tokenization(knowledge)
+
     for example in examples:
         with session().log(example['text']):
             differences = knowledge.train([example])
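Note the ordering this hunk establishes: tokenization has to be trained before train()/process(), because to_tokens now raises 'No tokenization found' when the pattern set is empty. The required call order, using the names from this file (sketch):

knowledge = KnowledgeBase(knowledge=base_knowledge)
train_basic_tokenization(knowledge)  # must come first: seeds knowledge.tokenization
knowledge.train(examples)            # only now can the examples be tokenized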

tests/tokenization.py (new file)

@@ -0,0 +1,67 @@
+from ..session.org_mode import global_session as session
+from ..knowledge_base import KnowledgeBase
+from ..utils.visuals import show_progbar
+from ..visualization import show_knowledge
+
+
+def _assert(args):
+    assert(args)
+
+
+def _assert_msg(args, msg):
+    assert args, msg
+
+
+EXAMPLES = [
+    ('example', {
+        "text": 'cat',
+        "tokens": ['cat'],
+    }),
+    ('example', {
+        "text": 'cats',
+        "tokens": ['cats'],
+        "meaning": {'cats': ('add-modifier', 'cat', 'plural')},
+    }),
+    ('example', {
+        "text": 'text separated by spaces',
+        "tokens": ['text', 'separated', 'by', 'spaces'],
+    }),
+    ('test', {
+        "text": 'plane',
+        "tokens": ['plane'],
+    }),
+    ('test', {
+        "text": 'planes',
+        "tokens": ['planes'],
+        "meaning": {'planes': ('add-modifier', 'plane', 'plural')},
+    }),
+    ('test', {
+        "text": 'some other text',
+        "tokens": ['some', 'other', 'text'],
+    })
+]
+
+
+def main():
+    knowledge = KnowledgeBase()
+
+    total = len(EXAMPLES)
+
+    for i, (case_type, example) in enumerate(EXAMPLES):
+        show_progbar(i, total, example['text'])
+        if case_type == 'example':
+            with session().log(example['text']):
+                knowledge.train_tokenizer(example)
+
+        elif case_type == 'test':
+            with session().log(example['text']):
+                tokens = list(knowledge.tokenize(example['text']))
+                assert example['tokens'] == tokens
+
+        else:
+            raise Exception('Not implemented case {}'.format(case_type))
+
+    print("\r\x1b[K", end='')
+    return knowledge
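The tokenize() calls above go through pick_one_tokenization, which simply prefers the candidate with the most tokens; since to_tokens is a generator, sorted drains every candidate first. A two-line illustration (not from the commit):

options = [['some other text'], ['some', 'other', 'text']]
assert sorted(options, key=len, reverse=True)[0] == ['some', 'other', 'text']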

utils/tokenization.py (new file)

@@ -0,0 +1,19 @@
+BASIC_TOKENIZATION_EXAMPLES = (
+    ({
+        "text": 'cat',
+        "tokens": ['cat'],
+    }),
+    ({
+        "text": 'text separated by spaces',
+        "tokens": ['text', 'separated', 'by', 'spaces'],
+    }),
+    ({
+        "text": 'is earth a planet?',
+        "tokens": ['is', 'earth', 'a', 'planet', '?'],
+    }),
+)
+
+
+def train_basic_tokenization(knowledge_base):
+    for example in BASIC_TOKENIZATION_EXAMPLES:
+        knowledge_base.train_tokenizer(example)
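Putting the pieces together, end to end (sketch; imports assumed, and since the commit title says "non-passing", the last line shows the intended result rather than guaranteed current behavior):

kb = KnowledgeBase()
train_basic_tokenization(kb)
print(kb.tokenize('is earth a planet?'))
# intended: ['is', 'earth', 'a', 'planet', '?']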