Move to a chaining model for tokenization.

This model also explores more tokenization possibilities.
With this, the tokenization tests pass.
kenkeiras 2018-04-15 20:06:21 +02:00
parent 998a183fd2
commit 79034f85a9
3 changed files with 153 additions and 97 deletions
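
In rough terms, the chaining model below records, for each token seen during training, which tokens have directly followed it, and then drives tokenization by asking what may come after the previous match, rather than replaying whole stored tokenization patterns. A minimal standalone sketch of that idea (names here are illustrative, not the repository's API):

token_chains = {}  # precedent token -> tokens observed right after it

def learn_pair(precedent, consequent):
    token_chains.setdefault(precedent, []).append(consequent)

def candidates_after(precedent):
    # Unknown precedent: every known consequent is a candidate,
    # which is what lets the tokenizer explore more possibilities.
    if precedent not in token_chains:
        return {c for chain in token_chains.values() for c in chain}
    return set(token_chains[precedent])

learn_pair('the', 'dog')
learn_pair('the', 'cat')
print(candidates_after('the'))  # {'dog', 'cat'}
print(candidates_after('un'))   # falls back to all known tokens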

View File

@@ -8,6 +8,15 @@ from collections import namedtuple
 Atom = namedtuple('Atom', field_names='name')

+def is_atom(element, name=None):
+    '''Check if an element is an atom with a specific name.'''
+    if not isinstance(element, Atom):
+        return False
+
+    if name is None:
+        return True
+
+    return element.name == name

 def a(name):
     '''Build an atom with a given name.'''

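The new is_atom helper replaces direct isinstance checks in the parser below. A minimal standalone sketch of how it behaves; the hunk cuts off before the body of a(name), so its one-line return here is an assumption:

from collections import namedtuple

Atom = namedtuple('Atom', field_names='name')

def is_atom(element, name=None):
    '''Check if an element is an atom with a specific name.'''
    if not isinstance(element, Atom):
        return False
    if name is None:
        return True
    return element.name == name

def a(name):
    '''Build an atom with a given name.'''
    return Atom(name)  # assumed body; the hunk ends at the docstring

print(is_atom(a('token')))           # True: any Atom matches when no name is given
print(is_atom(a('token'), 'token'))  # True: the name matches too
print(is_atom('token', 'token'))     # False: a plain string is not an Atom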
View File

@@ -7,25 +7,69 @@ from .atoms import Atom
 from . import parsing
 from . import knowledge_evaluation

 from .modifiable_property import is_modifiable_property
+import random


 def diff_knowledge(before, after):
     import jsondiff
     return jsondiff.diff(before, after)


+def randomized_weighted_list(elements):
+    # Randomized
+    randomized = list(elements)
+    random.shuffle(randomized)
+
+    # And return only once
+    already_returned = set()
+    for e in randomized:
+        if e in already_returned:
+            continue
+        yield e
+        already_returned.add(e)
+
+
 class KnowledgeBase(object):
     def __init__(self, knowledge={}, examples=[], trained=[]):
         self.knowledge = copy.copy(knowledge)
         self.originals = []
         self.examples = copy.copy(examples)
         self.trained = copy.copy(trained)
-        self.tokenization = set()
         self.structural_elements = set()
+        self.token_chains = {}
+        self.tokens = set()
+
+    def add_token_pair(self, precedent, consequent):
+        self.add_token(precedent)
+        self.add_token(consequent)
+
+        if precedent not in self.token_chains:
+            self.token_chains[precedent] = []
+        self.token_chains[precedent].append(consequent)
+
+    def add_token(self, token):
+        self.tokens.add(token)
+        if (not isinstance(token, Atom)) and (token not in self.structural_elements):
+            session().annotate('Found new structural element “{}”'.format(token))
+            self.structural_elements.add(token)
+
+    def expected_token_after_precedent(self, precedent=None):
+        if precedent not in self.token_chains:  # If there's no known precedent, just return all tokens
+            return randomized_weighted_list(self.tokens)
+        return randomized_weighted_list(self.token_chains[precedent])

     def train_tokenizer(self, example):
-        with session().log('Train'):
-            parsing.integrate_tokenization(self, example)
+        with session().log('Training tokenizer'):
+            session().annotate("Example: {}".format(example))
+            tokens = parsing.integrate_tokenization(self, example)
+
+            # Integrate knowledge of concept
+            for token in tokens:
+                if token not in self.knowledge:
+                    self.knowledge[token] = {}

     def train(self, examples):
         knowledge_before = copy.deepcopy(self.knowledge)
@@ -80,14 +124,6 @@ class KnowledgeBase(object):
                 return chosen
         return options

-    def add_tokenization(self, tokenization):
-        with session().log('Added tokenization: “{}”'.format(tokenization)):
-            self.tokenization.add(tokenization)
-            for e in tokenization:
-                if (not isinstance(e, Atom)) and (e not in self.structural_elements):
-                    session().annotate('Found new structural element “{}”'.format(e))
-                    self.structural_elements.add(e)
-
     def process(self, row):
         knowledge_before = copy.deepcopy(self.knowledge)
         with session().log("Process: {}".format(row)):

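Stripped of logging, the chain-tracking trio above (add_token_pair, add_token, expected_token_after_precedent) can be exercised on its own. This sketch reuses the diff's method names on a reduced stand-in class to show the intended behaviour:

import random

class ChainStore:
    '''Reduced stand-in for KnowledgeBase: chain tracking only.'''
    def __init__(self):
        self.token_chains = {}
        self.tokens = set()

    def add_token_pair(self, precedent, consequent):
        self.tokens.update((precedent, consequent))
        self.token_chains.setdefault(precedent, []).append(consequent)

    def expected_token_after_precedent(self, precedent=None):
        # Unknown precedent: fall back to every known token.
        options = self.token_chains.get(precedent, self.tokens)
        randomized = list(options)
        random.shuffle(randomized)  # random order...
        seen = set()
        for e in randomized:        # ...but each element yielded at most once
            if e not in seen:
                seen.add(e)
                yield e

store = ChainStore()
store.add_token_pair('is', 'a')
store.add_token_pair('is', 'a')  # repeats accumulate in the chain...
print(list(store.expected_token_after_precedent('is')))   # ...but are deduplicated on query: ['a']
print(sorted(store.expected_token_after_precedent('?')))  # unknown precedent: ['a', 'is']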
View File

@@ -11,7 +11,7 @@ from functools import reduce
 from typing import List, Dict

 from .modifiable_property import ModifiableProperty
 from . import parameters
-from .atoms import Atom, a
+from .atoms import Atom, a, is_atom

 def lookahead_for_tokens_or_strucutral_elements(knowledge_base, remaining):
     for se in knowledge_base.structural_elements:
@@ -36,79 +36,84 @@ def lookahead_for_tokens_or_strucutral_elements(knowledge_base, remaining):
-def to_tokens(knowledge_base, text, acc=None):
-    # TODO This is an extra-naïve implementation
-    found = 0
-
-    for tokenization in knowledge_base.tokenization:
-        with session().log("Tokenization {}".format(tokenization)):
-            remaining = text
-            possibility = []
-
-            # Apply tokenization to all elements
-            for i, token in enumerate(tokenization):
-                with session().log("T “{}” over “{}”".format(token, remaining)):
-                    if token == Atom('token'):
-                        for thing in knowledge_base.knowledge.keys():
-                            session().annotate("Testing with “{}”".format(thing))
-                            if remaining.startswith(thing):
-                                # TODO We should also branch here, probably :\
-                                remaining = remaining[len(thing):]
-                                possibility.append(thing)
-                        else:
-                            if i + 1 >= len(tokenization):  # Last element, lookahead for tokens/structural elements
-                                with session().log("Token not found, looking ahead for splits on “{}”".format(remaining)):
-                                    # If we start with remaining[0:] it's not a real lookahead
-                                    # ... and it can get us trapped on infinite recursion
-                                    splits = lookahead_for_tokens_or_strucutral_elements(knowledge_base, remaining[1:])
-
-                                    if splits is None:
-                                        session().log("No splits found, keeping remaining as token “{}”".format(remaining))
-                                        possibility.append(remaining)
-                                        remaining = ""
-                                    else:
-                                        # Consider we only have one possibility
-                                        assert len(splits) == 1
-                                        before_split, pivot, after_split = splits[0]
-                                        before_split = remaining[0] + before_split
-                                        session().log("1 split found, cutting on token “{}”, keeping “{}”".format(found, before_split))
-                                        possibility.append(before_split)
-                                        remaining = pivot + after_split
-                            else:  # Not last element, use the next one as cutter
-                                # Try with (HYPERSIMPLISTIC!) backtracking
-                                # Cut using the next token we should use more!!!
-                                next_token = tokenization[i + 1]
-                                session().annotate("Trying to cut for next token on “{}”".format(next_token))
-
-                                cutoff = remaining.find(next_token)
-                                if cutoff < 0:
-                                    break
-
-                                possibility.append(remaining[:cutoff])
-                                remaining = remaining[cutoff:]
-                    else:
-                        if remaining.find(token) < 0:  # Not immediately after!
-                            break
-                        remaining = remaining[len(token):]
-                        session().annotate("OK, remaining: “{}” with {} items".format(remaining, len(tokenization) - (i + 1)))
-            else:
-                # Tokenization applicable
-                found += 1
-                if remaining == '':
-                    session().log("Concluded possibility “{}”".format(possibility))
-                    yield possibility
-                else:
-                    with session().log("Continuing with “{}”".format(remaining)):
-                        for consecuent in to_tokens(knowledge_base, remaining, possibility):
-                            yield list(filter(lambda x: x != '', possibility + consecuent))
-
-    if found == 0:
-        raise Exception('No tokenization found')
+def to_tokens(knowledge_base, text, precedent=None):
+    if len(text) == 0:
+        session().annotate("No text remaining")
+        yield ['']
+        return
+
+    with session().log("Tokenizing {}".format(text)):
+        for option in knowledge_base.expected_token_after_precedent(precedent):
+            with session().log("Next: “{}”".format(option)):
+                with session().log("Matching “{}” on “{}”".format(option, text)):
+                    for token_match in tokenization_match(option, text, knowledge_base):
+                        if token_match is None:
+                            session().annotate("No match")
+                            continue
+
+                        match, remaining = token_match
+                        if len(remaining) == len(text):
+                            raise Exception('No text consumed in match')
+
+                        session().annotate('Match: “{}”'.format(match))
+                        with session().log('Remaining “{}”'.format(remaining)):
+                            for sublevel in to_tokens(knowledge_base, remaining, match):
+                                candidate = list(filter(lambda x: x != '', [match] + sublevel))
+                                session().annotate('Yielding candidate “{}”'.format(candidate))
+                                yield candidate
+
+
+def tokenization_match(element, text, knowledge_base):
+    # Constant/structural string matching
+    if isinstance(element, str):
+        if text.find(element) == 0:
+            # This match comes from a structuring element
+            # It doesn't appear on the tokenization
+            # So we should return it as an empty string
+            yield ('', text[len(element):])
+            return
+        else:
+            # No match found
+            return
+
+    elif is_atom(element, 'token'):
+        yield from match_single_token(text, knowledge_base)
+        return
+
+    raise NotImplementedError()
+
+
+def match_single_token(text, knowledge_base):
+    found_token = False
+    for token in knowledge_base.knowledge.keys():
+        if text.find(token) == 0:
+            yield token, text[len(token):]
+            found_token = True
+
+    if found_token:
+        return
+
+    session().annotate('No token found at the start of ”{}”'.format(text))
+    session().annotate('using structural elements to infer it')
+    # TODO: review this when multiple structural elements are available
+    for se in knowledge_base.structural_elements:
+        session().annotate('Looking for se “{}” in “{}”'.format(se, text))
+        position = text.find(se, 0)
+        found = position > 0  # 0 is not considered a valid position for this kind of split
+        if found:
+            session().annotate('Found ”{}”, inferring “{}”'.format(se, text[:position]))
+            yield text[:position], text[position:]

+    session().annotate('No structural element or token found, inferring only token remaining')
+    yield text, ''
+
+    # Using other tokens for cutoff
+    for token in knowledge_base.knowledge.keys():
+        session().annotate('Looking for token “{}” in “{}”'.format(token, text))
+        position = text.find(token)
+        found = position >= 0
+        if found:
+            session().annotate('Found ”{}”, in position ”{}”'.format(token, position))
+            yield text[:position], text[position:]


 def integrate_tokenization(knowledge_base, example):
     text = example['text']
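
Because the new to_tokens is a recursive generator chained on the previous match, every branch that consumes some prefix of the text gets explored, which is where the extra tokenization possibilities come from. A toy standalone version of that search pattern, with no knowledge base or structural elements:

def toy_to_tokens(known_tokens, text):
    if not text:
        yield []
        return
    for token in known_tokens:
        if text.startswith(token):
            # Each match branches the search; the unconsumed text is
            # tokenized recursively, chaining on the current match.
            for rest in toy_to_tokens(known_tokens, text[len(token):]):
                yield [token] + rest

print(list(toy_to_tokens({'a', 'ab', 'b'}, 'abab')))
# Four complete tokenizations, e.g. ['a', 'b', 'a', 'b'] and ['ab', 'ab']
# (order varies with set iteration)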
@@ -131,7 +136,7 @@ def integrate_token_to_text_matching(knowledge_base, text, tokens):
             if token in text:
                 before, after = text.split(token, maxsplit=1)
                 texts = (texts[:i] + [before]
-                         + [token_id]
+                         + [a('token')]
                          + [after] + texts[i + 1:])
                 break
         else:
@@ -139,18 +144,16 @@ def integrate_token_to_text_matching(knowledge_base, text, tokens):
     # Remove leftovers from splits
     texts = list(filter(lambda x: x != '', texts))

     session().log("Tokenized as {} over {}".format(texts, tokens))

-    for token_id, _token in enumerate(tokens):
-        # Find all elements between current token and next token
-        i = texts.index(token_id)
-        elements = [a('token')]
-
-        i += 1
-        while i < len(texts) and not isinstance(texts[i], int):
-            elements.append(texts[i])
-            i += 1
-
-        knowledge_base.add_tokenization(tuple(elements))
+    for i, element in enumerate(texts[:-1]):
+        learn_token_pair(element, texts[i + 1], knowledge_base)
+
+    return tokens
+
+
+def learn_token_pair(precedent, consequent, knowledge_base):
+    knowledge_base.add_token_pair(precedent, consequent)


 def pick_one_tokenization(options, knowledge_base):
     '''
@@ -158,26 +161,34 @@ def pick_one_tokenization(options, knowledge_base):
     Just pick the one with more results.
     '''
     options = list(options)
-    return pick_by_score(options,
-                         [
-                             # First by number of splits
-                             lambda tokenization: len(tokenization),
-
-                             # Among them, by number of splits without structuring elements
-                             lambda tokenization: sum(map(
-                                 lambda split: -sum(map(
-                                     lambda se: se in split, knowledge_base.structural_elements
-                                 )), tokenization))
-                         ])
+    with session().log("Picking among: {} options\n{}".format(len(options), '\n'.join(map(str, options)))):
+        return pick_by_score(options,
+                             [
+                                 # By number of splits without structuring elements
+                                 lambda tokenization: sum(map(
+                                     lambda split: sum(map(
+                                         lambda se: se in split, knowledge_base.structural_elements
+                                     )), tokenization)),
+
+                                 # By number of unknown tokens
+                                 lambda tokenization: len(list(filter(lambda token:
+                                                                      (token not in knowledge_base.knowledge.keys()) and
+                                                                      (token not in knowledge_base.structural_elements),
+                                                                      tokenization))),
+
+                                 # By number of splits
+                                 lambda tokenization: -len(tokenization),
+                             ])


 def pick_by_score(options, heuristics):
     for heuristic in heuristics:
         assert(len(options) > 0)
         options = list(map(lambda opt: (heuristic(opt), opt), options))
-        sorted_options = sorted(options, key=lambda x: x[0], reverse=True)
+        sorted_options = sorted(options, key=lambda x: x[0], reverse=False)

         heuristic_cutoff = sorted_options[0][0]
         session().annotate(sorted_options)
         pass_heuristic = [opt for (score, opt) in sorted_options if score <= heuristic_cutoff]
         options = pass_heuristic
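
With the sort flipped to ascending, pick_by_score acts as a lexicographic filter: each heuristic keeps only the options tied for the lowest score and hands the survivors to the next one, so lower now means better, which is why the split count is negated above. A standalone sketch of that mechanism with simplified toy heuristics; the final selection line is an assumption, since the hunk ends before the function returns:

def pick_by_score(options, heuristics):
    for heuristic in heuristics:
        assert len(options) > 0
        scored = sorted(((heuristic(opt), opt) for opt in options),
                        key=lambda pair: pair[0])  # ascending: lowest first
        cutoff = scored[0][0]
        # Keep everything tied with the best score; later heuristics break ties.
        options = [opt for score, opt in scored if score <= cutoff]
    return options[0]  # assumed: pick any surviving option

options = [['el', 'perro'], ['el', 'per', 'ro'], ['elperro']]
best = pick_by_score(options, [
    lambda t: sum(' ' in token for token in t),  # fewer structural elements first (toy)
    lambda t: -len(t),                           # then prefer more splits
])
print(best)  # ['el', 'per', 'ro']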