Dig deeper into cut-by-token approach.

kenkeiras 2018-04-15 17:47:04 +02:00
parent d601ae3f83
commit 998a183fd2
4 changed files with 86 additions and 16 deletions

View File

@@ -75,7 +75,7 @@ class KnowledgeBase(object):
         session().log("Results:\n{}".format('\n'.join(map(str, options))))
         if return_one:
-            chosen = parsing.pick_one_tokenization(options)
+            chosen = parsing.pick_one_tokenization(options, self)
             session().log("Chosen: “{}".format(chosen))
             return chosen
         return options
@@ -92,6 +92,7 @@ class KnowledgeBase(object):
         knowledge_before = copy.deepcopy(self.knowledge)
 
         with session().log("Process: {}".format(row)):
             tokens = self.tokenize(row)
+            print(tokens)
             fit = parsing.get_fit(self, tokens)
             if fit is None:

View File

@@ -13,6 +13,29 @@ from .modifiable_property import ModifiableProperty
 from . import parameters
 from .atoms import Atom, a
 
+def lookahead_for_tokens_or_strucutral_elements(knowledge_base, remaining):
+    for se in knowledge_base.structural_elements:
+        found_position = remaining.find(se)
+        found = found_position >= 0
+        session().annotate('Looking for structure with “{}”, found? {}'.format(se, found))
+        if found:
+            return [
+                (remaining[:found_position], se, remaining[found_position + len(se):])
+            ]
+
+    for token in knowledge_base.knowledge.keys():
+        found_position = remaining.find(token)
+        found = found_position >= 0
+        session().annotate('Looking for token “{}”, found? {}'.format(token, found))
+        if found:
+            return [
+                (remaining[:found_position], token, remaining[found_position + len(token):])
+            ]
+
+    return None
+
+
 def to_tokens(knowledge_base, text, acc=None):
     # TODO This is an extra-naïve implementation
     found = 0
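Editor's note: the new lookahead helper returns at most one split, built around the first structural element found in the remaining text or, failing that, the first known token. A rough standalone sketch of the same search, returning the triple directly instead of a one-element list; the toy knowledge base and vocabulary below are invented for illustration:

class ToyKB:
    # invented stand-in for the real knowledge base
    structural_elements = ['is', 'of']
    knowledge = {'sky': None, 'blue': None}

def lookahead_split(kb, remaining):
    # first matching structural element wins, then known tokens; None if nothing matches
    for pivot in list(kb.structural_elements) + list(kb.knowledge.keys()):
        position = remaining.find(pivot)
        if position >= 0:
            return (remaining[:position], pivot, remaining[position + len(pivot):])
    return None

print(lookahead_split(ToyKB(), 'the sky is blue'))
# ('the sky ', 'is', ' blue')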
@@ -33,10 +56,29 @@ def to_tokens(knowledge_base, text, acc=None):
                 remaining = remaining[len(thing):]
                 possibility.append(thing)
             else:
-                if i + 1 >= len(tokenization):  # Last element
-                    session().annotate("Token not found, considering it all of “{}".format(remaining))
-                    possibility.append(remaining)
-                    remaining = ""
+                if i + 1 >= len(tokenization):  # Last element, lookahead for tokens/structural elements
+                    with session().log("Token not found, looking ahead for splits on “{}".format(remaining)):
+                        # If we start with remaining[0:] it's not a real lookahead
+                        # ... and it can get us trapped on infinite recursion
+                        splits = lookahead_for_tokens_or_strucutral_elements(knowledge_base, remaining[1:])
+
+                        if splits is None:
+                            session().log("No splits found, keeping remaining as token “{}".format(remaining))
+                            possibility.append(remaining)
+                            remaining = ""
+
+                        else:
+                            # Consider we only have one possibility
+                            assert len(splits) == 1
+
+                            before_split, pivot, after_split = splits[0]
+                            before_split = remaining[0] + before_split
+
+                            session().log("1 split found, cutting on token “{}”, keeping “{}".format(found, before_split))
+
+                            possibility.append(before_split)
+                            remaining = pivot + after_split
                 else:  # Not las element, use the next one as cutter
                     # Try with (HYPERSIMPLISTIC!) backtracking
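Editor's note: the lookahead runs on remaining[1:] rather than remaining itself, since a token sitting at position 0 would otherwise be re-found on every recursion and the remaining text would never shrink; the skipped character is glued back onto the prefix before it is stored. A hand-traced example of that cut step, with invented values:

remaining = 'skyis blue'                     # unknown prefix stuck to a known token
splits = [('ky', 'is', ' blue')]             # what the lookahead finds in remaining[1:] == 'kyis blue'
before_split, pivot, after_split = splits[0]
before_split = remaining[0] + before_split   # 's' + 'ky' -> 'sky'
# 'sky' is kept as a freshly cut token and parsing resumes on 'is blue'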
@@ -54,15 +96,17 @@ def to_tokens(knowledge_base, text, acc=None):
                         if remaining.find(token) < 0:  # Not inmediately after!
                             break
                         remaining = remaining[len(token):]
-                        session().annotate("OK, remaining: {}".format(remaining))
+                        session().annotate("OK, remaining: {}” with {} items".format(remaining, len(tokenization) - (i + 1)))
                     else:
                         # Tokenization applicable
                         found += 1
                         if remaining == '':
+                            session().log("Concluded possibility “{}".format(possibility))
                             yield possibility
                         else:
-                            for consecuent in to_tokens(knowledge_base, remaining, possibility):
-                                yield list(filter(lambda x: x != '', possibility + consecuent))
+                            with session().log("Continuing with “{}".format(remaining)):
+                                for consecuent in to_tokens(knowledge_base, remaining, possibility):
+                                    yield list(filter(lambda x: x != '', possibility + consecuent))
     if found == 0:
         raise Exception('No tokenization found')
@@ -108,15 +152,38 @@ def integrate_token_to_text_matching(knowledge_base, text, tokens):
     knowledge_base.add_tokenization(tuple(elements))
 
 
-def pick_one_tokenization(options):
+def pick_one_tokenization(options, knowledge_base):
     '''
     Heuristic function to pick the most probable tokenization.
     Just pick the one with more results.
     '''
-    return sorted(options,
-                  key=lambda tokenization: len(tokenization),
-                  reverse=True)[0]
+    with session().log("Picking among: {} options\n{}".format(len(options), '\n'.join(map(str, options)))):
+        return pick_by_score(options,
+                             [
+                                 # First by number of splits
+                                 lambda tokenization: len(tokenization),
+
+                                 # Among them, by number of splits without structuring elements
+                                 lambda tokenization: sum(map(
+                                     lambda split: -sum(map(
+                                         lambda se: se in split, knowledge_base.structural_elements
+                                     )), tokenization))
+                             ])
+
+
+def pick_by_score(options, heuristics):
+    for heuristic in heuristics:
+        assert(len(options) > 0)
+
+        options = list(map(lambda opt: (heuristic(opt), opt), options))
+        sorted_options = sorted(options, key=lambda x: x[0], reverse=True)
+        heuristic_cutoff = sorted_options[0][0]
+        pass_heuristic = [opt for (score, opt) in sorted_options if score <= heuristic_cutoff]
+        options = pass_heuristic
+
+        session().log("{} finalists: \n{}".format(len(options), '\n'.join(map(str, options))))
+
+    return options[0]
 
 
 def make_template(knowledge_base, tokens, parsed):
     matcher = list(tokens)
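Editor's note: pick_by_score applies the heuristics in order, each one narrowing the candidate list before the next runs, and the first survivor is returned. A minimal standalone sketch of that filtering idea, assuming the intent is to keep only the options tied with the best score at each stage (function name and data are illustrative, not taken from the repository):

def pick_by_score_sketch(options, heuristics):
    for heuristic in heuristics:
        assert len(options) > 0
        best = max(heuristic(opt) for opt in options)
        options = [opt for opt in options if heuristic(opt) == best]
    return options[0]

# More splits win first; remaining ties would be decided by later heuristics.
print(pick_by_score_sketch(
    [['a', 'plane'], ['a', 'plane', 'flies']],
    [len]))
# ['a', 'plane', 'flies']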
@@ -188,7 +255,7 @@ def integrate_language(knowledge_base, example):
     parsed = example["parsed"]
     resolved_parsed = copy.deepcopy(parsed)
 
-    tokens = list(pick_one_tokenization(to_tokens(knowledge_base, text)))
+    tokens = list(pick_one_tokenization(to_tokens(knowledge_base, text), knowledge_base))
     while True:
         session().annotate("P: {}".format(resolved_parsed))

View File

@@ -11,9 +11,9 @@ logging.getLogger().setLevel(logging.ERROR)
 tests = (
     ("tokenization", tokenization),
-    ("basic", basic),
-    ("gac 100", gac_100),
-    ("gac+", gac_extension),
+    # ("basic", basic),
+    # ("gac 100", gac_100),
+    # ("gac+", gac_extension),
 )

View File

@@ -65,6 +65,8 @@ def main():
         with session().log(example['text']):
             tokens = list(knowledge.tokenize(example['text']))
 
+            print(tokens)
+            print(example['tokens'])
             assert example['tokens'] == tokens
     else: