Dig deeper into the cut-by-token approach.

This commit is contained in:
kenkeiras 2018-04-15 17:47:04 +02:00
parent d601ae3f83
commit 998a183fd2
4 changed files with 86 additions and 16 deletions
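For context: the “cut-by-token” approach named in the commit message splits free text by cutting it wherever an already-known token or structural element occurs, keeping the unknown remainder around the cut as candidate tokens. A minimal sketch of the idea, using hypothetical names and data rather than this repository's API:

def cut_at_first_known(text, known_tokens):
    # Return (before, token, after) for the first known token found in
    # `text`, or None when nothing known occurs in it.
    for token in known_tokens:
        position = text.find(token)
        if position >= 0:
            return text[:position], token, text[position + len(token):]
    return None

# cut_at_first_known("the cat eats fish", ["eats"])
#   -> ("the cat ", "eats", " fish")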

View File

@@ -75,7 +75,7 @@ class KnowledgeBase(object):
session().log("Results:\n{}".format('\n'.join(map(str, options))))
if return_one:
chosen = parsing.pick_one_tokenization(options)
chosen = parsing.pick_one_tokenization(options, self)
session().log("Chosen: “{}”".format(chosen))
return chosen
return options
@@ -92,6 +92,7 @@ class KnowledgeBase(object):
knowledge_before = copy.deepcopy(self.knowledge)
with session().log("Process: {}".format(row)):
tokens = self.tokenize(row)
print(tokens)
fit = parsing.get_fit(self, tokens)
if fit is None:

View File

@@ -13,6 +13,29 @@ from .modifiable_property import ModifiableProperty
from . import parameters
from .atoms import Atom, a
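# Example for the function below (hypothetical data): with
# knowledge_base.structural_elements containing 'is' and
# remaining = 'a plane is a vehicle', the first loop matches 'is' and the
# function returns [('a plane ', 'is', ' a vehicle')]; if neither loop
# finds anything, it returns None.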
def lookahead_for_tokens_or_strucutral_elements(knowledge_base, remaining):
for se in knowledge_base.structural_elements:
found_position = remaining.find(se)
found = found_position >= 0
session().annotate('Looking for structure with “{}”, found? {}'.format(se, found))
if found:
return [
(remaining[:found_position], se, remaining[found_position + len(se):])
]
for token in knowledge_base.knowledge.keys():
found_position = remaining.find(token)
found = found_position >= 0
session().annotate('Looking for token “{}”, found? {}'.format(token, found))
if found:
return [
(remaining[:found_position], token, remaining[found_position + len(token):])
]
return None
def to_tokens(knowledge_base, text, acc=None):
# TODO This is an extra-naïve implementation
found = 0
@@ -33,10 +56,29 @@ def to_tokens(knowledge_base, text, acc=None):
remaining = remaining[len(thing):]
possibility.append(thing)
else:
if i + 1 >= len(tokenization): # Last element
session().annotate("Token not found, considering it all of “{}”".format(remaining))
possibility.append(remaining)
remaining = ""
if i + 1 >= len(tokenization): # Last element, lookahead for tokens/structural elements
with session().log("Token not found, looking ahead for splits on “{}”".format(remaining)):
# If we start with remaining[0:] it's not a real lookahead
# ... and it can get us trapped in infinite recursion
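# (Searching from remaining[1:] guarantees that at least the first character
# is consumed into before_split, so the recursive to_tokens call below never
# sees the same `remaining` again.)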
splits = lookahead_for_tokens_or_strucutral_elements(knowledge_base, remaining[1:])
if splits is None:
session().log("No splits found, keeping remaining as token “{}”".format(remaining))
possibility.append(remaining)
remaining = ""
else:
# Assume there is only one possibility
assert len(splits) == 1
before_split, pivot, after_split = splits[0]
before_split = remaining[0] + before_split
session().log("1 split found, cutting on token “{}”, keeping “{}”".format(pivot, before_split))
possibility.append(before_split)
remaining = pivot + after_split
else: # Not last element, use the next one as cutter
# Try with (HYPERSIMPLISTIC!) backtracking
@@ -54,15 +96,17 @@ def to_tokens(knowledge_base, text, acc=None):
if remaining.find(token) < 0: # Not immediately after!
break
remaining = remaining[len(token):]
session().annotate("OK, remaining: {}".format(remaining))
session().annotate("OK, remaining: “{}” with {} items".format(remaining, len(tokenization) - (i + 1)))
else:
# Tokenization applicable
found += 1
if remaining == '':
session().log("Concluded possibility “{}”".format(possibility))
yield possibility
else:
for consecuent in to_tokens(knowledge_base, remaining, possibility):
yield list(filter(lambda x: x != '', possibility + consecuent))
with session().log("Continuing with “{}”".format(remaining)):
for consecuent in to_tokens(knowledge_base, remaining, possibility):
yield list(filter(lambda x: x != '', possibility + consecuent))
if found == 0:
raise Exception('No tokenization found')
@@ -108,15 +152,38 @@ def integrate_token_to_text_matching(knowledge_base, text, tokens):
knowledge_base.add_tokenization(tuple(elements))
def pick_one_tokenization(options):
def pick_one_tokenization(options, knowledge_base):
'''
Heuristic function to pick the most probable tokenization.
Just pick the one with more results.
'''
return sorted(options,
key=lambda tokenization: len(tokenization),
reverse=True)[0]
with session().log("Picking among: {} options\n{}".format(len(options), '\n'.join(map(str, options)))):
return pick_by_score(options,
[
# First by number of splits
lambda tokenization: len(tokenization),
# Among them, by number of splits without structural elements
lambda tokenization: sum(map(
lambda split: -sum(map(
lambda se: se in split, knowledge_base.structural_elements
)), tokenization))
])
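# Example for pick_by_score below (hypothetical data):
#   pick_by_score([['a', 'plane'], ['a plane']], [len])
# scores the candidates 2 and 1 and returns ['a', 'plane'], the option that
# scores highest on each heuristic in turn.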
def pick_by_score(options, heuristics):
for heuristic in heuristics:
assert(len(options) > 0)
options = list(map(lambda opt: (heuristic(opt), opt), options))
sorted_options = sorted(options, key=lambda x: x[0], reverse=True)
heuristic_cutoff = sorted_options[0][0]
pass_heuristic = [opt for (score, opt) in sorted_options if score >= heuristic_cutoff] # keep only the options tying the best score
options = pass_heuristic
session().log("{} finalists: \n{}".format(len(options), '\n'.join(map(str, options))))
return options[0]
def make_template(knowledge_base, tokens, parsed):
matcher = list(tokens)
@@ -188,7 +255,7 @@ def integrate_language(knowledge_base, example):
parsed = example["parsed"]
resolved_parsed = copy.deepcopy(parsed)
tokens = list(pick_one_tokenization(to_tokens(knowledge_base, text)))
tokens = list(pick_one_tokenization(to_tokens(knowledge_base, text), knowledge_base))
while True:
session().annotate("P: {}".format(resolved_parsed))

View File

@@ -11,9 +11,9 @@ logging.getLogger().setLevel(logging.ERROR)
tests = (
("tokenization", tokenization),
("basic", basic),
("gac 100", gac_100),
("gac+", gac_extension),
# ("basic", basic),
# ("gac 100", gac_100),
# ("gac+", gac_extension),
)

View File

@@ -65,6 +65,8 @@ def main():
with session().log(example['text']):
tokens = list(knowledge.tokenize(example['text']))
print(tokens)
print(example['tokens'])
assert example['tokens'] == tokens
else: