From 998a183fd2bdcf8b89f1f0e18c22f64ca878af8f Mon Sep 17 00:00:00 2001
From: kenkeiras <kenkeiras@codigoparallevar.com>
Date: Sun, 15 Apr 2018 17:47:04 +0200
Subject: [PATCH] Dig deeper in cut-by-token approach.

---
 naive-nlu/tree_nlu/knowledge_base.py     |  3 +-
 naive-nlu/tree_nlu/parsing.py            | 91 ++++++++++++++++++++----
 naive-nlu/tree_nlu/test.py               |  6 +-
 naive-nlu/tree_nlu/tests/tokenization.py |  2 +
 4 files changed, 86 insertions(+), 16 deletions(-)

diff --git a/naive-nlu/tree_nlu/knowledge_base.py b/naive-nlu/tree_nlu/knowledge_base.py
index b796d43..3e09ec6 100644
--- a/naive-nlu/tree_nlu/knowledge_base.py
+++ b/naive-nlu/tree_nlu/knowledge_base.py
@@ -75,7 +75,7 @@ class KnowledgeBase(object):
             session().log("Results:\n{}".format('\n'.join(map(str, options))))
 
             if return_one:
-                chosen = parsing.pick_one_tokenization(options)
+                chosen = parsing.pick_one_tokenization(options, self)
                 session().log("Chosen: “{}”".format(chosen))
                 return chosen
             return options
@@ -92,6 +92,7 @@ class KnowledgeBase(object):
         knowledge_before = copy.deepcopy(self.knowledge)
         with session().log("Process: {}".format(row)):
             tokens = self.tokenize(row)
+            print(tokens)
 
             fit = parsing.get_fit(self, tokens)
             if fit is None:
diff --git a/naive-nlu/tree_nlu/parsing.py b/naive-nlu/tree_nlu/parsing.py
index 1450636..5683943 100644
--- a/naive-nlu/tree_nlu/parsing.py
+++ b/naive-nlu/tree_nlu/parsing.py
@@ -13,6 +13,29 @@ from .modifiable_property import ModifiableProperty
 from . import parameters
 from .atoms import Atom, a
 
+def lookahead_for_tokens_or_strucutral_elements(knowledge_base, remaining):
+    for se in knowledge_base.structural_elements:
+        found_position = remaining.find(se)
+        found = found_position >= 0
+        session().annotate('Looking for structure with “{}”, found? {}'.format(se, found))
+        if found:
+            return [
+                (remaining[:found_position], se, remaining[found_position + len(se):])
+            ]
+
+    for token in knowledge_base.knowledge.keys():
+        found_position = remaining.find(token)
+        found = found_position >= 0
+        session().annotate('Looking for token “{}”, found? {}'.format(token, found))
+        if found:
+            return [
+                (remaining[:found_position], token, remaining[found_position + len(token):])
+            ]
+
+    return None
+
+
+
 def to_tokens(knowledge_base, text, acc=None):
     # TODO This is an extra-naïve implementation
     found = 0
@@ -33,10 +56,29 @@ def to_tokens(knowledge_base, text, acc=None):
                         remaining = remaining[len(thing):]
                         possibility.append(thing)
                 else:
-                    if i + 1 >= len(tokenization):  # Last element
-                        session().annotate("Token not found, considering it all of  “{}”".format(remaining))
-                        possibility.append(remaining)
-                        remaining = ""
+                    if i + 1 >= len(tokenization):  # Last element, lookahead for tokens/structural elements
+                        with session().log("Token not found, looking ahead for splits on “{}”".format(remaining)):
+                            # If we start with remaining[0:] it's not a real lookahead
+                            # ... and it can get us trapped on infinite recursion
+                            splits = lookahead_for_tokens_or_strucutral_elements(knowledge_base, remaining[1:])
+
+                            if splits is None:
+                                session().log("No splits found, keeping remaining as token “{}”".format(remaining))
+
+                                possibility.append(remaining)
+                                remaining = ""
+
+                            else:
+                                # Consider we only have one possibility
+                                assert len(splits) == 1
+
+                                before_split, pivot, after_split = splits[0]
+                                before_split = remaining[0] + before_split
+
+                                session().log("1 split found, cutting on token “{}”, keeping “{}”".format(found, before_split))
+
+                                possibility.append(before_split)
+                                remaining = pivot + after_split
 
                     else:  # Not las element, use the next one as cutter
                         # Try with (HYPERSIMPLISTIC!) backtracking
@@ -54,15 +96,17 @@ def to_tokens(knowledge_base, text, acc=None):
                 if remaining.find(token) < 0: # Not inmediately after!
                     break
                 remaining = remaining[len(token):]
-            session().annotate("OK, remaining: {}".format(remaining))
+            session().annotate("OK, remaining: “{}” with {} items".format(remaining, len(tokenization) - (i + 1)))
         else:
             # Tokenization applicable
             found += 1
             if remaining == '':
+                session().log("Concluded possibility “{}”".format(possibility))
                 yield possibility
             else:
-                for consecuent in to_tokens(knowledge_base, remaining, possibility):
-                    yield list(filter(lambda x: x != '', possibility + consecuent))
+                with session().log("Continuing with “{}”".format(remaining)):
+                    for consecuent in to_tokens(knowledge_base, remaining, possibility):
+                        yield list(filter(lambda x: x != '', possibility + consecuent))
     if found == 0:
         raise Exception('No tokenization found')
 
@@ -108,15 +152,38 @@ def integrate_token_to_text_matching(knowledge_base, text, tokens):
 
         knowledge_base.add_tokenization(tuple(elements))
 
-def pick_one_tokenization(options):
+def pick_one_tokenization(options, knowledge_base):
     '''
     Heuristic function to pick the most probable tokenization.
 
     Just pick the one with more results.
     '''
-    return sorted(options,
-                  key=lambda tokenization: len(tokenization),
-                  reverse=True)[0]
+    with session().log("Picking among: {} options\n{}".format(len(options), '\n'.join(map(str, options)))):
+        return pick_by_score(options,
+                             [
+                                 # First by number of splits
+                                 lambda tokenization: len(tokenization),
+
+                                 # Among them, by number of splits without structuring elements
+                                 lambda tokenization: sum(map(
+                                     lambda split: -sum(map(
+                                         lambda se: se in split, knowledge_base.structural_elements
+                                     )), tokenization))
+                             ])
+
+def pick_by_score(options, heuristics):
+    for heuristic in heuristics:
+        assert(len(options) > 0)
+        options = list(map(lambda opt: (heuristic(opt), opt), options))
+        sorted_options = sorted(options, key=lambda x: x[0], reverse=True)
+
+        heuristic_cutoff = sorted_options[0][0]
+        pass_heuristic = [opt for (score, opt) in sorted_options if score <= heuristic_cutoff]
+        options = pass_heuristic
+
+    session().log("{} finalists: \n{}".format(len(options), '\n'.join(map(str, options))))
+    return options[0]
+
 
 def make_template(knowledge_base, tokens, parsed):
     matcher = list(tokens)
@@ -188,7 +255,7 @@ def integrate_language(knowledge_base, example):
     parsed = example["parsed"]
 
     resolved_parsed = copy.deepcopy(parsed)
-    tokens = list(pick_one_tokenization(to_tokens(knowledge_base, text)))
+    tokens = list(pick_one_tokenization(to_tokens(knowledge_base, text), knowledge_base))
 
     while True:
         session().annotate("P: {}".format(resolved_parsed))
diff --git a/naive-nlu/tree_nlu/test.py b/naive-nlu/tree_nlu/test.py
index 683f85e..11cd561 100644
--- a/naive-nlu/tree_nlu/test.py
+++ b/naive-nlu/tree_nlu/test.py
@@ -11,9 +11,9 @@ logging.getLogger().setLevel(logging.ERROR)
 
 tests = (
     ("tokenization", tokenization),
-    ("basic", basic),
-    ("gac 100", gac_100),
-    ("gac+", gac_extension),
+    # ("basic", basic),
+    # ("gac 100", gac_100),
+    # ("gac+", gac_extension),
 )
 
 
diff --git a/naive-nlu/tree_nlu/tests/tokenization.py b/naive-nlu/tree_nlu/tests/tokenization.py
index 0bc1a80..4b91dae 100644
--- a/naive-nlu/tree_nlu/tests/tokenization.py
+++ b/naive-nlu/tree_nlu/tests/tokenization.py
@@ -65,6 +65,8 @@ def main():
             with session().log(example['text']):
                 tokens = list(knowledge.tokenize(example['text']))
 
+                print(tokens)
+                print(example['tokens'])
                 assert example['tokens'] == tokens
 
         else: