Add (non-passing) tokenization.

kenkeiras 2018-04-01 20:24:09 +02:00
parent 75174e1736
commit fc37450565
7 changed files with 229 additions and 11 deletions


@@ -11,11 +11,105 @@ from functools import reduce
from typing import List, Dict
from .modifiable_property import ModifiableProperty
from . import parameters
from .atoms import Atom, a
# TODO: more flexible tokenization
def to_tokens(text):
    return re.findall(r'(\w+|[^\s])', text)
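# Illustrative aside (not part of this commit): the regex-based tokenizer above
# splits on word runs and single punctuation marks, e.g.
# re.findall(r'(\w+|[^\s])', 'Hello, world') -> ['Hello', ',', 'world'].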
def to_tokens(knowledge_base, text, acc=None):
    # TODO This is an extra-naïve implementation
    found = 0

    for tokenization in knowledge_base.tokenization:
        remaining = text
        possibility = []

        for i, token in enumerate(tokenization):
            if token == Atom('token'):
                for thing in knowledge_base.knowledge.keys():
                    if remaining.startswith(thing):
                        # TODO We should also branch here, probably :\
                        remaining = remaining[len(thing):]
                        possibility.append(thing)
                else:
                    if i + 1 >= len(tokenization):
                        possibility.append(remaining)
                        remaining = ""
                    else:
                        # Try with (HYPERSIMPLISTIC!) backtracking
                        # Cut using the next token we should use more!!!
                        next_token = tokenization[i + 1]
                        cutoff = remaining.find(next_token)
                        if cutoff < 0:
                            break
                        possibility.append(remaining[:cutoff])
                        remaining = remaining[cutoff:]
            else:
                if remaining.find(token) < 0:  # Not immediately after!
                    break
                remaining = remaining[len(token):]
        else:
            # Tokenization applicable
            found += 1
            if remaining == '':
                yield possibility
            else:
                for consecuent in to_tokens(knowledge_base, remaining, possibility):
                    yield list(filter(lambda x: x != '', possibility + consecuent))

    if found == 0:
        raise Exception('No tokenization found')
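# Usage sketch (illustrative, not part of this commit): assuming a knowledge
# base whose `tokenization` set holds patterns like (Atom('token'), ' ') and
# whose `knowledge` dict keys include surface forms such as 'my' and 'dog',
# the generator is meant to yield candidate token lists:
#
#     for candidate in to_tokens(knowledge_base, 'my dog'):
#         print(candidate)  # e.g. ['my', 'dog']
#
# As the commit title notes, this implementation is still non-passing, so the
# exact candidates (or a 'No tokenization found' exception) may vary.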
def integrate_tokenization(knowledge_base, example):
    text = example['text']
    tokens = example['tokens']
    meaning = example.get('meaning')

    return integrate_token_to_text_matching(knowledge_base, text, tokens)
def integrate_token_to_text_matching(knowledge_base, text, tokens):
    texts = [text]

    # Convert to tokens
    for token_id, token in enumerate(tokens):
        # Look for token in texts
        for i, text in enumerate(texts):
            if isinstance(text, int):
                continue

            if token in text:
                before, after = text.split(token, maxsplit=1)
                texts = (texts[:i] + [before]
                         + [token_id]
                         + [after] + texts[i + 1:])
                break
        else:
            raise Exception('Token not found')

    # Remove leftovers from splits
    texts = list(filter(lambda x: x != '', texts))

    for token_id, _token in enumerate(tokens):
        # Find all elements between current token and next token
        i = texts.index(token_id)
        elements = [a('token')]

        i += 1
        while i < len(texts) and not isinstance(texts[i], int):
            elements.append(texts[i])
            i += 1

        knowledge_base.tokenization.add(tuple(elements))
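# Usage sketch (illustrative, not part of this commit): integrating a
# text/tokens example records one (a('token'), separator...) pattern per token
# in knowledge_base.tokenization (assumed here to be a set):
#
#     integrate_tokenization(knowledge_base,
#                            {'text': 'my dog barks',
#                             'tokens': ['my', 'dog', 'barks']})
#     # knowledge_base.tokenization now holds (a('token'), ' ') for the
#     # space-separated tokens and (a('token'),) for the final one.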
def pick_one_tokenization(options):
    '''
    Heuristic function to pick the most probable tokenization.

    Just pick the one with the most tokens.
    '''
    return sorted(options,
                  key=lambda tokenization: len(tokenization),
                  reverse=True)[0]
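# Usage sketch (illustrative, not part of this commit): the heuristic simply
# prefers the candidate with the most tokens:
#
#     pick_one_tokenization([['abc'], ['a', 'b', 'c']])  # -> ['a', 'b', 'c']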
def make_template(knowledge_base, tokens, parsed):
    matcher = list(tokens)
@@ -87,7 +181,7 @@ def integrate_language(knowledge_base, example):
    parsed = example["parsed"]
    resolved_parsed = copy.deepcopy(parsed)
    tokens = to_tokens(text)
    tokens = list(pick_one_tokenization(to_tokens(knowledge_base, text)))
    while True:
        session().annotate("P: {}".format(resolved_parsed))