Add (non-passing) tokenization.

kenkeiras 2018-04-01 20:24:09 +02:00
parent 75174e1736
commit fc37450565
7 changed files with 229 additions and 11 deletions


@@ -11,11 +11,105 @@ from functools import reduce
from typing import List, Dict
from .modifiable_property import ModifiableProperty
from . import parameters
from .atoms import Atom, a
# TODO: more flexible tokenization
def to_tokens(text):
    return re.findall(r'(\w+|[^\s])', text)
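# Illustrative aside (not part of this commit): the regex-based tokenizer above
# splits on word runs and single punctuation marks, e.g.
# re.findall(r'(\w+|[^\s])', 'Hello, world') -> ['Hello', ',', 'world'].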
def to_tokens(knowledge_base, text, acc=None):
    # TODO This is an extra-naïve implementation
    found = 0

    for tokenization in knowledge_base.tokenization:
        remaining = text
        possibility = []

        for i, token in enumerate(tokenization):
            if token == Atom('token'):
                for thing in knowledge_base.knowledge.keys():
                    if remaining.startswith(thing):
                        # TODO We should also branch here, probably :\
                        remaining = remaining[len(thing):]
                        possibility.append(thing)
                else:
                    if i + 1 >= len(tokenization):
                        possibility.append(remaining)
                        remaining = ""
                    else:
                        # Try with (HYPERSIMPLISTIC!) backtracking
                        # Cut using the next token we should use more!!!
                        next_token = tokenization[i + 1]
                        cutoff = remaining.find(next_token)
                        if cutoff < 0:
                            break
                        possibility.append(remaining[:cutoff])
                        remaining = remaining[cutoff:]
            else:
                if remaining.find(token) < 0:  # Not immediately after!
                    break
                remaining = remaining[len(token):]
        else:
            # Tokenization applicable
            found += 1
            if remaining == '':
                yield possibility
            else:
                for consecuent in to_tokens(knowledge_base, remaining, possibility):
                    yield list(filter(lambda x: x != '', possibility + consecuent))

    if found == 0:
        raise Exception('No tokenization found')
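# Usage sketch (illustrative, not part of this commit): assuming a knowledge
# base whose `tokenization` set holds patterns like (Atom('token'), ' ') and
# whose `knowledge` dict keys include surface forms such as 'my' and 'dog',
# the generator is meant to yield candidate token lists:
#
#     for candidate in to_tokens(knowledge_base, 'my dog'):
#         print(candidate)  # e.g. ['my', 'dog']
#
# As the commit title notes, this implementation is still non-passing, so the
# exact candidates (or a 'No tokenization found' exception) may vary.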
def integrate_tokenization(knowledge_base, example):
    text = example['text']
    tokens = example['tokens']
    meaning = example.get('meaning')

    return integrate_token_to_text_matching(knowledge_base, text, tokens)
def integrate_token_to_text_matching(knowledge_base, text, tokens):
    texts = [text]

    # Convert to tokens
    for token_id, token in enumerate(tokens):
        # Look for token in texts
        for i, text in enumerate(texts):
            if isinstance(text, int):
                continue

            if token in text:
                before, after = text.split(token, maxsplit=1)
                texts = (texts[:i] + [before]
                         + [token_id]
                         + [after] + texts[i + 1:])
                break
        else:
            raise Exception('Token not found')

    # Remove leftovers from splits
    texts = list(filter(lambda x: x != '', texts))

    for token_id, _token in enumerate(tokens):
        # Find all elements between current token and next token
        i = texts.index(token_id)
        elements = [a('token')]

        i += 1
        while i < len(texts) and not isinstance(texts[i], int):
            elements.append(texts[i])
            i += 1

        knowledge_base.tokenization.add(tuple(elements))
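# Usage sketch (illustrative, not part of this commit): integrating a
# text/tokens example records one (a('token'), separator...) pattern per token
# in knowledge_base.tokenization (assumed here to be a set):
#
#     integrate_tokenization(knowledge_base,
#                            {'text': 'my dog barks',
#                             'tokens': ['my', 'dog', 'barks']})
#     # knowledge_base.tokenization now holds (a('token'), ' ') for the
#     # space-separated tokens and (a('token'),) for the final one.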
def pick_one_tokenization(options):
    '''
    Heuristic function to pick the most probable tokenization.

    Just pick the one with the most tokens.
    '''
    return sorted(options,
                  key=lambda tokenization: len(tokenization),
                  reverse=True)[0]
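# Usage sketch (illustrative, not part of this commit): the heuristic simply
# prefers the candidate with the most tokens:
#
#     pick_one_tokenization([['abc'], ['a', 'b', 'c']])  # -> ['a', 'b', 'c']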
def make_template(knowledge_base, tokens, parsed):
    matcher = list(tokens)
@@ -87,7 +181,7 @@ def integrate_language(knowledge_base, example):
    parsed = example["parsed"]
    resolved_parsed = copy.deepcopy(parsed)
    tokens = to_tokens(text)
    tokens = list(pick_one_tokenization(to_tokens(knowledge_base, text)))
    while True:
        session().annotate("P: {}".format(resolved_parsed))