Add (non-passing) tokenization.
This commit is contained in:
parent
75174e1736
commit
fc37450565
7 changed files with 229 additions and 11 deletions
@@ -11,11 +11,105 @@ from functools import reduce
from typing import List, Dict
from .modifiable_property import ModifiableProperty
from . import parameters
from .atoms import Atom, a

# TODO: more flexible tokenization
def to_tokens(text):
    return re.findall(r'(\w+|[^\s])', text)

def to_tokens(knowledge_base, text, acc=None):
    # TODO This is an extra-naïve implementation
    found = 0

    for tokenization in knowledge_base.tokenization:
        remaining = text
        possibility = []

        for i, token in enumerate(tokenization):
            if token == Atom('token'):
                for thing in knowledge_base.knowledge.keys():
                    if remaining.startswith(thing):
                        # TODO We should also branch here, probably :\
                        remaining = remaining[len(thing):]
                        possibility.append(thing)
                else:
                    if i + 1 >= len(tokenization):
                        possibility.append(remaining)
                        remaining = ""

                    else:
                        # Try with (HYPERSIMPLISTIC!) backtracking
                        # Cut using the next token we should use more!!!
                        next_token = tokenization[i + 1]
                        cutoff = remaining.find(next_token)
                        if cutoff < 0:
                            break

                        possibility.append(remaining[:cutoff])
                        remaining = remaining[cutoff:]
            else:
                if remaining.find(token) < 0:  # Not immediately after!
                    break
                remaining = remaining[len(token):]

        else:
            # Tokenization applicable
            found += 1
            if remaining == '':
                yield possibility
            else:
                for consecuent in to_tokens(knowledge_base, remaining, possibility):
                    yield list(filter(lambda x: x != '', possibility + consecuent))
    if found == 0:
        raise Exception('No tokenization found')

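A minimal driving sketch (not part of the commit): the generator walks each stored tokenization pattern and yields candidate token lists. The Atom and knowledge-base stand-ins below are assumptions made only so the snippet is self-contained next to a pasted copy of to_tokens; the project's real types come from its .atoms module and its knowledge-base class.

    from collections import namedtuple

    # Assumed stand-ins, not the project's real classes.
    Atom = namedtuple('Atom', ['name'])
    StubKB = namedtuple('StubKB', ['tokenization', 'knowledge'])

    kb = StubKB(
        tokenization={(Atom('token'), ' ', Atom('token'))},  # pattern: token, space, token
        knowledge={'hello': {}, 'world': {}},                # known surface forms
    )

    # Each yielded possibility is a list of substrings (possibly with empty entries).
    for possibility in to_tokens(kb, 'hello world'):
        print(possibility)
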
def integrate_tokenization(knowledge_base, example):
    text = example['text']
    tokens = example['tokens']
    meaning = example.get('meaning')

    return integrate_token_to_text_matching(knowledge_base, text, tokens)

def integrate_token_to_text_matching(knowledge_base, text, tokens):
    texts = [text]

    # Convert to tokens
    for token_id, token in enumerate(tokens):
        # Look for token in texts
        for i, text in enumerate(texts):
            if isinstance(text, int):
                continue

            if token in text:
                before, after = text.split(token, maxsplit=1)
                texts = (texts[:i] + [before]
                         + [token_id]
                         + [after] + texts[i + 1:])
                break
        else:
            raise Exception('Token not found')

    # Remove leftovers from splits
    texts = list(filter(lambda x: x != '', texts))

    for token_id, _token in enumerate(tokens):
        # Find all elements between current token and next token
        i = texts.index(token_id)
        elements = [a('token')]

        i += 1
        while i < len(texts) and not isinstance(texts[i], int):
            elements.append(texts[i])
            i += 1

        knowledge_base.tokenization.add(tuple(elements))

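A hedged illustration of what this matching step records, again with stand-ins (the real a() constructor and knowledge base live in the project's own modules): splitting 'hello world' around the tokens ['hello', 'world'] stores one pattern per token, each made of a token atom followed by whatever literal text separates it from the next token.

    # Assumed stand-ins so the function above can be exercised in isolation.
    class StubKB:
        def __init__(self):
            self.tokenization = set()

    def a(name):  # stand-in for the project's atom constructor
        return ('atom', name)

    kb = StubKB()
    integrate_token_to_text_matching(kb, 'hello world', ['hello', 'world'])
    print(kb.tokenization)
    # Contains two patterns: (('atom', 'token'), ' ') and (('atom', 'token'),)
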
def pick_one_tokenization(options):
    '''
    Heuristic function to pick the most probable tokenization.

    Just pick the one with the most results.
    '''
    return sorted(options,
                  key=lambda tokenization: len(tokenization),
                  reverse=True)[0]

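For illustration (values invented for the example), the heuristic simply favours the candidate with more elements:

    pick_one_tokenization([['hello world'],
                           ['hello', ' ', 'world']])
    # -> ['hello', ' ', 'world'], since three elements beat one
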
def make_template(knowledge_base, tokens, parsed):
    matcher = list(tokens)

@@ -87,7 +181,7 @@ def integrate_language(knowledge_base, example):
    parsed = example["parsed"]

    resolved_parsed = copy.deepcopy(parsed)
    tokens = to_tokens(text)
    tokens = list(pick_one_tokenization(to_tokens(knowledge_base, text)))

    while True:
        session().annotate("P: {}".format(resolved_parsed))