Dig deeper into the cut-by-token approach.

This commit is contained in:
kenkeiras 2018-04-15 17:47:04 +02:00
parent d601ae3f83
commit 998a183fd2
4 changed files with 86 additions and 16 deletions
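For context: the “cut-by-token” approach named in the commit message splits free text by cutting it wherever an already-known token or structural element occurs, keeping the unknown remainder around the cut as candidate tokens. A minimal sketch of the idea, using hypothetical names and data rather than this repository's API:

def cut_at_first_known(text, known_tokens):
    # Return (before, token, after) for the first known token found in
    # `text`, or None when nothing known occurs in it.
    for token in known_tokens:
        position = text.find(token)
        if position >= 0:
            return text[:position], token, text[position + len(token):]
    return None

# cut_at_first_known("the cat eats fish", ["eats"])
#   -> ("the cat ", "eats", " fish")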

View File

@@ -75,7 +75,7 @@ class KnowledgeBase(object):
session().log("Results:\n{}".format('\n'.join(map(str, options))))
if return_one:
chosen = parsing.pick_one_tokenization(options)
chosen = parsing.pick_one_tokenization(options, self)
session().log("Chosen: “{}”".format(chosen))
return chosen
return options
@@ -92,6 +92,7 @@ class KnowledgeBase(object):
knowledge_before = copy.deepcopy(self.knowledge)
with session().log("Process: {}".format(row)):
tokens = self.tokenize(row)
print(tokens)
fit = parsing.get_fit(self, tokens)
if fit is None:

View File

@@ -13,6 +13,29 @@ from .modifiable_property import ModifiableProperty
from . import parameters
from .atoms import Atom, a
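# Example for the function below (hypothetical data): with
# knowledge_base.structural_elements containing 'is' and
# remaining = 'a plane is a vehicle', the first loop matches 'is' and the
# function returns [('a plane ', 'is', ' a vehicle')]; if neither loop
# finds anything, it returns None.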
def lookahead_for_tokens_or_strucutral_elements(knowledge_base, remaining):
for se in knowledge_base.structural_elements:
found_position = remaining.find(se)
found = found_position >= 0
session().annotate('Looking for structure with “{}”, found? {}'.format(se, found))
if found:
return [
(remaining[:found_position], se, remaining[found_position + len(se):])
]
for token in knowledge_base.knowledge.keys():
found_position = remaining.find(token)
found = found_position >= 0
session().annotate('Looking for token “{}”, found? {}'.format(token, found))
if found:
return [
(remaining[:found_position], token, remaining[found_position + len(token):])
]
return None
def to_tokens(knowledge_base, text, acc=None):
# TODO This is an extra-naïve implementation
found = 0
@@ -33,10 +56,29 @@ def to_tokens(knowledge_base, text, acc=None):
remaining = remaining[len(thing):]
possibility.append(thing)
else:
if i + 1 >= len(tokenization): # Last element
session().annotate("Token not found, considering it all of “{}”".format(remaining))
possibility.append(remaining)
remaining = ""
if i + 1 >= len(tokenization): # Last element, lookahead for tokens/structural elements
with session().log("Token not found, looking ahead for splits on “{}”".format(remaining)):
# If we start with remaining[0:] it's not a real lookahead
# ... and it can get us trapped in infinite recursion
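# (Searching from remaining[1:] guarantees that at least the first character
# is consumed into before_split, so the recursive to_tokens call below never
# sees the same `remaining` again.)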
splits = lookahead_for_tokens_or_strucutral_elements(knowledge_base, remaining[1:])
if splits is None:
session().log("No splits found, keeping remaining as token “{}”".format(remaining))
possibility.append(remaining)
remaining = ""
else:
# Assume there is only one possibility
assert len(splits) == 1
before_split, pivot, after_split = splits[0]
before_split = remaining[0] + before_split
session().log("1 split found, cutting on token “{}”, keeping “{}”".format(pivot, before_split))
possibility.append(before_split)
remaining = pivot + after_split
else: # Not last element, use the next one as cutter
# Try with (HYPERSIMPLISTIC!) backtracking
@@ -54,15 +96,17 @@ def to_tokens(knowledge_base, text, acc=None):
if remaining.find(token) < 0: # Not immediately after!
break
remaining = remaining[len(token):]
session().annotate("OK, remaining: {}".format(remaining))
session().annotate("OK, remaining: “{}” with {} items".format(remaining, len(tokenization) - (i + 1)))
else:
# Tokenization applicable
found += 1
if remaining == '':
session().log("Concluded possibility “{}”".format(possibility))
yield possibility
else:
for consecuent in to_tokens(knowledge_base, remaining, possibility):
yield list(filter(lambda x: x != '', possibility + consecuent))
with session().log("Continuing with “{}”".format(remaining)):
for consecuent in to_tokens(knowledge_base, remaining, possibility):
yield list(filter(lambda x: x != '', possibility + consecuent))
if found == 0:
raise Exception('No tokenization found')
@@ -108,15 +152,38 @@ def integrate_token_to_text_matching(knowledge_base, text, tokens):
knowledge_base.add_tokenization(tuple(elements))
def pick_one_tokenization(options):
def pick_one_tokenization(options, knowledge_base):
'''
Heuristic function to pick the most probable tokenization.
Just pick the one with more results.
'''
return sorted(options,
key=lambda tokenization: len(tokenization),
reverse=True)[0]
with session().log("Picking among: {} options\n{}".format(len(options), '\n'.join(map(str, options)))):
return pick_by_score(options,
[
# First by number of splits
lambda tokenization: len(tokenization),
# Among them, by number of splits without structural elements
lambda tokenization: sum(map(
lambda split: -sum(map(
lambda se: se in split, knowledge_base.structural_elements
)), tokenization))
])
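# Example for pick_by_score below (hypothetical data):
#   pick_by_score([['a', 'plane'], ['a plane']], [len])
# scores the candidates 2 and 1 and returns ['a', 'plane'], the option that
# scores highest on each heuristic in turn.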
def pick_by_score(options, heuristics):
for heuristic in heuristics:
assert(len(options) > 0)
options = list(map(lambda opt: (heuristic(opt), opt), options))
sorted_options = sorted(options, key=lambda x: x[0], reverse=True)
heuristic_cutoff = sorted_options[0][0]
pass_heuristic = [opt for (score, opt) in sorted_options if score >= heuristic_cutoff] # keep only the options tying the best score
options = pass_heuristic
session().log("{} finalists: \n{}".format(len(options), '\n'.join(map(str, options))))
return options[0]
def make_template(knowledge_base, tokens, parsed):
matcher = list(tokens)
@@ -188,7 +255,7 @@ def integrate_language(knowledge_base, example):
parsed = example["parsed"]
resolved_parsed = copy.deepcopy(parsed)
tokens = list(pick_one_tokenization(to_tokens(knowledge_base, text)))
tokens = list(pick_one_tokenization(to_tokens(knowledge_base, text), knowledge_base))
while True:
session().annotate("P: {}".format(resolved_parsed))

View File

@@ -11,9 +11,9 @@ logging.getLogger().setLevel(logging.ERROR)
tests = (
("tokenization", tokenization),
("basic", basic),
("gac 100", gac_100),
("gac+", gac_extension),
# ("basic", basic),
# ("gac 100", gac_100),
# ("gac+", gac_extension),
)

View File

@@ -65,6 +65,8 @@ def main():
with session().log(example['text']):
tokens = list(knowledge.tokenize(example['text']))
print(tokens)
print(example['tokens'])
assert example['tokens'] == tokens
else: