Dig deeper into cut-by-token approach.

kenkeiras 2018-04-15 17:47:04 +02:00
parent d601ae3f83
commit 998a183fd2
4 changed files with 86 additions and 16 deletions

View File

@@ -75,7 +75,7 @@ class KnowledgeBase(object):
         session().log("Results:\n{}".format('\n'.join(map(str, options))))
         if return_one:
-            chosen = parsing.pick_one_tokenization(options)
+            chosen = parsing.pick_one_tokenization(options, self)
             session().log("Chosen: “{}".format(chosen))
             return chosen
         return options
@@ -92,6 +92,7 @@ class KnowledgeBase(object):
         knowledge_before = copy.deepcopy(self.knowledge)
 
         with session().log("Process: {}".format(row)):
             tokens = self.tokenize(row)
+            print(tokens)
             fit = parsing.get_fit(self, tokens)
             if fit is None:

View File

@@ -13,6 +13,29 @@ from .modifiable_property import ModifiableProperty
 from . import parameters
 from .atoms import Atom, a
 
+def lookahead_for_tokens_or_strucutral_elements(knowledge_base, remaining):
+    for se in knowledge_base.structural_elements:
+        found_position = remaining.find(se)
+        found = found_position >= 0
+        session().annotate('Looking for structure with “{}”, found? {}'.format(se, found))
+        if found:
+            return [
+                (remaining[:found_position], se, remaining[found_position + len(se):])
+            ]
+
+    for token in knowledge_base.knowledge.keys():
+        found_position = remaining.find(token)
+        found = found_position >= 0
+        session().annotate('Looking for token “{}”, found? {}'.format(token, found))
+        if found:
+            return [
+                (remaining[:found_position], token, remaining[found_position + len(token):])
+            ]
+
+    return None
+
+
 def to_tokens(knowledge_base, text, acc=None):
     # TODO This is an extra-naïve implementation
     found = 0
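Editor's note: the new lookahead helper returns at most one split, built around the first structural element found in the remaining text or, failing that, the first known token. A rough standalone sketch of the same search, returning the triple directly instead of a one-element list; the toy knowledge base and vocabulary below are invented for illustration:

class ToyKB:
    # invented stand-in for the real knowledge base
    structural_elements = ['is', 'of']
    knowledge = {'sky': None, 'blue': None}

def lookahead_split(kb, remaining):
    # first matching structural element wins, then known tokens; None if nothing matches
    for pivot in list(kb.structural_elements) + list(kb.knowledge.keys()):
        position = remaining.find(pivot)
        if position >= 0:
            return (remaining[:position], pivot, remaining[position + len(pivot):])
    return None

print(lookahead_split(ToyKB(), 'the sky is blue'))
# ('the sky ', 'is', ' blue')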
@@ -33,10 +56,29 @@ def to_tokens(knowledge_base, text, acc=None):
                 remaining = remaining[len(thing):]
                 possibility.append(thing)
             else:
-                if i + 1 >= len(tokenization):  # Last element
-                    session().annotate("Token not found, considering it all of “{}".format(remaining))
-                    possibility.append(remaining)
-                    remaining = ""
+                if i + 1 >= len(tokenization):  # Last element, lookahead for tokens/structural elements
+                    with session().log("Token not found, looking ahead for splits on “{}".format(remaining)):
+                        # If we start with remaining[0:] it's not a real lookahead
+                        # ... and it can get us trapped on infinite recursion
+                        splits = lookahead_for_tokens_or_strucutral_elements(knowledge_base, remaining[1:])
+
+                        if splits is None:
+                            session().log("No splits found, keeping remaining as token “{}".format(remaining))
+                            possibility.append(remaining)
+                            remaining = ""
+
+                        else:
+                            # Consider we only have one possibility
+                            assert len(splits) == 1
+
+                            before_split, pivot, after_split = splits[0]
+                            before_split = remaining[0] + before_split
+
+                            session().log("1 split found, cutting on token “{}”, keeping “{}".format(found, before_split))
+
+                            possibility.append(before_split)
+                            remaining = pivot + after_split
                 else:  # Not las element, use the next one as cutter
                     # Try with (HYPERSIMPLISTIC!) backtracking
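Editor's note: the lookahead runs on remaining[1:] rather than remaining itself, since a token sitting at position 0 would otherwise be re-found on every recursion and the remaining text would never shrink; the skipped character is glued back onto the prefix before it is stored. A hand-traced example of that cut step, with invented values:

remaining = 'skyis blue'                     # unknown prefix stuck to a known token
splits = [('ky', 'is', ' blue')]             # what the lookahead finds in remaining[1:] == 'kyis blue'
before_split, pivot, after_split = splits[0]
before_split = remaining[0] + before_split   # 's' + 'ky' -> 'sky'
# 'sky' is kept as a freshly cut token and parsing resumes on 'is blue'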
@@ -54,15 +96,17 @@ def to_tokens(knowledge_base, text, acc=None):
                         if remaining.find(token) < 0:  # Not inmediately after!
                             break
                         remaining = remaining[len(token):]
-                        session().annotate("OK, remaining: {}".format(remaining))
+                        session().annotate("OK, remaining: {}” with {} items".format(remaining, len(tokenization) - (i + 1)))
                     else:
                         # Tokenization applicable
                         found += 1
                         if remaining == '':
+                            session().log("Concluded possibility “{}".format(possibility))
                             yield possibility
                         else:
-                            for consecuent in to_tokens(knowledge_base, remaining, possibility):
-                                yield list(filter(lambda x: x != '', possibility + consecuent))
+                            with session().log("Continuing with “{}".format(remaining)):
+                                for consecuent in to_tokens(knowledge_base, remaining, possibility):
+                                    yield list(filter(lambda x: x != '', possibility + consecuent))
     if found == 0:
         raise Exception('No tokenization found')
@@ -108,15 +152,38 @@ def integrate_token_to_text_matching(knowledge_base, text, tokens):
     knowledge_base.add_tokenization(tuple(elements))
 
 
-def pick_one_tokenization(options):
+def pick_one_tokenization(options, knowledge_base):
     '''
     Heuristic function to pick the most probable tokenization.
     Just pick the one with more results.
     '''
-    return sorted(options,
-                  key=lambda tokenization: len(tokenization),
-                  reverse=True)[0]
+    with session().log("Picking among: {} options\n{}".format(len(options), '\n'.join(map(str, options)))):
+        return pick_by_score(options,
+                             [
+                                 # First by number of splits
+                                 lambda tokenization: len(tokenization),
+
+                                 # Among them, by number of splits without structuring elements
+                                 lambda tokenization: sum(map(
+                                     lambda split: -sum(map(
+                                         lambda se: se in split, knowledge_base.structural_elements
+                                     )), tokenization))
+                             ])
+
+
+def pick_by_score(options, heuristics):
+    for heuristic in heuristics:
+        assert(len(options) > 0)
+
+        options = list(map(lambda opt: (heuristic(opt), opt), options))
+        sorted_options = sorted(options, key=lambda x: x[0], reverse=True)
+        heuristic_cutoff = sorted_options[0][0]
+        pass_heuristic = [opt for (score, opt) in sorted_options if score <= heuristic_cutoff]
+        options = pass_heuristic
+
+        session().log("{} finalists: \n{}".format(len(options), '\n'.join(map(str, options))))
+
+    return options[0]
 
 
 def make_template(knowledge_base, tokens, parsed):
     matcher = list(tokens)
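Editor's note: pick_by_score applies the heuristics in order, each one narrowing the candidate list before the next runs, and the first survivor is returned. A minimal standalone sketch of that filtering idea, assuming the intent is to keep only the options tied with the best score at each stage (function name and data are illustrative, not taken from the repository):

def pick_by_score_sketch(options, heuristics):
    for heuristic in heuristics:
        assert len(options) > 0
        best = max(heuristic(opt) for opt in options)
        options = [opt for opt in options if heuristic(opt) == best]
    return options[0]

# More splits win first; remaining ties would be decided by later heuristics.
print(pick_by_score_sketch(
    [['a', 'plane'], ['a', 'plane', 'flies']],
    [len]))
# ['a', 'plane', 'flies']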
@@ -188,7 +255,7 @@ def integrate_language(knowledge_base, example):
     parsed = example["parsed"]
     resolved_parsed = copy.deepcopy(parsed)
 
-    tokens = list(pick_one_tokenization(to_tokens(knowledge_base, text)))
+    tokens = list(pick_one_tokenization(to_tokens(knowledge_base, text), knowledge_base))
     while True:
         session().annotate("P: {}".format(resolved_parsed))

View File

@@ -11,9 +11,9 @@ logging.getLogger().setLevel(logging.ERROR)
 tests = (
     ("tokenization", tokenization),
-    ("basic", basic),
-    ("gac 100", gac_100),
-    ("gac+", gac_extension),
+    # ("basic", basic),
+    # ("gac 100", gac_100),
+    # ("gac+", gac_extension),
 )

View File

@@ -65,6 +65,8 @@ def main():
         with session().log(example['text']):
             tokens = list(knowledge.tokenize(example['text']))
 
+            print(tokens)
+            print(example['tokens'])
             assert example['tokens'] == tokens
     else: