Dig deeper into the cut-by-token approach.
parent d601ae3f83
commit 998a183fd2
@@ -75,7 +75,7 @@ class KnowledgeBase(object):
         session().log("Results:\n{}".format('\n'.join(map(str, options))))

         if return_one:
-            chosen = parsing.pick_one_tokenization(options)
+            chosen = parsing.pick_one_tokenization(options, self)
             session().log("Chosen: “{}”".format(chosen))
             return chosen
         return options
@@ -92,6 +92,7 @@ class KnowledgeBase(object):
         knowledge_before = copy.deepcopy(self.knowledge)
         with session().log("Process: {}".format(row)):
             tokens = self.tokenize(row)
+            print(tokens)

             fit = parsing.get_fit(self, tokens)
             if fit is None:
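
The only functional change in these two hunks is that the picker now also receives the knowledge base (`self`). A minimal stand-in for the `return_one` branch of `tokenize()` above, with assumed toy names and a trivial picker instead of the project code:

# Toy stand-in: collect candidate tokenizations, then either hand them all
# back or let a picker choose one (the diff now also passes the knowledge
# base to that picker).
def tokenize(options, return_one=True,
             pick=lambda opts, kb=None: max(opts, key=len)):  # assumed picker
    if return_one:
        return pick(options, kb=None)
    return options

candidates = [['icecream is red'], ['icecream', ' is ', 'red']]
print(tokenize(candidates))         # -> ['icecream', ' is ', 'red']
print(tokenize(candidates, False))  # -> both candidates, untouched
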
@@ -13,6 +13,29 @@ from .modifiable_property import ModifiableProperty
 from . import parameters
 from .atoms import Atom, a

+
+def lookahead_for_tokens_or_strucutral_elements(knowledge_base, remaining):
+    for se in knowledge_base.structural_elements:
+        found_position = remaining.find(se)
+        found = found_position >= 0
+        session().annotate('Looking for structure with “{}”, found? {}'.format(se, found))
+        if found:
+            return [
+                (remaining[:found_position], se, remaining[found_position + len(se):])
+            ]
+
+    for token in knowledge_base.knowledge.keys():
+        found_position = remaining.find(token)
+        found = found_position >= 0
+        session().annotate('Looking for token “{}”, found? {}'.format(token, found))
+        if found:
+            return [
+                (remaining[:found_position], token, remaining[found_position + len(token):])
+            ]
+
+    return None
+
+
 def to_tokens(knowledge_base, text, acc=None):
     # TODO This is an extra-naïve implementation
     found = 0
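
The new helper scans for the first known structural element, then for the first known token, and returns a single (before, pivot, after) cut. A minimal sketch of the same first-match idea on a toy knowledge base (assumed names, `session()` logging omitted):

class ToyKB:
    structural_elements = [' is ', '?']           # assumed separators
    knowledge = {'icecream': None, 'red': None}   # assumed known tokens

def lookahead_split(kb, remaining):
    # Structural elements are tried before known tokens; the first hit wins.
    for candidate in list(kb.structural_elements) + list(kb.knowledge.keys()):
        pos = remaining.find(candidate)
        if pos >= 0:
            return [(remaining[:pos], candidate, remaining[pos + len(candidate):])]
    return None  # nothing known appears in the remaining text

print(lookahead_split(ToyKB(), 'he icecream is red'))
# -> [('he icecream', ' is ', 'red')]
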
@@ -33,11 +56,30 @@ def to_tokens(knowledge_base, text, acc=None):
                 remaining = remaining[len(thing):]
                 possibility.append(thing)
             else:
-                if i + 1 >= len(tokenization):  # Last element
-                    session().annotate("Token not found, considering it all of “{}”".format(remaining))
+                if i + 1 >= len(tokenization):  # Last element, lookahead for tokens/structural elements
+                    with session().log("Token not found, looking ahead for splits on “{}”".format(remaining)):
+                        # If we start with remaining[0:] it's not a real lookahead
+                        # ... and it can get us trapped on infinite recursion
+                        splits = lookahead_for_tokens_or_strucutral_elements(knowledge_base, remaining[1:])
+
+                        if splits is None:
+                            session().log("No splits found, keeping remaining as token “{}”".format(remaining))
+
                             possibility.append(remaining)
                             remaining = ""

+                        else:
+                            # Consider we only have one possibility
+                            assert len(splits) == 1
+
+                            before_split, pivot, after_split = splits[0]
+                            before_split = remaining[0] + before_split
+
+                            session().log("1 split found, cutting on token “{}”, keeping “{}”".format(found, before_split))
+
+                            possibility.append(before_split)
+                            remaining = pivot + after_split
+
                 else:  # Not las element, use the next one as cutter
                     # Try with (HYPERSIMPLISTIC!) backtracking
                     # Cut using the next token we should use more!!!
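
The off-by-one in the cut is deliberate: the lookahead runs on remaining[1:], so even a candidate sitting at position 0 cannot produce a cut that consumes nothing, and before_split then gets the skipped character back. A worked example with assumed toy values (logging omitted):

remaining = 'heicecream'   # unknown prefix glued onto a known token
known = 'icecream'

# Look ahead in remaining[1:]; searching from position 0 could return an empty
# `before`, consume nothing, and recurse forever on the same input.
pos = remaining[1:].find(known)
before, pivot, after = remaining[1:][:pos], known, remaining[1:][pos + len(known):]

before_split = remaining[0] + before   # re-attach the character skipped by [1:]
print(before_split)                    # -> 'he'        (kept as its own token)
print(pivot + after)                   # -> 'icecream'  (the new `remaining`)
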
@@ -54,13 +96,15 @@ def to_tokens(knowledge_base, text, acc=None):
                     if remaining.find(token) < 0:  # Not inmediately after!
                         break
                     remaining = remaining[len(token):]
-                    session().annotate("OK, remaining: {}".format(remaining))
+                    session().annotate("OK, remaining: “{}” with {} items".format(remaining, len(tokenization) - (i + 1)))
                 else:
                     # Tokenization applicable
                     found += 1
                     if remaining == '':
+                        session().log("Concluded possibility “{}”".format(possibility))
                         yield possibility
                     else:
+                        with session().log("Continuing with “{}”".format(remaining)):
                             for consecuent in to_tokens(knowledge_base, remaining, possibility):
                                 yield list(filter(lambda x: x != '', possibility + consecuent))
     if found == 0:
@@ -108,15 +152,38 @@ def integrate_token_to_text_matching(knowledge_base, text, tokens):

     knowledge_base.add_tokenization(tuple(elements))

-def pick_one_tokenization(options):
+def pick_one_tokenization(options, knowledge_base):
     '''
     Heuristic function to pick the most probable tokenization.

     Just pick the one with more results.
     '''
-    return sorted(options,
-                  key=lambda tokenization: len(tokenization),
-                  reverse=True)[0]
+    with session().log("Picking among: {} options\n{}".format(len(options), '\n'.join(map(str, options)))):
+        return pick_by_score(options,
+                             [
+                                 # First by number of splits
+                                 lambda tokenization: len(tokenization),
+
+                                 # Among them, by number of splits without structuring elements
+                                 lambda tokenization: sum(map(
+                                     lambda split: -sum(map(
+                                         lambda se: se in split, knowledge_base.structural_elements
+                                     )), tokenization))
+                             ])
+
+
+def pick_by_score(options, heuristics):
+    for heuristic in heuristics:
+        assert(len(options) > 0)
+        options = list(map(lambda opt: (heuristic(opt), opt), options))
+        sorted_options = sorted(options, key=lambda x: x[0], reverse=True)
+
+        heuristic_cutoff = sorted_options[0][0]
+        pass_heuristic = [opt for (score, opt) in sorted_options if score <= heuristic_cutoff]
+        options = pass_heuristic
+
+        session().log("{} finalists: \n{}".format(len(options), '\n'.join(map(str, options))))
+    return options[0]
+
+
 def make_template(knowledge_base, tokens, parsed):
     matcher = list(tokens)
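
pick_one_tokenization now delegates to a cascade of heuristics: prefer tokenizations with more splits, then break ties by penalizing splits that contain structural elements. A simplified sketch of that cascade on assumed toy data; it keeps only the top scorers at each stage, a stricter filter than the committed `score <= heuristic_cutoff` comparison, so treat it as an illustration of the intent rather than a copy of the code:

def pick_by_score(options, heuristics):
    # Apply each heuristic in turn, keeping only the best-scoring options.
    for heuristic in heuristics:
        best = max(heuristic(opt) for opt in options)
        options = [opt for opt in options if heuristic(opt) == best]
    return options[0]

structural_elements = [' is ']   # assumed toy value
options = [
    ['icecream is red'],               # 1 split
    ['icecream', ' is ', 'red'],       # 3 splits
    ['ice', 'cream', ' is ', 'red'],   # 4 splits
]

chosen = pick_by_score(options, [
    lambda t: len(t),                                    # more splits first
    lambda t: -sum(se in split                           # then fewer structural splits
                   for split in t for se in structural_elements),
])
print(chosen)   # -> ['ice', 'cream', ' is ', 'red']
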
@@ -188,7 +255,7 @@ def integrate_language(knowledge_base, example):
     parsed = example["parsed"]

     resolved_parsed = copy.deepcopy(parsed)
-    tokens = list(pick_one_tokenization(to_tokens(knowledge_base, text)))
+    tokens = list(pick_one_tokenization(to_tokens(knowledge_base, text), knowledge_base))

     while True:
         session().annotate("P: {}".format(resolved_parsed))
@@ -11,9 +11,9 @@ logging.getLogger().setLevel(logging.ERROR)

 tests = (
     ("tokenization", tokenization),
-    ("basic", basic),
-    ("gac 100", gac_100),
-    ("gac+", gac_extension),
+    # ("basic", basic),
+    # ("gac 100", gac_100),
+    # ("gac+", gac_extension),
 )


@@ -65,6 +65,8 @@ def main():
             with session().log(example['text']):
                 tokens = list(knowledge.tokenize(example['text']))

+                print(tokens)
+                print(example['tokens'])
                 assert example['tokens'] == tokens

         else: