Separate tokenization module.

kenkeiras 2018-04-15 22:15:28 +02:00
parent 1306306723
commit 8b67b96d2f
3 changed files with 192 additions and 188 deletions
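In short: the tokenization helpers that previously lived in parsing (lookahead_for_tokens_or_strucutral_elements, to_tokens, tokenization_match, match_single_token, integrate_tokenization, learn_token_pair, pick_one_tokenization and pick_by_score) move into a new tokenization module, and their call sites in KnowledgeBase.train_tokenizer, KnowledgeBase.tokenize and integrate_language switch from parsing.* to tokenization.*.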

View File

@@ -5,6 +5,7 @@ from .session.org_mode import global_session as session
 from .atoms import Atom
 from . import parsing
+from . import tokenization
 from . import knowledge_evaluation
 from .modifiable_property import is_modifiable_property
 import random
 
@@ -63,7 +64,7 @@ class KnowledgeBase(object):
     def train_tokenizer(self, example):
         with session().log('Training tokenizer'):
             session().annotate("Example: {}".format(example))
-            tokens = parsing.integrate_tokenization(self, example)
+            tokens = tokenization.integrate_tokenization(self, example)
 
             # Integrate knowledge of concept
             for token in tokens:
@@ -115,11 +116,11 @@ class KnowledgeBase(object):
     def tokenize(self, row, return_one=True):
         row = row.lower()
         with session().log("Tokenize: {}".format(row)):
-            options = list(parsing.to_tokens(self, row))
+            options = list(tokenization.to_tokens(self, row))
             session().log("Results:\n{}".format('\n'.join(map(str, options))))
 
             if return_one:
-                chosen = parsing.pick_one_tokenization(options, self)
+                chosen = tokenization.pick_one_tokenization(options, self)
                 session().log("Chosen: “{}”".format(chosen))
                 self.train_tokenizer({'text': row, 'tokens': chosen})
                 return chosen

View File

@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 
 from . import knowledge_evaluation
+from . import tokenization
 
 from . import depth_meter
 from .session.org_mode import global_session as session
@@ -13,190 +14,6 @@ from .modifiable_property import ModifiableProperty
 from . import parameters
 from .atoms import Atom, a, is_atom
-
-
-def lookahead_for_tokens_or_strucutral_elements(knowledge_base, remaining):
-    for se in knowledge_base.structural_elements:
-        found_position = remaining.find(se)
-        found = found_position >= 0
-        session().annotate('Looking for structure with “{}”, found? {}'.format(se, found))
-        if found:
-            return [
-                (remaining[:found_position], se, remaining[found_position + len(se):])
-            ]
-
-    for token in knowledge_base.knowledge.keys():
-        found_position = remaining.find(token)
-        found = found_position >= 0
-        session().annotate('Looking for token “{}”, found? {}'.format(token, found))
-        if found:
-            return [
-                (remaining[:found_position], token, remaining[found_position + len(token):])
-            ]
-
-    return None
-
-
-def to_tokens(knowledge_base, text, precedent=None):
-    if len(text) == 0:
-        session().annotate("No text remaining")
-        yield ['']
-        return
-
-    with session().log("Tokenizing {}".format(text)):
-        for option in knowledge_base.expected_token_after_precedent(precedent):
-            with session().log("Next: “{}”".format(option)):
-                with session().log("Matching “{}” on “{}”".format(option, text)):
-                    for token_match in tokenization_match(option, text, knowledge_base):
-                        if token_match is None:
-                            session().annotate("No match")
-
-                        match, remaining = token_match
-                        if len(remaining) == len(text):
-                            raise Exception('No text consumed in match')
-
-                        session().annotate('Match: “{}”'.format(match))
-                        with session().log('Remaining “{}”'.format(remaining)):
-                            for sublevel in to_tokens(knowledge_base, remaining, match):
-                                candidate = list(filter(lambda x: x != '', [match] + sublevel))
-                                session().annotate('Yielding candidate “{}”'.format(candidate))
-                                yield candidate
-
-
-def tokenization_match(element, text, knowledge_base):
-    # Constant/structural string matching
-    if isinstance(element, str):
-        if text.find(element) == 0:
-            # This match comes from a structuring element
-            # It doesn't appear on the tokenization
-            # So we should return it as an empty string
-            yield ('', text[len(element):])
-            return
-        else:
-            # No match found
-            return
-
-    elif is_atom(element, 'token'):
-        yield from match_single_token(text, knowledge_base)
-        return
-
-    raise NotImplementedError()
-
-
-def match_single_token(text, knowledge_base):
-    found_token = False
-    for token in knowledge_base.knowledge.keys():
-        if text.find(token) == 0:
-            yield token, text[len(token):]
-            found_token = True
-
-    if found_token:
-        return
-
-    session().annotate('No token found at the start of “{}”'.format(text))
-    session().annotate('using structural elements to infer it')
-    # TODO: review this when multiple structural elements are available
-    for se in knowledge_base.structural_elements:
-        session().annotate('Looking for se “{}” in “{}”'.format(se, text))
-        position = text.find(se, 0)
-        found = position > 0  # 0 is not considered a valid position for this kind of split
-        if found:
-            session().annotate('Found “{}”, inferring “{}”'.format(se, text[:position]))
-            yield text[:position], text[position:]
-
-    session().annotate('No structural element or token found, inferring only token remaining')
-    yield text, ''
-
-    # Using other tokens for cutoff
-    for token in knowledge_base.knowledge.keys():
-        session().annotate('Looking for token “{}” in “{}”'.format(token, text))
-        position = text.find(token)
-        found = position >= 0
-        if found:
-            session().annotate('Found “{}”, in position “{}”'.format(token, position))
-            yield text[:position], text[position:]
-
-
-def integrate_tokenization(knowledge_base, example):
-    text = example['text']
-    tokens = example['tokens']
-    meaning = example.get('meaning')
-
-    return integrate_token_to_text_matching(knowledge_base, text, tokens)
-
-
-def integrate_token_to_text_matching(knowledge_base, text, tokens):
-    texts = [text]
-
-    # Convert to tokens
-    for token_id, token in enumerate(tokens):
-        # Look for token in texts
-        for i, text in enumerate(texts):
-            if isinstance(text, int):
-                continue
-
-            if token in text:
-                before, after = text.split(token, maxsplit=1)
-                texts = (texts[:i] + [before]
-                         + [a('token')]
-                         + [after] + texts[i + 1:])
-                break
-        else:
-            raise Exception('Token not found')
-
-    # Remove leftovers from splits
-    texts = list(filter(lambda x: x != '', texts))
-
-    session().log("Tokenized as {} over {}".format(texts, tokens))
-    for i, element in enumerate(texts[:-1]):
-        learn_token_pair(element, texts[i + 1], knowledge_base)
-
-    return tokens
-
-
-def learn_token_pair(precedent, consequent, knowledge_base):
-    knowledge_base.add_token_pair(precedent, consequent)
-
-
-def pick_one_tokenization(options, knowledge_base):
-    '''
-    Heuristic function to pick the most probable tokenization.
-
-    Just pick the one with more results.
-    '''
-    options = list(options)
-    with session().log("Picking among: {} options".format(len(options))):
-        session().log("Options: \n{}".format('\n'.join(map(str, options))))
-        return pick_by_score(options,
-                             [
-                                 # By number of splits without structuring elements
-                                 lambda tokenization: sum(map(
-                                     lambda split: sum(map(
-                                         lambda se: se in split, knowledge_base.structural_elements
-                                     )), tokenization)),
-
-                                 # By number of unknown tokens
-                                 lambda tokenization: len(list(filter(lambda token:
-                                                                      (token not in knowledge_base.knowledge.keys()) and
-                                                                      (token not in knowledge_base.structural_elements),
-                                                                      tokenization))),
-
-                                 # By number of splits
-                                 lambda tokenization: -len(tokenization),
-                             ])
-
-
-def pick_by_score(options, heuristics):
-    for heuristic in heuristics:
-        assert(len(options) > 0)
-        options = list(map(lambda opt: (heuristic(opt), opt), options))
-        sorted_options = sorted(options, key=lambda x: x[0], reverse=False)
-        heuristic_cutoff = sorted_options[0][0]
-        session().annotate(sorted_options)
-        pass_heuristic = [opt for (score, opt) in sorted_options if score <= heuristic_cutoff]
-        options = pass_heuristic
-
-        session().log("{} finalists: \n{}".format(len(options), '\n'.join(map(str, options))))
-
-    return options[0]
 
 def make_template(knowledge_base, tokens, parsed):
     matcher = list(tokens)
     template = list(parsed)
@@ -267,7 +84,7 @@ def integrate_language(knowledge_base, example):
     parsed = example["parsed"]
 
     resolved_parsed = copy.deepcopy(parsed)
-    tokens = list(pick_one_tokenization(to_tokens(knowledge_base, text), knowledge_base))
+    tokens = list(tokenization.pick_one_tokenization(tokenization.to_tokens(knowledge_base, text), knowledge_base))
 
     while True:
         session().annotate("P: {}".format(resolved_parsed))

View File

@@ -0,0 +1,186 @@
+from .session.org_mode import global_session as session
+from .atoms import Atom, a, is_atom
+
+
+def lookahead_for_tokens_or_strucutral_elements(knowledge_base, remaining):
+    for se in knowledge_base.structural_elements:
+        found_position = remaining.find(se)
+        found = found_position >= 0
+        session().annotate('Looking for structure with “{}”, found? {}'.format(se, found))
+        if found:
+            return [
+                (remaining[:found_position], se, remaining[found_position + len(se):])
+            ]
+
+    for token in knowledge_base.knowledge.keys():
+        found_position = remaining.find(token)
+        found = found_position >= 0
+        session().annotate('Looking for token “{}”, found? {}'.format(token, found))
+        if found:
+            return [
+                (remaining[:found_position], token, remaining[found_position + len(token):])
+            ]
+
+    return None
+
+
+def to_tokens(knowledge_base, text, precedent=None):
+    if len(text) == 0:
+        session().annotate("No text remaining")
+        yield ['']
+        return
+
+    with session().log("Tokenizing {}".format(text)):
+        for option in knowledge_base.expected_token_after_precedent(precedent):
+            with session().log("Next: “{}”".format(option)):
+                with session().log("Matching “{}” on “{}”".format(option, text)):
+                    for token_match in tokenization_match(option, text, knowledge_base):
+                        if token_match is None:
+                            session().annotate("No match")
+
+                        match, remaining = token_match
+                        if len(remaining) == len(text):
+                            raise Exception('No text consumed in match')
+
+                        session().annotate('Match: “{}”'.format(match))
+                        with session().log('Remaining “{}”'.format(remaining)):
+                            for sublevel in to_tokens(knowledge_base, remaining, match):
+                                candidate = list(filter(lambda x: x != '', [match] + sublevel))
+                                session().annotate('Yielding candidate “{}”'.format(candidate))
+                                yield candidate
+
+
+def tokenization_match(element, text, knowledge_base):
+    # Constant/structural string matching
+    if isinstance(element, str):
+        if text.find(element) == 0:
+            # This match comes from a structuring element
+            # It doesn't appear on the tokenization
+            # So we should return it as an empty string
+            yield ('', text[len(element):])
+            return
+        else:
+            # No match found
+            return
+
+    elif is_atom(element, 'token'):
+        yield from match_single_token(text, knowledge_base)
+        return
+
+    raise NotImplementedError()
+
+
+def match_single_token(text, knowledge_base):
+    found_token = False
+    for token in knowledge_base.knowledge.keys():
+        if text.find(token) == 0:
+            yield token, text[len(token):]
+            found_token = True
+
+    if found_token:
+        return
+
+    session().annotate('No token found at the start of “{}”'.format(text))
+    session().annotate('using structural elements to infer it')
+    # TODO: review this when multiple structural elements are available
+    for se in knowledge_base.structural_elements:
+        session().annotate('Looking for se “{}” in “{}”'.format(se, text))
+        position = text.find(se, 0)
+        found = position > 0  # 0 is not considered a valid position for this kind of split
+        if found:
+            session().annotate('Found “{}”, inferring “{}”'.format(se, text[:position]))
+            yield text[:position], text[position:]
+
+    session().annotate('No structural element or token found, inferring only token remaining')
+    yield text, ''
+
+    # Using other tokens for cutoff
+    for token in knowledge_base.knowledge.keys():
+        session().annotate('Looking for token “{}” in “{}”'.format(token, text))
+        position = text.find(token)
+        found = position >= 0
+        if found:
+            session().annotate('Found “{}”, in position “{}”'.format(token, position))
+            yield text[:position], text[position:]
+
+
+def integrate_tokenization(knowledge_base, example):
+    text = example['text']
+    tokens = example['tokens']
+    meaning = example.get('meaning')
+
+    return integrate_token_to_text_matching(knowledge_base, text, tokens)
+
+
+def integrate_token_to_text_matching(knowledge_base, text, tokens):
+    texts = [text]
+
+    # Convert to tokens
+    for token_id, token in enumerate(tokens):
+        # Look for token in texts
+        for i, text in enumerate(texts):
+            if isinstance(text, int):
+                continue
+
+            if token in text:
+                before, after = text.split(token, maxsplit=1)
+                texts = (texts[:i] + [before]
+                         + [a('token')]
+                         + [after] + texts[i + 1:])
+                break
+        else:
+            raise Exception('Token not found')
+
+    # Remove leftovers from splits
+    texts = list(filter(lambda x: x != '', texts))
+
+    session().log("Tokenized as {} over {}".format(texts, tokens))
+    for i, element in enumerate(texts[:-1]):
+        learn_token_pair(element, texts[i + 1], knowledge_base)
+
+    return tokens
+
+
+def learn_token_pair(precedent, consequent, knowledge_base):
+    knowledge_base.add_token_pair(precedent, consequent)
+
+
+def pick_one_tokenization(options, knowledge_base):
+    '''
+    Heuristic function to pick the most probable tokenization.
+
+    Just pick the one with more results.
+    '''
+    options = list(options)
+    with session().log("Picking among: {} options".format(len(options))):
+        session().log("Options: \n{}".format('\n'.join(map(str, options))))
+        return pick_by_score(options,
+                             [
+                                 # By number of splits without structuring elements
+                                 lambda tokenization: sum(map(
+                                     lambda split: sum(map(
+                                         lambda se: se in split, knowledge_base.structural_elements
+                                     )), tokenization)),
+
+                                 # By number of unknown tokens
+                                 lambda tokenization: len(list(filter(lambda token:
+                                                                      (token not in knowledge_base.knowledge.keys()) and
+                                                                      (token not in knowledge_base.structural_elements),
+                                                                      tokenization))),
+
+                                 # By number of splits
+                                 lambda tokenization: -len(tokenization),
+                             ])
+
+
+def pick_by_score(options, heuristics):
+    for heuristic in heuristics:
+        assert(len(options) > 0)
+        options = list(map(lambda opt: (heuristic(opt), opt), options))
+        sorted_options = sorted(options, key=lambda x: x[0], reverse=False)
+        heuristic_cutoff = sorted_options[0][0]
+        session().annotate(sorted_options)
+        pass_heuristic = [opt for (score, opt) in sorted_options if score <= heuristic_cutoff]
+        options = pass_heuristic
+
+        session().log("{} finalists: \n{}".format(len(options), '\n'.join(map(str, options))))
+
+    return options[0]
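
The selection step at the end of the new module is the part worth spelling out: pick_one_tokenization scores every candidate tokenization with an ordered list of heuristics, and pick_by_score applies them one at a time, keeping only the candidates tied for the lowest score before handing the survivors to the next heuristic. Below is a minimal, self-contained sketch of that cascade; it is illustrative only (the helper name, the candidate lists and the known-token set are made up, the structural-element heuristic is omitted, and the real functions also log through the org-mode session).

def pick_by_score_sketch(options, heuristics):
    # Keep only the options tied for the lowest score under each heuristic;
    # later heuristics only break the remaining ties.
    for heuristic in heuristics:
        scored = [(heuristic(option), option) for option in options]
        cutoff = min(score for score, _ in scored)
        options = [option for score, option in scored if score <= cutoff]
    return options[0]

# Hypothetical candidate tokenizations for the text "turn on lights":
known_tokens = {'turn', 'on', 'lights'}
candidates = [
    ['turn on lights'],        # one unknown blob
    ['turn', 'on lights'],     # one known token, one unknown
    ['turn', 'on', 'lights'],  # every token known
]

best = pick_by_score_sketch(candidates, [
    # Fewer unknown tokens first (second heuristic in pick_one_tokenization)...
    lambda tokens: sum(1 for t in tokens if t not in known_tokens),
    # ...then prefer more splits (negated length, as above).
    lambda tokens: -len(tokens),
])
print(best)  # ['turn', 'on', 'lights']

Because the comparison keeps everything at or below the cutoff, ties survive to the next heuristic rather than being broken arbitrarily; only after the last heuristic does the first remaining option win.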