Separate tokenization module.
This commit is contained in:
parent 1306306723
commit 8b67b96d2f
@@ -5,6 +5,7 @@ from .session.org_mode import global_session as session
 
 from .atoms import Atom
 from . import parsing
+from . import tokenization
 from . import knowledge_evaluation
 from .modifiable_property import is_modifiable_property
 import random
@@ -63,7 +64,7 @@ class KnowledgeBase(object):
     def train_tokenizer(self, example):
         with session().log('Training tokenizer'):
             session().annotate("Example: {}".format(example))
-            tokens = parsing.integrate_tokenization(self, example)
+            tokens = tokenization.integrate_tokenization(self, example)
 
             # Integrate knowledge of concept
             for token in tokens:
@@ -115,11 +116,11 @@ class KnowledgeBase(object):
     def tokenize(self, row, return_one=True):
         row = row.lower()
        with session().log("Tokenize: {}".format(row)):
-            options = list(parsing.to_tokens(self, row))
+            options = list(tokenization.to_tokens(self, row))
             session().log("Results:\n{}".format('\n'.join(map(str, options))))
 
             if return_one:
-                chosen = parsing.pick_one_tokenization(options, self)
+                chosen = tokenization.pick_one_tokenization(options, self)
                 session().log("Chosen: “{}”".format(chosen))
                 self.train_tokenizer({'text': row, 'tokens': chosen})
                 return chosen
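
The training path changed above (train_tokenizer now delegating to tokenization.integrate_tokenization) amounts to splitting the example text on each known token and learning precedent/consequent pairs from the adjacent pieces. Below is a minimal standalone sketch of that idea, independent of the package's session logging and KnowledgeBase; split_on_tokens and the ('token', ...) marker are illustrative stand-ins, not part of this commit or its API.

# Standalone sketch (not the package's code) of the idea behind
# tokenization.integrate_token_to_text_matching: split the raw text on each
# known token, keep the leftovers, and pair up adjacent pieces so a knowledge
# base could learn "precedent -> consequent" transitions from them.
def split_on_tokens(text, tokens):
    pieces = [text]
    for token in tokens:
        for i, piece in enumerate(pieces):
            if not isinstance(piece, str):
                continue  # already-consumed token marker
            if token in piece:
                before, after = piece.split(token, maxsplit=1)
                pieces = pieces[:i] + [before, ('token', token), after] + pieces[i + 1:]
                break
        else:
            raise ValueError('Token not found: {}'.format(token))
    return [p for p in pieces if p != '']

if __name__ == '__main__':
    pieces = split_on_tokens('the cat sat', ['cat', 'sat'])
    pairs = list(zip(pieces, pieces[1:]))  # candidate precedent/consequent pairs
    print(pieces)  # ['the ', ('token', 'cat'), ' ', ('token', 'sat')]
    print(pairs)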

@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 
 from . import knowledge_evaluation
+from . import tokenization
 
 from . import depth_meter
 from .session.org_mode import global_session as session
@@ -13,190 +14,6 @@ from .modifiable_property import ModifiableProperty
 from . import parameters
 from .atoms import Atom, a, is_atom
 
-def lookahead_for_tokens_or_strucutral_elements(knowledge_base, remaining):
-    for se in knowledge_base.structural_elements:
-        found_position = remaining.find(se)
-        found = found_position >= 0
-        session().annotate('Looking for structure with “{}”, found? {}'.format(se, found))
-        if found:
-            return [
-                (remaining[:found_position], se, remaining[found_position + len(se):])
-            ]
-
-    for token in knowledge_base.knowledge.keys():
-        found_position = remaining.find(token)
-        found = found_position >= 0
-        session().annotate('Looking for token “{}”, found? {}'.format(token, found))
-        if found:
-            return [
-                (remaining[:found_position], token, remaining[found_position + len(token):])
-            ]
-
-    return None
-
-
-
-def to_tokens(knowledge_base, text, precedent=None):
-    if len(text) == 0:
-        session().annotate("No text remaining")
-        yield ['']
-        return
-
-    with session().log("Tokenizing {}".format(text)):
-        for option in knowledge_base.expected_token_after_precedent(precedent):
-            with session().log("Next: “{}”".format(option)):
-                with session().log("Matching “{}” on “{}”".format(option, text)):
-                    for token_match in tokenization_match(option, text, knowledge_base):
-                        if token_match is None:
-                            session().annotate("No match")
-
-                        match, remaining = token_match
-                        if len(remaining) == len(text):
-                            raise Exception('No text consumed in match')
-
-                        session().annotate('Match: “{}”'.format(match))
-                        with session().log('Remaining “{}”'.format(remaining)):
-                            for sublevel in to_tokens(knowledge_base, remaining, match):
-                                candidate = list(filter(lambda x: x != '', [match] + sublevel))
-                                session().annotate('Yielding candidate “{}”'.format(candidate))
-                                yield candidate
-
-
-def tokenization_match(element, text, knowledge_base):
-    # Constant/structural string matching
-    if isinstance(element, str):
-        if text.find(element) == 0:
-            # This match comes from a structuring element
-            # It doesn't appear on the tokenization
-            # So we should return it as an empty string
-            yield ('', text[len(element):])
-            return
-        else:
-            # No match found
-            return
-
-    elif is_atom(element, 'token'):
-        yield from match_single_token(text, knowledge_base)
-        return
-    raise NotImplementedError()
-
-
-def match_single_token(text, knowledge_base):
-    found_token = False
-    for token in knowledge_base.knowledge.keys():
-        if text.find(token) == 0:
-            yield token, text[len(token):]
-            found_token = True
-
-    if found_token:
-        return
-
-    session().annotate('No token found at the start of ”{}”'.format(text))
-    session().annotate('using structural elements to infer it')
-    # TODO: review this when multiple structural elements are available
-    for se in knowledge_base.structural_elements:
-        session().annotate('Looking for se “{}” in “{}”'.format(se, text))
-        position = text.find(se, 0)
-        found = position > 0  # 0 is not considered a valid position for this kind of split
-        if found:
-            session().annotate('Found ”{}”, inferring “{}”'.format(se, text[:position]))
-            yield text[:position], text[position:]
-
-    session().annotate('No structural element or token found, inferring only token remaining')
-    yield text, ''
-
-    # Using other tokens for cutoff
-    for token in knowledge_base.knowledge.keys():
-        session().annotate('Looking for token “{}” in “{}”'.format(token, text))
-        position = text.find(token)
-        found = position >= 0
-        if found:
-            session().annotate('Found ”{}”, in position ”{}”'.format(token, position))
-            yield text[:position], text[position:]
-
-
-def integrate_tokenization(knowledge_base, example):
-    text = example['text']
-    tokens = example['tokens']
-    meaning = example.get('meaning')
-
-    return integrate_token_to_text_matching(knowledge_base, text, tokens)
-
-
-def integrate_token_to_text_matching(knowledge_base, text, tokens):
-    texts = [text]
-
-    # Convert to tokens
-    for token_id, token in enumerate(tokens):
-        # Look for token in texts
-        for i, text in enumerate(texts):
-            if isinstance(text, int):
-                continue
-
-            if token in text:
-                before, after = text.split(token, maxsplit=1)
-                texts = (texts[:i] + [before]
-                         + [a('token')]
-                         + [after] + texts[i + 1:])
-                break
-        else:
-            raise Exception('Token not found')
-
-    # Remove leftovers from splits
-    texts = list(filter(lambda x: x != '', texts))
-    session().log("Tokenized as {} over {}".format(texts, tokens))
-
-    for i, element in enumerate(texts[:-1]):
-        learn_token_pair(element, texts[i + 1], knowledge_base)
-
-    return tokens
-
-def learn_token_pair(precedent, consequent, knowledge_base):
-    knowledge_base.add_token_pair(precedent, consequent)
-
-
-def pick_one_tokenization(options, knowledge_base):
-    '''
-    Heuristic function to pick the most probable tokenization.
-
-    Just pick the one with more results.
-    '''
-    options = list(options)
-    with session().log("Picking among: {} options".format(len(options))):
-        session().log("Options: \n{}".format('\n'.join(map(str, options))))
-        return pick_by_score(options,
-                             [
-                                 # By number of splits without structuring elements
-                                 lambda tokenization: sum(map(
-                                     lambda split: sum(map(
-                                         lambda se: se in split, knowledge_base.structural_elements
-                                     )), tokenization)),
-
-                                 # By number of unknown tokens
-                                 lambda tokenization: len(list(filter(lambda token:
-                                     (token not in knowledge_base.knowledge.keys()) and
-                                     (token not in knowledge_base.structural_elements),
-                                     tokenization))),
-
-                                 # By number of splits
-                                 lambda tokenization: -len(tokenization),
-                             ])
-
-def pick_by_score(options, heuristics):
-    for heuristic in heuristics:
-        assert(len(options) > 0)
-        options = list(map(lambda opt: (heuristic(opt), opt), options))
-        sorted_options = sorted(options, key=lambda x: x[0], reverse=False)
-
-        heuristic_cutoff = sorted_options[0][0]
-        session().annotate(sorted_options)
-        pass_heuristic = [opt for (score, opt) in sorted_options if score <= heuristic_cutoff]
-        options = pass_heuristic
-
-    session().log("{} finalists: \n{}".format(len(options), '\n'.join(map(str, options))))
-    return options[0]
-
-
 def make_template(knowledge_base, tokens, parsed):
     matcher = list(tokens)
     template = list(parsed)
@@ -267,7 +84,7 @@ def integrate_language(knowledge_base, example):
     parsed = example["parsed"]
 
     resolved_parsed = copy.deepcopy(parsed)
-    tokens = list(pick_one_tokenization(to_tokens(knowledge_base, text), knowledge_base))
+    tokens = list(tokenization.pick_one_tokenization(tokenization.to_tokens(knowledge_base, text), knowledge_base))
 
     while True:
         session().annotate("P: {}".format(resolved_parsed))

naive-nlu/tree_nlu/tokenization.py (new file, 186 lines)
@@ -0,0 +1,186 @@
+from .session.org_mode import global_session as session
+from .atoms import Atom, a, is_atom
+
+def lookahead_for_tokens_or_strucutral_elements(knowledge_base, remaining):
+    for se in knowledge_base.structural_elements:
+        found_position = remaining.find(se)
+        found = found_position >= 0
+        session().annotate('Looking for structure with “{}”, found? {}'.format(se, found))
+        if found:
+            return [
+                (remaining[:found_position], se, remaining[found_position + len(se):])
+            ]
+
+    for token in knowledge_base.knowledge.keys():
+        found_position = remaining.find(token)
+        found = found_position >= 0
+        session().annotate('Looking for token “{}”, found? {}'.format(token, found))
+        if found:
+            return [
+                (remaining[:found_position], token, remaining[found_position + len(token):])
+            ]
+
+    return None
+
+
+
+def to_tokens(knowledge_base, text, precedent=None):
+    if len(text) == 0:
+        session().annotate("No text remaining")
+        yield ['']
+        return
+
+    with session().log("Tokenizing {}".format(text)):
+        for option in knowledge_base.expected_token_after_precedent(precedent):
+            with session().log("Next: “{}”".format(option)):
+                with session().log("Matching “{}” on “{}”".format(option, text)):
+                    for token_match in tokenization_match(option, text, knowledge_base):
+                        if token_match is None:
+                            session().annotate("No match")
+
+                        match, remaining = token_match
+                        if len(remaining) == len(text):
+                            raise Exception('No text consumed in match')
+
+                        session().annotate('Match: “{}”'.format(match))
+                        with session().log('Remaining “{}”'.format(remaining)):
+                            for sublevel in to_tokens(knowledge_base, remaining, match):
+                                candidate = list(filter(lambda x: x != '', [match] + sublevel))
+                                session().annotate('Yielding candidate “{}”'.format(candidate))
+                                yield candidate
+
+
+def tokenization_match(element, text, knowledge_base):
+    # Constant/structural string matching
+    if isinstance(element, str):
+        if text.find(element) == 0:
+            # This match comes from a structuring element
+            # It doesn't appear on the tokenization
+            # So we should return it as an empty string
+            yield ('', text[len(element):])
+            return
+        else:
+            # No match found
+            return
+
+    elif is_atom(element, 'token'):
+        yield from match_single_token(text, knowledge_base)
+        return
+    raise NotImplementedError()
+
+
+def match_single_token(text, knowledge_base):
+    found_token = False
+    for token in knowledge_base.knowledge.keys():
+        if text.find(token) == 0:
+            yield token, text[len(token):]
+            found_token = True
+
+    if found_token:
+        return
+
+    session().annotate('No token found at the start of ”{}”'.format(text))
+    session().annotate('using structural elements to infer it')
+    # TODO: review this when multiple structural elements are available
+    for se in knowledge_base.structural_elements:
+        session().annotate('Looking for se “{}” in “{}”'.format(se, text))
+        position = text.find(se, 0)
+        found = position > 0  # 0 is not considered a valid position for this kind of split
+        if found:
+            session().annotate('Found ”{}”, inferring “{}”'.format(se, text[:position]))
+            yield text[:position], text[position:]
+
+    session().annotate('No structural element or token found, inferring only token remaining')
+    yield text, ''
+
+    # Using other tokens for cutoff
+    for token in knowledge_base.knowledge.keys():
+        session().annotate('Looking for token “{}” in “{}”'.format(token, text))
+        position = text.find(token)
+        found = position >= 0
+        if found:
+            session().annotate('Found ”{}”, in position ”{}”'.format(token, position))
+            yield text[:position], text[position:]
+
+
+def integrate_tokenization(knowledge_base, example):
+    text = example['text']
+    tokens = example['tokens']
+    meaning = example.get('meaning')
+
+    return integrate_token_to_text_matching(knowledge_base, text, tokens)
+
+
+def integrate_token_to_text_matching(knowledge_base, text, tokens):
+    texts = [text]
+
+    # Convert to tokens
+    for token_id, token in enumerate(tokens):
+        # Look for token in texts
+        for i, text in enumerate(texts):
+            if isinstance(text, int):
+                continue
+
+            if token in text:
+                before, after = text.split(token, maxsplit=1)
+                texts = (texts[:i] + [before]
+                         + [a('token')]
+                         + [after] + texts[i + 1:])
+                break
+        else:
+            raise Exception('Token not found')
+
+    # Remove leftovers from splits
+    texts = list(filter(lambda x: x != '', texts))
+    session().log("Tokenized as {} over {}".format(texts, tokens))
+
+    for i, element in enumerate(texts[:-1]):
+        learn_token_pair(element, texts[i + 1], knowledge_base)
+
+    return tokens
+
+def learn_token_pair(precedent, consequent, knowledge_base):
+    knowledge_base.add_token_pair(precedent, consequent)
+
+
+def pick_one_tokenization(options, knowledge_base):
+    '''
+    Heuristic function to pick the most probable tokenization.
+
+    Just pick the one with more results.
+    '''
+    options = list(options)
+    with session().log("Picking among: {} options".format(len(options))):
+        session().log("Options: \n{}".format('\n'.join(map(str, options))))
+        return pick_by_score(options,
+                             [
+                                 # By number of splits without structuring elements
+                                 lambda tokenization: sum(map(
+                                     lambda split: sum(map(
+                                         lambda se: se in split, knowledge_base.structural_elements
+                                     )), tokenization)),
+
+                                 # By number of unknown tokens
+                                 lambda tokenization: len(list(filter(lambda token:
+                                     (token not in knowledge_base.knowledge.keys()) and
+                                     (token not in knowledge_base.structural_elements),
+                                     tokenization))),
+
+                                 # By number of splits
+                                 lambda tokenization: -len(tokenization),
+                             ])
+
+def pick_by_score(options, heuristics):
+    for heuristic in heuristics:
+        assert(len(options) > 0)
+        options = list(map(lambda opt: (heuristic(opt), opt), options))
+        sorted_options = sorted(options, key=lambda x: x[0], reverse=False)
+
+        heuristic_cutoff = sorted_options[0][0]
+        session().annotate(sorted_options)
+        pass_heuristic = [opt for (score, opt) in sorted_options if score <= heuristic_cutoff]
+        options = pass_heuristic
+
+    session().log("{} finalists: \n{}".format(len(options), '\n'.join(map(str, options))))
+    return options[0]
+