Move to a chaining model for tokenization.
This model also explores more tokenization possibilities. With this change, the tokenization tests pass.
This commit is contained in:
parent 998a183fd2
commit 79034f85a9
@@ -8,6 +8,15 @@ from collections import namedtuple
 Atom = namedtuple('Atom', field_names='name')
 
+def is_atom(element, name=None):
+    '''Check if an element is an atom with a specific name.'''
+    if not isinstance(element, Atom):
+        return False
+
+    if name is None:
+        return True
+
+    return element.name == name
 
 def a(name):
     '''Build an atom with a given name.'''
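For context, a minimal sketch of how the new `is_atom` helper is meant to be combined with the existing `a()` constructor. The stand-ins below mirror the diff (the body of `a()` is assumed to simply wrap `Atom`, since it is not shown), and the sample values are invented:

```python
from collections import namedtuple

# Stand-ins mirroring the atoms module in the diff above.
Atom = namedtuple('Atom', field_names='name')

def a(name):
    '''Build an atom with a given name (body assumed; not shown in the diff).'''
    return Atom(name)

def is_atom(element, name=None):
    '''Check if an element is an atom with a specific name.'''
    if not isinstance(element, Atom):
        return False
    if name is None:
        return True
    return element.name == name

print(is_atom(a('token')))           # True: any Atom matches when no name is given
print(is_atom(a('token'), 'token'))  # True: the name matches as well
print(is_atom('hello', 'token'))     # False: plain strings are structural text, not atoms
```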
@@ -7,25 +7,69 @@ from .atoms import Atom
 from . import parsing
 from . import knowledge_evaluation
 from .modifiable_property import is_modifiable_property
+import random
 
 
 def diff_knowledge(before, after):
     import jsondiff
     return jsondiff.diff(before, after)
 
 
+def randomized_weighted_list(elements):
+    # Randomized
+    randomized = list(elements)
+    random.shuffle(randomized)
+
+    # And return only once
+    already_returned = set()
+    for e in randomized:
+        if e in already_returned:
+            continue
+
+        yield e
+        already_returned.add(e)
+
+
 class KnowledgeBase(object):
     def __init__(self, knowledge={}, examples=[], trained=[]):
         self.knowledge = copy.copy(knowledge)
         self.originals = []
         self.examples = copy.copy(examples)
         self.trained = copy.copy(trained)
-        self.tokenization = set()
         self.structural_elements = set()
+        self.token_chains = {}
+        self.tokens = set()
+
+    def add_token_pair(self, precedent, consequent):
+        self.add_token(precedent)
+        self.add_token(consequent)
+
+        if precedent not in self.token_chains:
+            self.token_chains[precedent] = []
+        self.token_chains[precedent].append(consequent)
+
+    def add_token(self, token):
+        self.tokens.add(token)
+        if (not isinstance(token, Atom)) and (token not in self.structural_elements):
+            session().annotate('Found new structural element “{}”'.format(token))
+            self.structural_elements.add(token)
+
+    def expected_token_after_precedent(self, precedent=None):
+        if precedent not in self.token_chains:  # If there's no known precedent, just return all tokens
+            return randomized_weighted_list(self.tokens)
+
+        return randomized_weighted_list(self.token_chains[precedent])
 
     def train_tokenizer(self, example):
-        with session().log('Train'):
-            parsing.integrate_tokenization(self, example)
+        with session().log('Training tokenizer'):
+            session().annotate("Example: {}".format(example))
+            tokens = parsing.integrate_tokenization(self, example)
+
+            # Integrate knowledge of concept
+            for token in tokens:
+                if not token in self.knowledge:
+                    self.knowledge[token] = {}
+
 
     def train(self, examples):
         knowledge_before = copy.deepcopy(self.knowledge)
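To make the chaining model concrete, here is a self-contained sketch (independent of the `session()` logging machinery) of how `add_token_pair` accumulates a successor table and how `expected_token_after_precedent` falls back to the full token set for an unknown precedent. The `TokenChain` class name and the training pairs below are invented for illustration:

```python
import random

def randomized_weighted_list(elements):
    # Shuffle the candidates, then yield each one at most once,
    # mirroring the helper added in the diff above.
    randomized = list(elements)
    random.shuffle(randomized)
    already_returned = set()
    for e in randomized:
        if e not in already_returned:
            yield e
            already_returned.add(e)

class TokenChain:
    '''Stripped-down stand-in for the chaining state kept by KnowledgeBase.'''
    def __init__(self):
        self.token_chains = {}  # precedent -> list of observed consequents
        self.tokens = set()

    def add_token_pair(self, precedent, consequent):
        self.tokens.update((precedent, consequent))
        self.token_chains.setdefault(precedent, []).append(consequent)

    def expected_token_after_precedent(self, precedent=None):
        if precedent not in self.token_chains:
            # Unknown precedent: propose every known token.
            return randomized_weighted_list(self.tokens)
        return randomized_weighted_list(self.token_chains[precedent])

chain = TokenChain()
chain.add_token_pair('icecream', ' is ')  # invented training pairs
chain.add_token_pair(' is ', 'sweet')
print(list(chain.expected_token_after_precedent('icecream')))  # [' is ']
print(sorted(chain.expected_token_after_precedent()))          # all known tokens, deduplicated
```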
@@ -80,14 +124,6 @@ class KnowledgeBase(object):
                 return chosen
         return options
 
-    def add_tokenization(self, tokenization):
-        with session().log('Added tokenization: “{}”'.format(tokenization)):
-            self.tokenization.add(tokenization)
-            for e in tokenization:
-                if (not isinstance(e, Atom)) and (e not in self.structural_elements):
-                    session().annotate('Found new structural element “{}”'.format(e))
-                    self.structural_elements.add(e)
-
     def process(self, row):
         knowledge_before = copy.deepcopy(self.knowledge)
         with session().log("Process: {}".format(row)):
@@ -11,7 +11,7 @@ from functools import reduce
 from typing import List, Dict
 from .modifiable_property import ModifiableProperty
 from . import parameters
-from .atoms import Atom, a
+from .atoms import Atom, a, is_atom
 
 def lookahead_for_tokens_or_strucutral_elements(knowledge_base, remaining):
     for se in knowledge_base.structural_elements:
@@ -36,79 +36,84 @@ def lookahead_for_tokens_or_strucutral_elements(knowledge_base, remaining):
 
 
-def to_tokens(knowledge_base, text, acc=None):
-    # TODO This is an extra-naïve implementation
-    found = 0
-
-    for tokenization in knowledge_base.tokenization:
-        with session().log("Tokenization {}".format(tokenization)):
-            remaining = text
-            possibility = []
-
-            # Apply tokenization to all elmenets
-            for i, token in enumerate(tokenization):
-                with session().log("T “{}” over “{}”".format(token, remaining)):
-                    if token == Atom('token'):
-                        for thing in knowledge_base.knowledge.keys():
-                            session().annotate("Testing with “{}”".format(thing))
-                            if remaining.startswith(thing):
-                                # TODO We should also branch here, probably :\
-                                remaining = remaining[len(thing):]
-                                possibility.append(thing)
-                        else:
-                            if i + 1 >= len(tokenization):  # Last element, lookahead for tokens/structural elements
-                                with session().log("Token not found, looking ahead for splits on “{}”".format(remaining)):
-                                    # If we start with remaining[0:] it's not a real lookahead
-                                    # ... and it can get us trapped on infinite recursion
-                                    splits = lookahead_for_tokens_or_strucutral_elements(knowledge_base, remaining[1:])
-
-                                    if splits is None:
-                                        session().log("No splits found, keeping remaining as token “{}”".format(remaining))
-                                        possibility.append(remaining)
-                                        remaining = ""
-                                    else:
-                                        # Consider we only have one possibility
-                                        assert len(splits) == 1
-
-                                        before_split, pivot, after_split = splits[0]
-                                        before_split = remaining[0] + before_split
-
-                                        session().log("1 split found, cutting on token “{}”, keeping “{}”".format(found, before_split))
-
-                                        possibility.append(before_split)
-                                        remaining = pivot + after_split
-                            else:  # Not las element, use the next one as cutter
-                                # Try with (HYPERSIMPLISTIC!) backtracking
-                                # Cut using the next token we should use more!!!
-                                next_token = tokenization[i + 1]
-                                session().annotate("Trying to cut for next token on “{}”".format(next_token))
-
-                                cutoff = remaining.find(next_token)
-                                if cutoff < 0:
-                                    break
-
-                                possibility.append(remaining[:cutoff])
-                                remaining = remaining[cutoff:]
-                    else:
-                        if remaining.find(token) < 0:  # Not inmediately after!
-                            break
-                        remaining = remaining[len(token):]
-                        session().annotate("OK, remaining: “{}” with {} items".format(remaining, len(tokenization) - (i + 1)))
-            else:
-                # Tokenization applicable
-                found += 1
-                if remaining == '':
-                    session().log("Concluded possibility “{}”".format(possibility))
-                    yield possibility
-                else:
-                    with session().log("Continuing with “{}”".format(remaining)):
-                        for consecuent in to_tokens(knowledge_base, remaining, possibility):
-                            yield list(filter(lambda x: x != '', possibility + consecuent))
-
-    if found == 0:
-        raise Exception('No tokenization found')
+def to_tokens(knowledge_base, text, precedent=None):
+    if len(text) == 0:
+        session().annotate("No text remaining")
+        yield ['']
+        return
+
+    with session().log("Tokenizing {}".format(text)):
+        for option in knowledge_base.expected_token_after_precedent(precedent):
+            with session().log("Next: “{}”".format(option)):
+                with session().log("Matching “{}” on “{}”".format(option, text)):
+                    for token_match in tokenization_match(option, text, knowledge_base):
+                        if token_match is None:
+                            session().annotate("No match")
+
+                        match, remaining = token_match
+                        if len(remaining) == len(text):
+                            raise Exception('No text consumed in match')
+
+                        session().annotate('Match: “{}”'.format(match))
+                        with session().log('Remaining “{}”'.format(remaining)):
+                            for sublevel in to_tokens(knowledge_base, remaining, match):
+                                candidate = list(filter(lambda x: x != '', [match] + sublevel))
+                                session().annotate('Yielding candidate “{}”'.format(candidate))
+                                yield candidate
+
+
+def tokenization_match(element, text, knowledge_base):
+    # Constant/structural string matching
+    if isinstance(element, str):
+        if text.find(element) == 0:
+            # This match comes from a structuring element
+            # It doesn't appear on the tokenization
+            # So we should return it as an empty string
+            yield ('', text[len(element):])
+            return
+        else:
+            # No match found
+            return
+
+    elif is_atom(element, 'token'):
+        yield from match_single_token(text, knowledge_base)
+        return
+
+    raise NotImplementedError()
+
+
+def match_single_token(text, knowledge_base):
+    found_token = False
+    for token in knowledge_base.knowledge.keys():
+        if text.find(token) == 0:
+            yield token, text[len(token):]
+            found_token = True
+
+    if found_token:
+        return
+
+    session().annotate('No token found at the start of ”{}”'.format(text))
+    session().annotate('using structural elements to infer it')
+    # TODO: review this when multiple structural elements are available
+    for se in knowledge_base.structural_elements:
+        session().annotate('Looking for se “{}” in “{}”'.format(se, text))
+        position = text.find(se, 0)
+        found = position > 0  # 0 is not considered a valid position for this kind of split
+        if found:
+            session().annotate('Found ”{}”, inferring “{}”'.format(se, text[:position]))
+            yield text[:position], text[position:]
+
+    session().annotate('No structural element or token found, inferring only token remaining')
+    yield text, ''
+
+    # Using other tokens for cutoff
+    for token in knowledge_base.knowledge.keys():
+        session().annotate('Looking for token “{}” in “{}”'.format(token, text))
+        position = text.find(token)
+        found = position >= 0
+        if found:
+            session().annotate('Found ”{}”, in position ”{}”'.format(token, position))
+            yield text[:position], text[position:]
 
 
 def integrate_tokenization(knowledge_base, example):
     text = example['text']
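The rewritten `to_tokens` is essentially a recursive generator: propose every token that may follow the previous match, consume a matching prefix of the text, and recurse on the remainder, yielding one candidate tokenization per successful chain. A self-contained sketch of that control flow, using a toy candidate function in place of the knowledge base (all names and data here are illustrative, and the structural-element inference done by `match_single_token` is left out):

```python
def candidates_after(precedent, vocabulary):
    # Toy stand-in for expected_token_after_precedent(): ignore the precedent
    # and propose every known token (the real code narrows this via token_chains).
    return sorted(vocabulary)

def to_tokens_sketch(text, vocabulary, precedent=None):
    # Empty text ends a chain, as in the `len(text) == 0` base case above.
    if text == '':
        yield []
        return
    for option in candidates_after(precedent, vocabulary):
        if option and text.startswith(option):
            remaining = text[len(option):]
            for rest in to_tokens_sketch(remaining, vocabulary, precedent=option):
                yield [option] + rest

vocabulary = {'ice', 'icecream', 'cream', ' is ', 'sweet'}
for tokenization in to_tokens_sketch('icecream is sweet', vocabulary):
    print(tokenization)
# Both ['ice', 'cream', ' is ', 'sweet'] and ['icecream', ' is ', 'sweet'] come out,
# which is the extra exploration of possibilities mentioned in the commit message.
```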
@@ -131,7 +136,7 @@ def integrate_token_to_text_matching(knowledge_base, text, tokens):
         if token in text:
             before, after = text.split(token, maxsplit=1)
             texts = (texts[:i] + [before]
-                     + [token_id]
+                     + [a('token')]
                      + [after] + texts[i + 1:])
             break
     else:
@@ -139,18 +144,16 @@ def integrate_token_to_text_matching(knowledge_base, text, tokens):
 
     # Remove leftovers from splits
     texts = list(filter(lambda x: x != '', texts))
+    session().log("Tokenized as {} over {}".format(texts, tokens))
 
-    for token_id, _token in enumerate(tokens):
-        # Find all elements between current token and next token
-        i = texts.index(token_id)
-        elements = [a('token')]
-
-        i += 1
-        while i < len(texts) and not isinstance(texts[i], int):
-            elements.append(texts[i])
-            i += 1
-
-        knowledge_base.add_tokenization(tuple(elements))
+    for i, element in enumerate(texts[:-1]):
+        learn_token_pair(element, texts[i + 1], knowledge_base)
+
+    return tokens
+
+
+def learn_token_pair(precedent, consequent, knowledge_base):
+    knowledge_base.add_token_pair(precedent, consequent)
 
 
 def pick_one_tokenization(options, knowledge_base):
     '''
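With the chaining model, `integrate_token_to_text_matching` now only needs to record which element follows which in the tokenized example text. A minimal sketch of that adjacency pass, using a plain dict in place of the knowledge base and an invented, already-split example:

```python
def learn_adjacent_pairs(texts, token_chains):
    # For every element except the last, record its immediate successor,
    # mirroring the `for i, element in enumerate(texts[:-1])` loop above.
    for i, element in enumerate(texts[:-1]):
        token_chains.setdefault(element, []).append(texts[i + 1])
    return token_chains

# Invented example: the text has already been split around the known tokens.
tokenized = ['icecream', ' is ', 'sweet']
print(learn_adjacent_pairs(tokenized, {}))
# {'icecream': [' is '], ' is ': ['sweet']}
```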
@@ -158,26 +161,34 @@ def pick_one_tokenization(options, knowledge_base):
 
     Just pick the one with more results.
     '''
+    options = list(options)
     with session().log("Picking among: {} options\n{}".format(len(options), '\n'.join(map(str, options)))):
         return pick_by_score(options,
                              [
-                                 # First by number of splits
-                                 lambda tokenization: len(tokenization),
-
-                                 # Among them, by number of splits without structuring elements
+                                 # By number of splits without structuring elements
                                  lambda tokenization: sum(map(
-                                     lambda split: -sum(map(
+                                     lambda split: sum(map(
                                          lambda se: se in split, knowledge_base.structural_elements
-                                     )), tokenization))
+                                     )), tokenization)),
+
+                                 # By number of unknown tokens
+                                 lambda tokenization: len(list(filter(lambda token:
+                                     (token not in knowledge_base.knowledge.keys()) and
+                                     (token not in knowledge_base.structural_elements),
+                                     tokenization))),
+
+                                 # By number of splits
+                                 lambda tokenization: -len(tokenization),
                              ])
 
 
 def pick_by_score(options, heuristics):
     for heuristic in heuristics:
         assert(len(options) > 0)
         options = list(map(lambda opt: (heuristic(opt), opt), options))
-        sorted_options = sorted(options, key=lambda x: x[0], reverse=True)
+        sorted_options = sorted(options, key=lambda x: x[0], reverse=False)
 
         heuristic_cutoff = sorted_options[0][0]
+        session().annotate(sorted_options)
         pass_heuristic = [opt for (score, opt) in sorted_options if score <= heuristic_cutoff]
         options = pass_heuristic
 
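`pick_by_score` works as a cascade of tie-breaking filters: each heuristic scores the surviving options, only the options tied for the lowest score survive (lowest because of `reverse=False` together with the `score <= heuristic_cutoff` test), and the next heuristic breaks any remaining ties. A standalone sketch with invented candidate tokenizations and simplified heuristics:

```python
def pick_by_score(options, heuristics):
    # Mirror of the selection loop above: keep only the options that achieve
    # the best (lowest) score under each heuristic, in order.
    for heuristic in heuristics:
        assert len(options) > 0
        scored = sorted(((heuristic(opt), opt) for opt in options), key=lambda x: x[0])
        cutoff = scored[0][0]
        options = [opt for (score, opt) in scored if score <= cutoff]
    return options[0]

# Invented candidate tokenizations of the same text.
candidates = [
    ['icecream is sweet'],
    ['icecream', ' is ', 'sweet'],
    ['ice', 'cream', ' is ', 'sweet'],
]
best = pick_by_score(candidates, [
    # Penalize splits that swallow the structural element ' is ' (simplified heuristic).
    lambda t: sum(1 for split in t if ' is ' in split and split != ' is '),
    # Then prefer the tokenization with more splits.
    lambda t: -len(t),
])
print(best)  # ['ice', 'cream', ' is ', 'sweet']
```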