Exploration of layers for tokenization and parsing.

kenkeiras 2018-04-23 22:48:10 +02:00
parent c18c9b8cb1
commit a444766c7c
10 changed files with 173 additions and 108 deletions
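
This change replaces the tokenizer/parser plumbing inside KnowledgeBase with a small layer stack: a TokenizationLayer and a ParsingLayer owned by layered_model.BaseModel and chained through the recursive generator make_yield_pipe, so every tokenization candidate is fed into the parsing layer. The sketch below is not part of the commit; it only illustrates the pipe pattern with two hypothetical stand-in layers (UppercaseLayer, SplitLayer) and a slightly simplified make_yield_pipe (debug prints dropped).

# Illustrative only: toy layers standing in for TokenizationLayer/ParsingLayer.
def make_yield_pipe(layers, knowledge_base, example):
    # No layers left: the input itself is the only output.
    if len(layers) < 1:
        yield example
        return

    # Every candidate produced by the earlier layers is fed to the last layer,
    # so the pipe fans out over all intermediate results.
    for intermediate in make_yield_pipe(layers[:-1], knowledge_base, example):
        for result in layers[-1].integrate(knowledge_base, intermediate):
            yield result


class UppercaseLayer:
    def integrate(self, knowledge_base, example):
        yield example.upper()


class SplitLayer:
    def integrate(self, knowledge_base, example):
        yield example.split()   # candidate 1: whitespace tokens
        yield [example]         # candidate 2: the text kept whole


if __name__ == '__main__':
    layers = [UppercaseLayer(), SplitLayer()]
    for candidate in make_yield_pipe(layers, knowledge_base=None, example="a small example"):
        print(candidate)
    # Prints:
    # ['A', 'SMALL', 'EXAMPLE']
    # ['A SMALL EXAMPLE']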

.gitignore vendored (+1)
View File

@@ -1,5 +1,6 @@
 *#*
 *~
+.vscode
 *.ba?k
 *.pyc
 __pycache__

View File

@@ -4,8 +4,7 @@ import logging
 from .session.org_mode import global_session as session
 from .atoms import Atom
-from . import parsing
-from . import tokenization
+from . import layered_model
 from . import knowledge_evaluation
 from .modifiable_property import is_modifiable_property
 import random
@@ -15,21 +14,6 @@ def diff_knowledge(before, after):
     return jsondiff.diff(before, after)

-def randomized_weighted_list(elements):
-    # Randomized
-    randomized = list(elements)
-    random.shuffle(randomized)
-
-    # And return only once
-    already_returned = set()
-    for e in randomized:
-        if e in already_returned:
-            continue
-
-        yield e
-        already_returned.add(e)
-
 class KnowledgeBase(object):
     def __init__(self, knowledge={}, examples=[], trained=[]):
@@ -37,41 +21,9 @@ class KnowledgeBase(object):
         self.originals = []
         self.examples = copy.copy(examples)
         self.trained = copy.copy(trained)
-        self.structural_elements = set()
-        self.token_chains = {}
-        self.tokens = set()
+        self.layers = layered_model.BaseModel(self)

-    def add_token_pair(self, precedent, consequent):
-        self.add_token(precedent)
-        self.add_token(consequent)
-
-        if precedent not in self.token_chains:
-            self.token_chains[precedent] = []
-        self.token_chains[precedent].append(consequent)
-
-    def add_token(self, token):
-        self.tokens.add(token)
-        if (not isinstance(token, Atom)) and (token not in self.structural_elements):
-            session().annotate('Found new structural element “{}'.format(token))
-            self.structural_elements.add(token)
-
-    def expected_token_after_precedent(self, precedent=None):
-        if precedent not in self.token_chains:  # If there's no known precedent, just return all tokens
-            return randomized_weighted_list(self.tokens)
-        return randomized_weighted_list(self.token_chains[precedent])
-
-    def train_tokenizer(self, example):
-        with session().log('Training tokenizer'):
-            session().annotate("Example: {}".format(example))
-            tokens = tokenization.integrate_tokenization(self, example)
-
-            # Integrate knowledge of concept
-            for token in tokens:
-                if not token in self.knowledge:
-                    self.knowledge[token] = {}
-
-    ## Parsing
     def train(self, examples):
         knowledge_before = copy.deepcopy(self.knowledge)
         with session().log('Train'):
@@ -86,11 +38,12 @@ class KnowledgeBase(object):
                     self.act_upon(result)

                 with session().log("language integration"):
-                    tokens, decomposition, inferred_tree = parsing.integrate_language(self, example)
-                    session().annotate("Tokens: {}".format(tokens))
-                    session().annotate("Inferred tree: {}".format(inferred_tree))
+                    for tokens, decomposition, inferred_tree in self.layers.integrate(self, example):
+                        session().annotate("Tokens: {}".format(tokens))
+                        session().annotate("Inferred tree: {}".format(inferred_tree))

                 with session().log("full information integration"):
+                    tokens = self.layers.tokenization.tokenize(example['text'], return_one=True)
                     result = knowledge_evaluation.integrate_information(self.knowledge, {
                         "elements": tokens,
                         "decomposition": decomposition,
@@ -105,7 +58,7 @@ class KnowledgeBase(object):
             # Reduce values
             with session().log("reprocessing"):
-                self.trained = parsing.reprocess_language_knowledge(self, self.examples)
+                self.layers.reprocess(self.examples)

         knowledge_after = copy.deepcopy(self.knowledge)
         knowledge_diff_getter = lambda: diff_knowledge(knowledge_before,
@@ -113,19 +66,6 @@ class KnowledgeBase(object):
         return knowledge_diff_getter

-    def tokenize(self, row, return_one=True):
-        row = row.lower()
-        with session().log("Tokenize: {}".format(row)):
-            options = list(tokenization.to_tokens(self, row))
-            session().log("Results:\n{}".format('\n'.join(map(str, options))))
-
-            if return_one:
-                chosen = tokenization.pick_one_tokenization(options, self)
-                session().log("Chosen: “{}".format(chosen))
-                self.train_tokenizer({'text': row, 'tokens': chosen})
-                return chosen
-            return options
-
     def process(self, row):
         knowledge_before = copy.deepcopy(self.knowledge)
         with session().log("Process: {}".format(row)):

View File

@@ -0,0 +1,47 @@
+from .layers import tokenization_layer
+from .layers import parsing_layer
+
+
+def make_yield_pipe(layers, knowledge_base, example):
+    if len(layers) < 1:
+        yield example
+        return
+
+    input_generator = make_yield_pipe(layers[:-1], knowledge_base, example)
+    for input in input_generator:
+        print("-->", input)
+        for d in list(layers[-1].integrate(knowledge_base, input)):
+            yield d
+
+
+class BaseModel:
+    def __init__(self, knowledge_base):
+        self.tokenization = tokenization_layer.TokenizationLayer(knowledge_base)
+        self.parsing = parsing_layer.ParsingLayer()
+
+        self.layers = [
+            self.tokenization,
+            self.parsing,
+        ]
+
+    def reprocess(self, examples):
+        for example in examples:
+            self._reprocess_single(example)
+
+    def _reprocess_single(self, example):
+        return
+        pattern_examples = []
+        for i, sample in enumerate(examples):
+            other = examples[:i] + examples[i + 1:]
+            match = get_matching(sample, other)
+            if len(match) > 0:
+                sample = (match, sample[1],)
+                pattern_examples.append(sample)
+        return pattern_examples
+
+    def integrate(self, knowledge_base, example):
+        yield from make_yield_pipe(self.layers, knowledge_base, example)
+
+    def tokenize(self, row, return_one=True):
+        return self.tokenization.to_tokens(row)

View File

@@ -1,18 +1,14 @@
 #!/usr/bin/env python

-from . import knowledge_evaluation
-from . import tokenization
-from . import depth_meter
-from .session.org_mode import global_session as session
+from ..session.org_mode import global_session as session
 import re
 import copy

 from functools import reduce
 from typing import List, Dict

-from .modifiable_property import ModifiableProperty
-from . import parameters
-from .atoms import Atom, a, is_atom
+from ..modifiable_property import ModifiableProperty
+from .. import parameters
+from ..atoms import Atom, a, is_atom

 def make_template(knowledge_base, tokens, parsed):
     matcher = list(tokens)
@@ -83,8 +79,8 @@ def integrate_language(knowledge_base, example):
     text = example["text"].lower()
     parsed = example["parsed"]
+    tokens = example['tokens']
     resolved_parsed = copy.deepcopy(parsed)
-    tokens = list(tokenization.pick_one_tokenization(tokenization.to_tokens(knowledge_base, text), knowledge_base))

     while True:
         session().annotate("P: {}".format(resolved_parsed))
@@ -95,14 +91,14 @@ def integrate_language(knowledge_base, example):
         for position, atom in lower_levels:
             with session().log("Atom {}".format(atom)):
+                result = None
                 similars = get_similar_tree(knowledge_base, atom, tokens)
                 for similar in similars:
                     result = build_remix_matrix(knowledge_base, tokens, atom, similar)
                     if result is not None:
                         break

-                if result is None:
-                    raise Exception("No match found")
+                return

                 remix, (start_bounds, end_bounds) = result
                 after_remix = apply_remix(tokens[len(start_bounds):-len(end_bounds)], remix)
@@ -147,7 +143,7 @@ def integrate_language(knowledge_base, example):
         session().annotate("M: {}".format(matcher))
         session().annotate("R: {}".format(result))
         session().annotate("---")
-    return tokens, matcher, result
+    yield tokens, matcher, result


 def apply_remix(tokens, remix):
@@ -319,7 +315,7 @@ def get_similar_tree(knowledge_base, atom, tokens):
     sorted_possibilities = sorted(sorted_possibilities, key=lambda p: p[3] * 100 + p[4], reverse=True)
     if len(sorted_possibilities) < 1:
-        return None
+        return []

     for i, possibility in enumerate(sorted_possibilities):
         similar_matcher, similar_result, similar_result_resolved, _atom_score, _token_score = possibility
@@ -369,20 +365,6 @@ def get_matching(sample, other):
     return matching

-def reprocess_language_knowledge(knowledge_base, examples):
-    examples = knowledge_base.examples + examples
-
-    pattern_examples = []
-    for i, sample in enumerate(examples):
-        other = examples[:i] + examples[i + 1:]
-        match = get_matching(sample, other)
-        if len(match) > 0:
-            sample = (match, sample[1],)
-            pattern_examples.append(sample)
-
-    return pattern_examples
-
 def reverse_remix(tree_section, remix):
     result_section = []
     offset = 0

View File

@@ -0,0 +1,11 @@
+from . import parsing
+
+
+class ParsingLayer:
+    def __init__(self):
+        pass
+
+    def integrate(self, knowledge_base, example):
+        yield from parsing.integrate_language(knowledge_base, example)
+
+    def train(self, knowledge_base, example):
+        assert False

View File

@@ -1,5 +1,5 @@
-from .session.org_mode import global_session as session
-from .atoms import Atom, a, is_atom
+from ..session.org_mode import global_session as session
+from ..atoms import Atom, a, is_atom

 def lookahead_for_tokens_or_strucutral_elements(knowledge_base, remaining):
     for se in knowledge_base.structural_elements:

View File

@@ -0,0 +1,84 @@
+from ..session.org_mode import global_session as session
+from ..atoms import Atom
+from . import tokenization
+
+import random
+import copy
+
+
+def randomized_weighted_list(elements):
+    # Randomized
+    randomized = list(elements)
+    random.shuffle(randomized)
+
+    # And return only once
+    already_returned = set()
+    for e in randomized:
+        if e in already_returned:
+            continue
+
+        yield e
+        already_returned.add(e)
+
+
+class TokenizationLayer:
+    def __init__(self, knowledge_base):
+        self.structural_elements = set()
+        self.token_chains = {}
+        self.tokens = set()
+        self.knowledge_base = knowledge_base
+        self.knowledge = knowledge_base.knowledge
+
+    def integrate(self, knowledge_base, data):
+        assert knowledge_base is self.knowledge_base
+
+        print(data)
+        assert 'text' in data
+        with session().log("Tokenize: {}".format(data['text'])):
+            for tokens in tokenization.to_tokens(self, data['text']):
+                data_with_row = copy.copy(data)
+                data_with_row['tokens'] = tokens
+                print(data_with_row)
+                yield data_with_row
+
+    def tokenize(self, row, return_one=True):
+        row = row.lower()
+        with session().log("Tokenize: {}".format(row)):
+            options = list(tokenization.to_tokens(self, row))
+            session().log("Results:\n{}".format('\n'.join(map(str, options))))
+
+            if return_one:
+                chosen = tokenization.pick_one_tokenization(options, self)
+                session().log("Chosen: “{}".format(chosen))
+                self.train({'text': row, 'tokens': chosen})
+                return chosen
+            return options
+
+    ## Tokenization
+    def add_token_pair(self, precedent, consequent):
+        self.add_token(precedent)
+        self.add_token(consequent)
+
+        if precedent not in self.token_chains:
+            self.token_chains[precedent] = []
+        self.token_chains[precedent].append(consequent)
+
+    def add_token(self, token):
+        self.tokens.add(token)
+        if (not isinstance(token, Atom)) and (token not in self.structural_elements):
+            session().annotate('Found new structural element “{}'.format(token))
+            self.structural_elements.add(token)
+
+    def expected_token_after_precedent(self, precedent=None):
+        if precedent not in self.token_chains:  # If there's no known precedent, just return all tokens
+            return randomized_weighted_list(self.tokens)
+        return randomized_weighted_list(self.token_chains[precedent])
+
+    def train(self, example):
+        with session().log('Training tokenizer'):
+            session().annotate("Example: {}".format(example))
+            tokens = tokenization.integrate_tokenization(self, example)
+
+            # Integrate knowledge of concept
+            for token in tokens:
+                if not token in self.knowledge:
+                    self.knowledge[token] = {}

View File

@@ -99,14 +99,14 @@ examples = [
             lambda knowledge: _assert('electricity' in knowledge.knowledge['computers']['performs-over']['use'])
         ),],
     }),
-    ('full_example',
-     {
-         "text": "The dominant language in france is french?",
-         "affirmation": "The dominant language in france is french",
-         "parsed": ("question",
-                    ("property-has-value", "france", "dominant-language", "french")),
-         "answer": True,
-     }),
+    # ('full_example',
+    #  {
+    #      "text": "The dominant language in france is french?",
+    #      "affirmation": "The dominant language in france is french",
+    #      "parsed": ("question",
+    #                 ("property-has-value", "france", "dominant-language", "french")),
+    #      "answer": True,
+    #  }),
     # {
     #     "text": "was abraham lincoln once president of the united states?",
     #     "affirmation": "was abraham lincoln once president of the united states?",

View File

@@ -63,11 +63,11 @@ def main():
             show_progbar(i, total, example['text'])
             if case_type == 'example':
                 with session().log(example['text']):
-                    knowledge.train_tokenizer(example)
+                    knowledge.layers.tokenization.train(example)

             elif case_type == 'test':
                 with session().log(example['text']):
-                    tokens = list(knowledge.tokenize(example['text']))
+                    tokens = list(knowledge.layers.tokenization.tokenize(example['text']))
                     session().log('Expected “{}”, found “{}'
                                   .format(example['tokens'], tokens))

View File

@@ -26,4 +26,4 @@ BASIC_TOKENIZATION_EXAMPLES = (
 def train_basic_tokenization(knowledge_base):
     with session().log('Training basic tokenization'):
         for example in BASIC_TOKENIZATION_EXAMPLES:
-            knowledge_base.train_tokenizer(example)
+            knowledge_base.layers.tokenization.train(example)