Merge branch 'layered-model' into naive-nlu

kenkeiras 2018-04-25 20:17:53 +02:00
commit 178dadc57f
10 changed files with 192 additions and 113 deletions

.gitignore (vendored, 1 addition)
View File

@@ -1,5 +1,6 @@
*#*
*~
.vscode
*.ba?k
*.pyc
__pycache__

View File

@@ -4,8 +4,7 @@ import logging
from .session.org_mode import global_session as session
from .atoms import Atom
from . import parsing
from . import tokenization
from . import layered_model
from . import knowledge_evaluation
from .modifiable_property import is_modifiable_property
import random
@@ -15,21 +14,6 @@ def diff_knowledge(before, after):
return jsondiff.diff(before, after)
def randomized_weighted_list(elements):
# Randomized
randomized = list(elements)
random.shuffle(randomized)
# And return only once
already_returned = set()
for e in randomized:
if e in already_returned:
continue
yield e
already_returned.add(e)
class KnowledgeBase(object):
def __init__(self, knowledge={}, examples=[], trained=[]):
@@ -37,41 +21,9 @@ class KnowledgeBase(object):
self.originals = []
self.examples = copy.copy(examples)
self.trained = copy.copy(trained)
self.structural_elements = set()
self.token_chains = {}
self.tokens = set()
def add_token_pair(self, precedent, consequent):
self.add_token(precedent)
self.add_token(consequent)
if precedent not in self.token_chains:
self.token_chains[precedent] = []
self.token_chains[precedent].append(consequent)
def add_token(self, token):
self.tokens.add(token)
if (not isinstance(token, Atom)) and (token not in self.structural_elements):
session().annotate('Found new structural element “{}”'.format(token))
self.structural_elements.add(token)
def expected_token_after_precedent(self, precedent=None):
if precedent not in self.token_chains: # If there's no known precedent, just return all tokens
return randomized_weighted_list(self.tokens)
return randomized_weighted_list(self.token_chains[precedent])
def train_tokenizer(self, example):
with session().log('Training tokenizer'):
session().annotate("Example: {}".format(example))
tokens = tokenization.integrate_tokenization(self, example)
# Integrate knowledge of concept
for token in tokens:
if not token in self.knowledge:
self.knowledge[token] = {}
self.layers = layered_model.BaseModel(self)
## Parsing
def train(self, examples):
knowledge_before = copy.deepcopy(self.knowledge)
with session().log('Train'):
@@ -86,11 +38,12 @@ class KnowledgeBase(object):
self.act_upon(result)
with session().log("language integration"):
tokens, decomposition, inferred_tree = parsing.integrate_language(self, example)
session().annotate("Tokens: {}".format(tokens))
session().annotate("Inferred tree: {}".format(inferred_tree))
for tokens, decomposition, inferred_tree in self.layers.integrate(self, example):
session().annotate("Tokens: {}".format(tokens))
session().annotate("Inferred tree: {}".format(inferred_tree))
with session().log("full information integration"):
tokens = self.layers.tokenization.tokenize(example['text'], return_one=True)
result = knowledge_evaluation.integrate_information(self.knowledge, {
"elements": tokens,
"decomposition": decomposition,
@@ -105,7 +58,8 @@ class KnowledgeBase(object):
# Reduce values
with session().log("reprocessing"):
self.trained = parsing.reprocess_language_knowledge(self, self.examples)
res = self.layers.reprocess(self.examples)
self.trained = res
knowledge_after = copy.deepcopy(self.knowledge)
knowledge_diff_getter = lambda: diff_knowledge(knowledge_before,
@@ -113,29 +67,14 @@ class KnowledgeBase(object):
return knowledge_diff_getter
def tokenize(self, row, return_one=True):
row = row.lower()
with session().log("Tokenize: {}".format(row)):
options = list(tokenization.to_tokens(self, row))
session().log("Results:\n{}".format('\n'.join(map(str, options))))
if return_one:
chosen = tokenization.pick_one_tokenization(options, self)
session().log("Chosen: “{}".format(chosen))
self.train_tokenizer({'text': row, 'tokens': chosen})
return chosen
return options
def process(self, row):
knowledge_before = copy.deepcopy(self.knowledge)
with session().log("Process: {}".format(row)):
tokens = self.tokenize(row)
fit = parsing.get_fit(self, tokens)
if fit is None:
fit = list(self.layers.process(self, row))
if len(fit) == 0:
return None
tokens, inferred_tree = fit
tokens, inferred_tree = fit[0]
result = knowledge_evaluation.integrate_information(self.knowledge,
{
"elements": tokens,

View File

@@ -0,0 +1,49 @@
from .layers import tokenization_layer
from .layers import parsing_layer
from .layers import parsing
from .session.org_mode import global_session as session
def make_yield_pipe(layers, knowledge_base, example, func):
if len(layers) < 1:
yield example
return
input_generator = make_yield_pipe(layers[:-1], knowledge_base, example, func)
for input in input_generator:
session().annotate("[{}] --> {}".format(len(layers), input))
for d in list(func(layers[-1], input)):
yield d
class BaseModel:
def __init__(self, knowledge_base):
self.tokenization = tokenization_layer.TokenizationLayer(knowledge_base)
self.parsing = parsing_layer.ParsingLayer()
self.layers = [
self.tokenization,
self.parsing,
]
def reprocess(self, examples):
pattern_examples = []
for i, sample in enumerate(examples):
other = examples[:i] + examples[i + 1:]
match = parsing.get_matching(sample, other)
if len(match) > 0:
sample = (match, sample[1],)
pattern_examples.append(sample)
return pattern_examples
def integrate(self, knowledge_base, example):
yield from make_yield_pipe(self.layers, knowledge_base,
example, lambda l, i: l.integrate(knowledge_base, i))
def process(self, knowledge_base, example):
yield from make_yield_pipe(self.layers, knowledge_base,
example, lambda l, i: l.process(knowledge_base, i))
def tokenize(self, row, return_one=True):
return self.tokenization.to_tokens(row)
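
make_yield_pipe above composes the layers as nested generators: every candidate yielded by the earlier layers is fed into the last layer, which can itself fan out into several candidates. A self-contained toy version of the same recursion (the two layers are illustrations, not part of this commit, and the knowledge_base argument is dropped for brevity):

def make_yield_pipe(layers, example, func):
    # Same recursion as above, without the knowledge_base/session plumbing.
    if len(layers) < 1:
        yield example
        return
    for intermediate in make_yield_pipe(layers[:-1], example, func):
        for output in func(layers[-1], intermediate):
            yield output

class SplitLayer:
    def process(self, text):
        # Toy layer: yields a single tokenization candidate.
        yield text.lower().split()

class PairLayer:
    def process(self, tokens):
        # Toy layer: fans out into one candidate per adjacent token pair.
        for a, b in zip(tokens, tokens[1:]):
            yield (a, b)

pipe = make_yield_pipe([SplitLayer(), PairLayer()], 'Icecream is cold',
                       lambda layer, data: layer.process(data))
print(list(pipe))   # [('icecream', 'is'), ('is', 'cold')]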

View File

@@ -1,18 +1,15 @@
#!/usr/bin/env python
from . import knowledge_evaluation
from . import tokenization
from . import depth_meter
from .session.org_mode import global_session as session
from ..session.org_mode import global_session as session
import re
import copy
from functools import reduce
from typing import List, Dict
from .modifiable_property import ModifiableProperty
from . import parameters
from .atoms import Atom, a, is_atom
from ..modifiable_property import ModifiableProperty
from .. import parameters
from ..atoms import Atom, a, is_atom
from .. import knowledge_evaluation
def make_template(knowledge_base, tokens, parsed):
matcher = list(tokens)
@@ -83,8 +80,8 @@ def integrate_language(knowledge_base, example):
text = example["text"].lower()
parsed = example["parsed"]
tokens = example['tokens']
resolved_parsed = copy.deepcopy(parsed)
tokens = list(tokenization.pick_one_tokenization(tokenization.to_tokens(knowledge_base, text), knowledge_base))
while True:
session().annotate("P: {}".format(resolved_parsed))
@@ -95,14 +92,15 @@ def integrate_language(knowledge_base, example):
for position, atom in lower_levels:
with session().log("Atom {}".format(atom)):
result = None
similars = get_similar_tree(knowledge_base, atom, tokens)
for similar in similars:
result = build_remix_matrix(knowledge_base, tokens, atom, similar)
if result is not None:
break
else:
raise Exception('Similar not found')
if result is None:
raise Exception("No match found")
remix, (start_bounds, end_bounds) = result
after_remix = apply_remix(tokens[len(start_bounds):-len(end_bounds)], remix)
@@ -147,7 +145,7 @@ def integrate_language(knowledge_base, example):
session().annotate("M: {}".format(matcher))
session().annotate("R: {}".format(result))
session().annotate("---")
return tokens, matcher, result
yield tokens, matcher, result
def apply_remix(tokens, remix):
@@ -319,7 +317,7 @@ def get_similar_tree(knowledge_base, atom, tokens):
sorted_possibilities = sorted(sorted_possibilities, key=lambda p: p[3] * 100 + p[4], reverse=True)
if len(sorted_possibilities) < 1:
return None
return []
for i, possibility in enumerate(sorted_possibilities):
similar_matcher, similar_result, similar_result_resolved, _atom_score, _token_score = possibility
@@ -369,20 +367,6 @@ def get_matching(sample, other):
return matching
def reprocess_language_knowledge(knowledge_base, examples):
examples = knowledge_base.examples + examples
pattern_examples = []
for i, sample in enumerate(examples):
other = examples[:i] + examples[i + 1:]
match = get_matching(sample, other)
if len(match) > 0:
sample = (match, sample[1],)
pattern_examples.append(sample)
return pattern_examples
def reverse_remix(tree_section, remix):
result_section = []
offset = 0
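
Two behavioural changes stand out in the hunks above: integrate_language now yields its (tokens, matcher, result) triples instead of returning a single one, and get_similar_tree returns an empty list rather than None, so the caller can always iterate and only has to check whether the loop set a result. A small sketch of that second pattern (get_similar is a stand-in name, not part of this commit):

def get_similar(candidates):
    # Stand-in for get_similar_tree: an empty list means "nothing similar".
    return [c for c in candidates if c is not None]

result = None
for similar in get_similar([]):   # empty list: the loop body never runs
    result = similar
    break
if result is None:
    print('No match found')       # mirrors the new error path above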

View File

@@ -0,0 +1,16 @@
from . import parsing
class ParsingLayer:
def __init__(self):
pass
def integrate(self, knowledge_base, example):
yield from parsing.integrate_language(knowledge_base, example)
def train(self, knowledge_base, example):
assert False
def process(self, knowledge_base, input):
fit = parsing.get_fit(knowledge_base, input)
if fit is not None:
yield fit
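
ParsingLayer and TokenizationLayer (below) share an implicit contract: integrate() and process() are generators that yield zero or more candidates. A hypothetical extra layer written against that assumed contract could look like this (LowercaseLayer is an illustration, not part of this commit):

class LowercaseLayer:
    # Hypothetical layer following the assumed integrate()/process() contract.
    def integrate(self, knowledge_base, example):
        yield dict(example, text=example['text'].lower())   # enriched example(s)

    def process(self, knowledge_base, data):
        yield data.lower()                                   # processed candidate(s)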

View File

@@ -1,5 +1,5 @@
from .session.org_mode import global_session as session
from .atoms import Atom, a, is_atom
from ..session.org_mode import global_session as session
from ..atoms import Atom, a, is_atom
def lookahead_for_tokens_or_strucutral_elements(knowledge_base, remaining):
for se in knowledge_base.structural_elements:

View File

@@ -0,0 +1,90 @@
from ..session.org_mode import global_session as session
from ..atoms import Atom
from . import tokenization
import random
import copy
def randomized_weighted_list(elements):
# Randomized
randomized = list(elements)
random.shuffle(randomized)
# And return only once
already_returned = set()
for e in randomized:
if e in already_returned:
continue
yield e
already_returned.add(e)
class TokenizationLayer:
def __init__(self, knowledge_base):
self.structural_elements = set()
self.token_chains = {}
self.tokens = set()
self.knowledge_base = knowledge_base
self.knowledge = knowledge_base.knowledge
def integrate(self, knowledge_base, data):
assert knowledge_base is self.knowledge_base
assert 'text' in data
tokens = self.tokenize(data['text'])
data_with_row = copy.copy(data)
data_with_row['tokens'] = tokens
yield data_with_row
# with session().log("Tokenize: {}".format(data['text'])):
# for tokens in tokenization.to_tokens(self, data['text']):
# data_with_row = copy.copy(data)
# data_with_row['tokens'] = tokens
# yield data_with_row
def process(self, knowledge_base, row):
yield self.tokenize(row)
def tokenize(self, row, return_one=True):
row = row.lower()
with session().log("Tokenize: {}".format(row)):
options = list(tokenization.to_tokens(self, row))
session().log("Results:\n{}".format('\n'.join(map(str, options))))
if return_one:
chosen = tokenization.pick_one_tokenization(options, self)
session().log("Chosen: “{}".format(chosen))
self.train({'text': row, 'tokens': chosen})
return chosen
return options
## Tokenization
def add_token_pair(self, precedent, consequent):
self.add_token(precedent)
self.add_token(consequent)
if precedent not in self.token_chains:
self.token_chains[precedent] = []
self.token_chains[precedent].append(consequent)
def add_token(self, token):
self.tokens.add(token)
if (not isinstance(token, Atom)) and (token not in self.structural_elements):
session().annotate('Found new structural element “{}”'.format(token))
self.structural_elements.add(token)
def expected_token_after_precedent(self, precedent=None):
if precedent not in self.token_chains: # If there's no known precedent, just return all tokens
return randomized_weighted_list(self.tokens)
return randomized_weighted_list(self.token_chains[precedent])
def train(self, example):
with session().log('Training tokenizer'):
session().annotate("Example: {}".format(example))
tokens = tokenization.integrate_tokenization(self, example)
# Integrate knowledge of concept
for token in tokens:
if not token in self.knowledge:
self.knowledge[token] = {}
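
The token-chain bookkeeping above (add_token_pair feeding randomized_weighted_list) means that tokens seen more often after a precedent tend to come out earlier, while each distinct token is still yielded only once. A self-contained sketch of that behaviour (the example tokens are made up):

import random

def randomized_weighted_list(elements):
    # Shuffle, then deduplicate: repeated elements are more likely to land
    # near the front, but each distinct element is yielded only once.
    randomized = list(elements)
    random.shuffle(randomized)
    already_returned = set()
    for e in randomized:
        if e in already_returned:
            continue
        yield e
        already_returned.add(e)

# Made-up training pairs, standing in for add_token_pair() calls.
token_chains = {}
for precedent, consequent in [('is', 'cold'), ('is', 'hot'), ('is', 'cold')]:
    token_chains.setdefault(precedent, []).append(consequent)

# 'cold' follows 'is' twice, so it comes out first about two times in three.
print(list(randomized_weighted_list(token_chains['is'])))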

View File

@@ -99,14 +99,14 @@ examples = [
lambda knowledge: _assert('electricity' in knowledge.knowledge['computers']['performs-over']['use'])
),],
}),
('full_example',
{
"text": "The dominant language in france is french?",
"affirmation": "The dominant language in france is french",
"parsed": ("question",
("property-has-value", "france", "dominant-language", "french")),
"answer": True,
}),
# ('full_example',
# {
# "text": "The dominant language in france is french?",
# "affirmation": "The dominant language in france is french",
# "parsed": ("question",
# ("property-has-value", "france", "dominant-language", "french")),
# "answer": True,
# }),
# {
# "text": "was abraham lincoln once president of the united states?",
# "affirmation": "was abraham lincoln once president of the united states?",

View File

@@ -63,11 +63,11 @@ def main():
show_progbar(i, total, example['text'])
if case_type == 'example':
with session().log(example['text']):
knowledge.train_tokenizer(example)
knowledge.layers.tokenization.train(example)
elif case_type == 'test':
with session().log(example['text']):
tokens = list(knowledge.tokenize(example['text']))
tokens = list(knowledge.layers.tokenization.tokenize(example['text']))
session().log('Expected “{}”, found “{}”'
.format(example['tokens'], tokens))

View File

@@ -26,4 +26,4 @@ BASIC_TOKENIZATION_EXAMPLES = (
def train_basic_tokenization(knowledge_base):
with session().log('Training basic tokenization'):
for example in BASIC_TOKENIZATION_EXAMPLES:
knowledge_base.train_tokenizer(example)
knowledge_base.layers.tokenization.train(example)