2017-05-11 17:54:02 +00:00
|
|
|
import copy
|
2017-05-17 21:54:14 +00:00
|
|
|
import logging
|
2017-05-23 17:04:10 +00:00
|
|
|
|
2017-09-29 23:32:04 +00:00
|
|
|
from .session.org_mode import global_session as session
|
|
|
|
|
2018-04-15 15:07:29 +00:00
|
|
|
from .atoms import Atom
|
2017-05-23 17:04:10 +00:00
|
|
|
from . import parsing
|
|
|
|
from . import knowledge_evaluation
|
|
|
|
from .modifiable_property import is_modifiable_property
|
2018-04-15 18:06:21 +00:00
|
|
|
import random
|
2017-05-11 17:54:02 +00:00
|
|
|
|
|
|
|
def diff_knowledge(before, after):
    """Return a structural diff between two knowledge snapshots.

    The import is deliberately deferred to call time so that the
    third-party ``jsondiff`` package is only required when a diff
    is actually requested.
    """
    from jsondiff import diff
    return diff(before, after)
|
|
|
|
|
|
|
|
|
2018-04-15 18:06:21 +00:00
|
|
|
def randomized_weighted_list(elements):
    """Yield each distinct element of *elements* exactly once, in random order.

    The input is materialized and shuffled, then duplicates are dropped,
    keeping only the first occurrence of each element in the shuffled order.
    """
    shuffled = list(elements)
    random.shuffle(shuffled)

    # dict preserves insertion order and collapses duplicate keys, so this
    # yields every element once, at its first post-shuffle position.
    yield from dict.fromkeys(shuffled)
|
|
|
|
|
|
|
|
|
|
|
|
|
2017-05-11 17:54:02 +00:00
|
|
|
class KnowledgeBase(object):
    """Container for linguistic knowledge, training examples and tokenizer state.

    Holds a knowledge dictionary, observed token-succession chains used by the
    tokenizer, and the structural (non-Atom) elements discovered during training.
    """

    def __init__(self, knowledge=None, examples=None, trained=None):
        """Create a knowledge base.

        Args:
            knowledge: initial knowledge mapping (shallow-copied). Defaults to {}.
            examples: initial (decomposition, tree) examples (shallow-copied). Defaults to [].
            trained: initial trained structures (shallow-copied). Defaults to [].

        NOTE: the defaults used to be mutable literals ({} / []); None
        sentinels avoid the shared-mutable-default pitfall while keeping
        the same effective behavior for callers.
        """
        # Shallow-copy caller-provided containers so later mutation here
        # does not leak back into the caller's objects.
        self.knowledge = copy.copy(knowledge) if knowledge is not None else {}
        self.originals = []  # raw input texts seen during training
        self.examples = copy.copy(examples) if examples is not None else []
        self.trained = copy.copy(trained) if trained is not None else []
        self.structural_elements = set()  # non-Atom tokens (syntactic glue)
        self.token_chains = {}  # precedent token -> list of observed followers
        self.tokens = set()  # every token ever registered

    def add_token_pair(self, precedent, consequent):
        """Record that `consequent` was observed immediately after `precedent`."""
        self.add_token(precedent)
        self.add_token(consequent)

        # setdefault replaces the manual "if key not in dict: dict[key] = []" dance.
        self.token_chains.setdefault(precedent, []).append(consequent)

    def add_token(self, token):
        """Register a token; non-Atom tokens are also tracked as structural elements."""
        self.tokens.add(token)
        if (not isinstance(token, Atom)) and (token not in self.structural_elements):
            session().annotate('Found new structural element “{}”'.format(token))
            self.structural_elements.add(token)

    def expected_token_after_precedent(self, precedent=None):
        """Return candidate tokens likely to follow `precedent`, in random order."""
        if precedent not in self.token_chains:  # If there's no known precedent, just return all tokens
            return randomized_weighted_list(self.tokens)

        return randomized_weighted_list(self.token_chains[precedent])

    def train_tokenizer(self, example):
        """Integrate a single tokenization example (dict with 'text'/'tokens')."""
        with session().log('Training tokenizer'):
            session().annotate("Example: {}".format(example))
            tokens = parsing.integrate_tokenization(self, example)

            # Integrate knowledge of concept
            for token in tokens:
                # was "if not token in ..." — idiomatic "not in" form
                if token not in self.knowledge:
                    self.knowledge[token] = {}

    def train(self, examples):
        """Train on `examples` and return a lazy knowledge-diff getter.

        Each example is parsed and its information integrated into the
        knowledge base; afterwards the language knowledge is reprocessed.

        Returns:
            A zero-argument callable that, when invoked, diffs the knowledge
            as it was before training against the knowledge right after.
        """
        knowledge_before = copy.deepcopy(self.knowledge)
        with session().log('Train'):
            # Parse everything
            for example in examples:
                # If there's parsed data, leverage it ASAP
                if 'parsed' in example and isinstance(example['parsed'], tuple):
                    with session().log('parsed information integration'):
                        result = knowledge_evaluation.integrate_information(self.knowledge, {
                            "parsed": example['parsed'],
                        })
                        self.act_upon(result)

                with session().log("language integration"):
                    tokens, decomposition, inferred_tree = parsing.integrate_language(self, example)
                    session().annotate("Tokens: {}".format(tokens))
                    session().annotate("Inferred tree: {}".format(inferred_tree))

                with session().log("full information integration"):
                    result = knowledge_evaluation.integrate_information(self.knowledge, {
                        "elements": tokens,
                        "decomposition": decomposition,
                        "parsed": inferred_tree,
                    })

                    session().annotate("Result: {}".format(self.get_value(result)))
                    self.act_upon(result)
                    session().annotate("Set: {}".format(self.get_value(result)))
                    self.examples.append((decomposition, inferred_tree))
                    self.originals.append(example['text'])

            # Reduce values
            with session().log("reprocessing"):
                self.trained = parsing.reprocess_language_knowledge(self, self.examples)

        knowledge_after = copy.deepcopy(self.knowledge)

        # Named closure instead of a lambda assignment (PEP 8 E731);
        # the diff is computed lazily, only when the getter is called.
        def knowledge_diff_getter():
            return diff_knowledge(knowledge_before, knowledge_after)

        return knowledge_diff_getter

    def tokenize(self, row, return_one=True):
        """Tokenize `row` (lower-cased first).

        Args:
            row: input text.
            return_one: when True, pick a single tokenization, feed it back
                into the tokenizer trainer, and return it; otherwise return
                the full list of tokenization options.
        """
        row = row.lower()
        with session().log("Tokenize: {}".format(row)):
            options = list(parsing.to_tokens(self, row))
            session().log("Results:\n{}".format('\n'.join(map(str, options))))

            if return_one:
                chosen = parsing.pick_one_tokenization(options, self)
                session().log("Chosen: “{}”".format(chosen))
                # Reinforce the tokenizer with its own chosen tokenization.
                self.train_tokenizer({'text': row, 'tokens': chosen})
                return chosen
            return options

    def process(self, row):
        """Process an input row against the knowledge base.

        Returns:
            None when no parse fit is found; otherwise a tuple of
            (result, inferred_tree, knowledge_diff_getter), where the getter
            lazily diffs knowledge before vs. after processing.
        """
        knowledge_before = copy.deepcopy(self.knowledge)
        with session().log("Process: {}".format(row)):
            tokens = self.tokenize(row)

            fit = parsing.get_fit(self, tokens)
            if fit is None:
                return None

            tokens, inferred_tree = fit
            result = knowledge_evaluation.integrate_information(self.knowledge, {
                "elements": tokens,
                "parsed": inferred_tree,
            })
            self.act_upon(result)
            session().annotate("Result: {}".format(result))

        knowledge_after = copy.deepcopy(self.knowledge)

        # Named closure instead of a lambda assignment (PEP 8 E731).
        def knowledge_diff_getter():
            return diff_knowledge(knowledge_before, knowledge_after)

        return result, inferred_tree, knowledge_diff_getter

    def get_value(self, result):
        """Resolve `result` to a plain value, reading modifiable properties."""
        if is_modifiable_property(result):
            return result.getter()
        else:
            return result

    def act_upon(self, result):
        """Apply `result`: invoke its setter when modifiable, else log a warning."""
        if is_modifiable_property(result):
            result.setter()
        else:
            logging.warning("Cannot act upon: {}".format(result))
|