lang-model/naive-nlu/tree_nlu/knowledge_base.py

126 lines
5.1 KiB
Python
Raw Normal View History

import copy
import logging
2017-05-23 17:04:10 +00:00
2017-09-29 23:32:04 +00:00
from .session.org_mode import global_session as session
2018-04-15 15:07:29 +00:00
from .atoms import Atom
2017-05-23 17:04:10 +00:00
from . import parsing
from . import knowledge_evaluation
from .modifiable_property import is_modifiable_property
def diff_knowledge(before, after):
    """Return the ``jsondiff`` difference between two knowledge snapshots.

    ``jsondiff`` is imported inside the function, so the dependency is only
    required when a diff is actually computed.

    Args:
        before: Snapshot taken before a change.
        after: Snapshot taken after the change.

    Returns:
        Whatever ``jsondiff.diff(before, after)`` returns.
    """
    from jsondiff import diff as compute_diff

    return compute_diff(before, after)
class KnowledgeBase(object):
    """Holds the accumulated knowledge together with the language data
    (examples, trained matchers, tokenizations) used to build and query it.

    Every method that talks to ``parsing`` passes ``self`` along, so the
    attributes set in ``__init__`` are also read by that module.
    """

    def __init__(self, knowledge=None, examples=None, trained=None):
        """Create a knowledge base, optionally seeded with existing data.

        Each argument is shallow-copied, so rebinding or appending on the
        instance does not affect the caller's containers (nested objects are
        still shared).

        Args:
            knowledge: Initial knowledge store; an empty dict when omitted.
            examples: Initial list of examples; an empty list when omitted.
            trained: Initial list of trained data; an empty list when omitted.
        """
        # ``None`` sentinels instead of ``{}`` / ``[]`` defaults: default
        # values are created once per function definition, not once per call.
        self.knowledge = {} if knowledge is None else copy.copy(knowledge)
        # ``example['text']`` of every example seen by ``train``.
        self.originals = []
        # ``(decomposition, inferred_tree)`` pairs appended by ``train``.
        self.examples = [] if examples is None else copy.copy(examples)
        # Result of the last ``parsing.reprocess_language_knowledge`` call.
        self.trained = [] if trained is None else copy.copy(trained)
        # Tokenizations registered through ``add_tokenization``.
        self.tokenization = set()
        # Non-``Atom`` elements found inside registered tokenizations.
        self.structural_elements = set()

    @staticmethod
    def _knowledge_diff_getter(knowledge_before, knowledge_after):
        """Return a zero-argument callable that diffs the two snapshots.

        The diff is only computed when the callable is invoked, so callers
        that never look at it do not pay for it.
        """
        def knowledge_diff_getter():
            return diff_knowledge(knowledge_before, knowledge_after)

        return knowledge_diff_getter

    def train_tokenizer(self, example):
        """Feed one tokenization example to ``parsing.integrate_tokenization``."""
        with session().log('Train'):
            parsing.integrate_tokenization(self, example)

    def train(self, examples):
        """Integrate a batch of examples into the knowledge base.

        Args:
            examples: Iterable of dict-like examples. Each one must have a
                ``'text'`` key; a ``'parsed'`` key holding a tuple is
                integrated directly before the language itself is analysed.

        Returns:
            A zero-argument callable that returns the diff between the
            knowledge before and after training.
        """
        knowledge_before = copy.deepcopy(self.knowledge)

        with session().log('Train'):
            # Parse everything
            for example in examples:
                # If there's parsed data, leverage it ASAP
                if 'parsed' in example and isinstance(example['parsed'], tuple):
                    with session().log('parsed information integration'):
                        result = knowledge_evaluation.integrate_information(self.knowledge, {
                            "parsed": example['parsed'],
                        })
                        self.act_upon(result)

                with session().log("language integration"):
                    tokens, decomposition, inferred_tree = parsing.integrate_language(self, example)
                    session().annotate("Tokens: {}".format(tokens))
                    session().annotate("Inferred tree: {}".format(inferred_tree))

                with session().log("full information integration"):
                    result = knowledge_evaluation.integrate_information(self.knowledge, {
                        "elements": tokens,
                        "decomposition": decomposition,
                        "parsed": inferred_tree,
                    })

                    # Logged twice on purpose: once before and once after the
                    # result is applied through ``act_upon``.
                    session().annotate("Result: {}".format(self.get_value(result)))
                    self.act_upon(result)
                    session().annotate("Set: {}".format(self.get_value(result)))

                    self.examples.append((decomposition, inferred_tree))
                    self.originals.append(example['text'])

            # Reduce values
            with session().log("reprocessing"):
                self.trained = parsing.reprocess_language_knowledge(self, self.examples)

            knowledge_after = copy.deepcopy(self.knowledge)
            return self._knowledge_diff_getter(knowledge_before, knowledge_after)

    def tokenize(self, row, return_one=True):
        """Tokenize ``row`` (lower-cased first) using ``parsing.to_tokens``.

        Args:
            row: Text to tokenize.
            return_one: When true, return only the tokenization chosen by
                ``parsing.pick_one_tokenization``; otherwise return the list
                of every candidate.

        Returns:
            A single tokenization, or the list of all candidate tokenizations.
        """
        row = row.lower()
        with session().log("Tokenize: {}".format(row)):
            options = list(parsing.to_tokens(self, row))
            # NOTE(review): ``log`` is used as a context manager everywhere
            # else in this class, while plain messages go through
            # ``annotate``. These two bare ``log`` calls may have been meant
            # to be ``annotate`` -- confirm against the session API.
            session().log("Results:\n{}".format('\n'.join(map(str, options))))

            if return_one:
                chosen = parsing.pick_one_tokenization(options)
                session().log("Chosen: “{}".format(chosen))
                return chosen
            return options

    def add_tokenization(self, tokenization):
        """Register a tokenization and record its new structural elements.

        Args:
            tokenization: Hashable iterable of elements (it is stored in a
                set). Every element that is not an ``Atom`` and is not known
                yet is added to ``self.structural_elements``.
        """
        with session().log('Added tokenization: “{}'.format(tokenization)):
            self.tokenization.add(tokenization)
            for element in tokenization:
                if isinstance(element, Atom):
                    continue
                if element in self.structural_elements:
                    continue
                session().annotate('Found new structural element “{}'.format(element))
                self.structural_elements.add(element)

    def process(self, row):
        """Tokenize and interpret ``row`` against the current knowledge.

        Args:
            row: Text to process.

        Returns:
            ``None`` when ``parsing.get_fit`` finds no fit for the tokens.
            Otherwise a ``(result, inferred_tree, knowledge_diff_getter)``
            tuple, where ``knowledge_diff_getter`` is a zero-argument callable
            returning the diff between the knowledge before and after
            processing. Callers must check for ``None`` before unpacking.
        """
        knowledge_before = copy.deepcopy(self.knowledge)

        with session().log("Process: {}".format(row)):
            tokens = self.tokenize(row)

            fit = parsing.get_fit(self, tokens)
            if fit is None:
                return None

            tokens, inferred_tree = fit
            result = knowledge_evaluation.integrate_information(self.knowledge, {
                "elements": tokens,
                "parsed": inferred_tree,
            })
            self.act_upon(result)
            session().annotate("Result: {}".format(result))

            knowledge_after = copy.deepcopy(self.knowledge)
            knowledge_diff_getter = self._knowledge_diff_getter(knowledge_before,
                                                                knowledge_after)

            return result, inferred_tree, knowledge_diff_getter

    def get_value(self, result):
        """Return ``result.getter()`` for modifiable properties, else ``result``."""
        if is_modifiable_property(result):
            return result.getter()
        return result

    def act_upon(self, result):
        """Apply ``result`` by calling its ``setter``; warn when it has none.

        Results that are not modifiable properties cannot be applied, so they
        are only reported through ``logging.warning``.
        """
        if is_modifiable_property(result):
            result.setter()
        else:
            logging.warning("Cannot act upon: %s", result)