import copy
import logging

from .session.org_mode import global_session as session

from .atoms import Atom
from . import parsing
from . import knowledge_evaluation
from .modifiable_property import is_modifiable_property


def diff_knowledge(before, after):
    import jsondiff
    return jsondiff.diff(before, after)


class KnowledgeBase(object):
    def __init__(self, knowledge={}, examples=[], trained=[]):
        self.knowledge = copy.copy(knowledge)
        self.originals = []
        self.examples = copy.copy(examples)
        self.trained = copy.copy(trained)
        self.tokenization = set()
        self.structural_elements = set()

    def train_tokenizer(self, example):
        with session().log('Train'):
            parsing.integrate_tokenization(self, example)

    def train(self, examples):
        knowledge_before = copy.deepcopy(self.knowledge)
        with session().log('Train'):
            # Parse everything
            for example in examples:
                # If there's parsed data, leverage it ASAP
                if 'parsed' in example and isinstance(example['parsed'], tuple):
                    with session().log('parsed information integration'):
                        result = knowledge_evaluation.integrate_information(self.knowledge, {
                            "parsed": example['parsed'],
                        })
                        self.act_upon(result)

                with session().log("language integration"):
                    tokens, decomposition, inferred_tree = parsing.integrate_language(self, example)
                    session().annotate("Tokens: {}".format(tokens))
                    session().annotate("Inferred tree: {}".format(inferred_tree))

                with session().log("full information integration"):
                    result = knowledge_evaluation.integrate_information(self.knowledge, {
                        "elements": tokens,
                        "decomposition": decomposition,
                        "parsed": inferred_tree,
                    })

                    session().annotate("Result: {}".format(self.get_value(result)))
                    self.act_upon(result)
                    session().annotate("Set: {}".format(self.get_value(result)))

                self.examples.append((decomposition, inferred_tree))
                self.originals.append(example['text'])

            # Reduce values
            with session().log("reprocessing"):
                self.trained = parsing.reprocess_language_knowledge(self, self.examples)

        knowledge_after = copy.deepcopy(self.knowledge)
        knowledge_diff_getter = lambda: diff_knowledge(knowledge_before, knowledge_after)

        return knowledge_diff_getter

    def tokenize(self, row, return_one=True):
        row = row.lower()
        with session().log("Tokenize: {}".format(row)):
            options = list(parsing.to_tokens(self, row))
            session().log("Results:\n{}".format('\n'.join(map(str, options))))

            if return_one:
                chosen = parsing.pick_one_tokenization(options, self)
                session().log("Chosen: “{}”".format(chosen))
                return chosen
            return options

    def add_tokenization(self, tokenization):
        with session().log('Added tokenization: “{}”'.format(tokenization)):
            self.tokenization.add(tokenization)
            for e in tokenization:
                if (not isinstance(e, Atom)) and (e not in self.structural_elements):
                    session().annotate('Found new structural element “{}”'.format(e))
                    self.structural_elements.add(e)

    def process(self, row):
        knowledge_before = copy.deepcopy(self.knowledge)
        with session().log("Process: {}".format(row)):
            tokens = self.tokenize(row)
            print(tokens)
            fit = parsing.get_fit(self, tokens)
            if fit is None:
                return None

            tokens, inferred_tree = fit
            result = knowledge_evaluation.integrate_information(self.knowledge, {
                "elements": tokens,
                "parsed": inferred_tree,
            })
            self.act_upon(result)
            session().annotate("Result: {}".format(result))

        knowledge_after = copy.deepcopy(self.knowledge)
        knowledge_diff_getter = lambda: diff_knowledge(knowledge_before, knowledge_after)

        return result, inferred_tree, knowledge_diff_getter

    def get_value(self, result):
        if is_modifiable_property(result):
            return result.getter()
        else:
            return result

    def act_upon(self, result):
        if is_modifiable_property(result):
            result.setter()
        else:
            logging.warning("Cannot act upon: {}".format(result))
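

# --- Usage sketch ----------------------------------------------------------
# Illustrative only: a minimal sketch of how this class is meant to be driven
# (train on examples, then process new sentences). The exact shape of a
# training example is defined by parsing.integrate_language and
# knowledge_evaluation.integrate_information; the 'text' and 'parsed' keys
# below are the only ones this module itself reads, so any further required
# fields are an assumption. Because of the relative imports, run it as a
# module inside its package, e.g. `python -m <package>.knowledge_base`.
if __name__ == '__main__':
    kb = KnowledgeBase()

    # Hypothetical examples; real ones must match what integrate_language expects.
    diff_getter = kb.train([
        {'text': 'ice is cold'},
        {'text': 'water is wet'},
    ])
    print(diff_getter())  # jsondiff of the knowledge before vs. after training

    processed = kb.process('is ice cold?')
    if processed is not None:
        result, inferred_tree, process_diff = processed
        print(result, inferred_tree, process_diff())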