import copy

import logging
from .session.org_mode import global_session as session

from .atoms import Atom
from . import parsing
from . import knowledge_evaluation
from .modifiable_property import is_modifiable_property
import random


def diff_knowledge(before, after):
    """Return the ``jsondiff`` difference between two knowledge snapshots.

    ``jsondiff`` is imported lazily so that it is only required when a diff
    is actually requested.
    """
    import jsondiff
    return jsondiff.diff(before, after)


def randomized_weighted_list(elements):
    """Yield every distinct element of ``elements`` once, in random order.

    The input is copied and shuffled before iteration. Duplicates are
    skipped after their first appearance, so an element that occurs more
    often in ``elements`` tends to be yielded earlier. Elements must be
    hashable.
    """
    shuffled = list(elements)
    random.shuffle(shuffled)

    seen = set()
    for element in shuffled:
        if element in seen:
            continue

        yield element
        seen.add(element)


class KnowledgeBase(object):
    """Container for learnt knowledge, training examples and token statistics."""

    def __init__(self, knowledge=None, examples=None, trained=None):
        """Create a knowledge base from optional initial state.

        Each argument is shallow-copied, so later top-level changes made by
        this instance do not affect the caller's containers. Omitted (or
        ``None``) arguments start empty.
        """
        self.knowledge = {} if knowledge is None else copy.copy(knowledge)
        self.originals = []
        self.examples = [] if examples is None else copy.copy(examples)
        self.trained = [] if trained is None else copy.copy(trained)
        self.structural_elements = set()
        self.token_chains = {}
        self.tokens = set()

    def add_token_pair(self, precedent, consequent):
        """Register both tokens and record that ``consequent`` followed ``precedent``."""
        self.add_token(precedent)
        self.add_token(consequent)

        # Repeated pairs are kept on purpose: the list is later fed to
        # randomized_weighted_list(), where repetition affects ordering.
        self.token_chains.setdefault(precedent, []).append(consequent)

    def add_token(self, token):
        """Add ``token`` to the known tokens.

        Tokens that are not ``Atom`` instances are also tracked as structural
        elements; the first time one is seen it is annotated in the session.
        """
        self.tokens.add(token)
        if (not isinstance(token, Atom)) and (token not in self.structural_elements):
            session().annotate('Found new structural element “{}”'.format(token))
            self.structural_elements.add(token)

    def expected_token_after_precedent(self, precedent=None):
        """Return an iterator over candidate tokens that may follow ``precedent``.

        Candidates come from the recorded token chains. They are yielded in
        random order and without repetition.
        """
        if precedent not in self.token_chains:
            # If there's no known precedent, just return all tokens
            return randomized_weighted_list(self.tokens)

        return randomized_weighted_list(self.token_chains[precedent])

    def train_tokenizer(self, example):
        """Integrate a tokenization example and make sure its tokens are known.

        The example is passed to ``parsing.integrate_tokenization``; every
        token it returns gets an (initially empty) entry in ``self.knowledge``
        unless one already exists.
        """
        with session().log('Training tokenizer'):
            session().annotate("Example: {}".format(example))
            tokens = parsing.integrate_tokenization(self, example)

            # Integrate knowledge of concept
            for token in tokens:
                if token not in self.knowledge:
                    self.knowledge[token] = {}

    def train(self, examples):
        """Train on ``examples`` and return a lazy knowledge-diff getter.

        Each example must be a mapping with a ``'text'`` key; it may also
        carry a ``'parsed'`` tuple, which is integrated before the language
        integration step.

        Returns:
            A zero-argument callable that computes the diff between deep
            copies of ``self.knowledge`` taken before and after training.
        """
        knowledge_before = copy.deepcopy(self.knowledge)
        with session().log('Train'):
            # Parse everything
            for example in examples:
                # If there's parsed data, leverage it ASAP
                if 'parsed' in example and isinstance(example['parsed'], tuple):
                    with session().log('parsed information integration'):
                        result = knowledge_evaluation.integrate_information(self.knowledge, {
                            "parsed": example['parsed'],
                        })
                        self.act_upon(result)

                with session().log("language integration"):
                    tokens, decomposition, inferred_tree = parsing.integrate_language(self, example)
                    session().annotate("Tokens: {}".format(tokens))
                    session().annotate("Inferred tree: {}".format(inferred_tree))

                with session().log("full information integration"):
                    result = knowledge_evaluation.integrate_information(self.knowledge, {
                        "elements": tokens,
                        "decomposition": decomposition,
                        "parsed": inferred_tree,
                    })

                    session().annotate("Result: {}".format(self.get_value(result)))
                    self.act_upon(result)
                    session().annotate("Set: {}".format(self.get_value(result)))
                    self.examples.append((decomposition, inferred_tree))
                    self.originals.append(example['text'])

            # Reduce values
            with session().log("reprocessing"):
                self.trained = parsing.reprocess_language_knowledge(self, self.examples)

            knowledge_after = copy.deepcopy(self.knowledge)
            return lambda: diff_knowledge(knowledge_before, knowledge_after)

    def tokenize(self, row, return_one=True):
        """Tokenize ``row`` after lower-casing it.

        With ``return_one`` true (the default) a single tokenization is
        picked, fed back into the tokenizer training and returned; otherwise
        the list of all tokenization options is returned.
        """
        row = row.lower()
        with session().log("Tokenize: {}".format(row)):
            options = list(parsing.to_tokens(self, row))
            # NOTE(review): log() is used as a context manager everywhere
            # else in this file; here its result is discarded. Confirm
            # whether annotate() was intended.
            session().log("Results:\n{}".format('\n'.join(map(str, options))))

            if return_one:
                chosen = parsing.pick_one_tokenization(options, self)
                session().log("Chosen: “{}”".format(chosen))
                self.train_tokenizer({'text': row, 'tokens': chosen})
                return chosen
            return options

    def process(self, row):
        """Tokenize, fit and evaluate ``row`` against the knowledge base.

        Returns:
            ``None`` when ``parsing.get_fit`` finds no fit. Otherwise a
            ``(result, inferred_tree, knowledge_diff_getter)`` tuple, where
            the getter is a zero-argument callable computing the diff between
            deep copies of ``self.knowledge`` taken before and after
            processing.
        """
        knowledge_before = copy.deepcopy(self.knowledge)
        with session().log("Process: {}".format(row)):
            tokens = self.tokenize(row)

            fit = parsing.get_fit(self, tokens)
            if fit is None:
                return None

            tokens, inferred_tree = fit
            result = knowledge_evaluation.integrate_information(self.knowledge, {
                "elements": tokens,
                "parsed": inferred_tree,
            })
            self.act_upon(result)
            session().annotate("Result: {}".format(result))

            knowledge_after = copy.deepcopy(self.knowledge)
            return (
                result,
                inferred_tree,
                lambda: diff_knowledge(knowledge_before, knowledge_after),
            )

    def get_value(self, result):
        """Return ``result.getter()`` for modifiable properties, else ``result`` itself."""
        if is_modifiable_property(result):
            return result.getter()
        return result

    def act_upon(self, result):
        """Call ``result.setter()`` for modifiable properties; otherwise log a warning."""
        if is_modifiable_property(result):
            result.setter()
        else:
            logging.warning("Cannot act upon: {}".format(result))