#!/usr/bin/env python
"""Helpers that turn example sentences into token templates and match them
against the patterns stored in a knowledge base."""

import re
from functools import reduce

import knowledge_evaluation


def make_template(knowledge_base, text, parsed):
    """Tokenize *text* and relate its tokens to the elements of *parsed*.

    Returns a ``(tokens, matcher, template)`` tuple:

    * ``tokens``: the words and single non-space characters found in *text*.
    * ``matcher``: a copy of ``tokens`` where every token that also appears in
      *parsed* is replaced by ``{'groups': set(...)}``, built from
      ``knowledge_base.knowledge[token]['groups']``.
    * ``template``: a copy of *parsed* where, for each such token, the first
      still-unreplaced occurrence is replaced by the token's index.

    Raises ``KeyError`` when a token present in *parsed* has no entry in
    ``knowledge_base.knowledge``.
    """
    tokens = re.findall(r'(\w+|[^\s])', text)
    matcher = list(tokens)
    template = list(parsed)
    for i, word in enumerate(tokens):
        if word in template:
            template[template.index(word)] = i
            matcher[i] = {
                'groups': set(knowledge_base.knowledge[word]['groups'])
            }
    return tokens, matcher, template


def is_bottom_level(tree):
    """Return True when *tree* contains no nested list or tuple."""
    return not any(isinstance(element, (list, tuple)) for element in tree)


def get_lower_levels(parsed):
    """Collect the nested subtrees of *parsed* that have no nested children.

    *parsed* itself is never included, even when it is flat.
    """
    lower = []

    def aux(subtree, top_level):
        # Starting with deeper=True for the top level keeps it out of `lower`.
        deeper = top_level
        for element in subtree:
            if isinstance(element, (list, tuple)):
                aux(element, top_level=False)
                deeper = True

        if not deeper:
            lower.append(subtree)

    aux(parsed, top_level=True)
    return lower


def integrate_language(knowledge_base, example):
    """Build the template for *example* and print remix diagnostics.

    *example* is a mapping with a ``"text"`` string and a ``"parsed"`` tree.
    The text is lower-cased before tokenizing. For every lowest-level subtree
    of the parsed tree, the most similar trained tree is looked up and the
    resulting remix is printed; this only produces output and does not affect
    the returned value.

    Returns the ``(tokens, matcher, template)`` tuple produced by
    ``make_template`` for the whole parsed tree.
    """
    text = example["text"].lower()
    parsed = example["parsed"]

    print("P:", parsed)
    # Only a single pass over the lowest levels is performed for now.
    lower_levels = get_lower_levels(parsed)
    print("Lower:", lower_levels)

    for atom in lower_levels:
        print("\x1b[1mSelecting\x1b[0m:", atom)
        similar = get_similar_tree(knowledge_base, atom)
        print("___>", similar)
        if similar is None:
            # Nothing trained resembles this atom, so there is nothing to
            # remix against; the diagnostics for it are skipped.
            continue

        remix, (start_bounds, end_bounds) = build_remix_matrix(
            knowledge_base, text, atom, similar)
        tokens, matcher, result = make_template(knowledge_base, text, atom)
        print("Tx:", tokens)
        print("Mx:", matcher)
        print("Rx:", result)
        print("Remix:", remix)

        # An explicit end index is required: `-len(end_bounds)` would be `-0`
        # (i.e. 0) when there are no trailing bounds, giving an empty slice.
        inner_tokens = tokens[len(start_bounds):len(tokens) - len(end_bounds)]
        after_remix = apply_remix(inner_tokens, remix)
        assert(len(after_remix) + len(start_bounds) + len(end_bounds) == len(tokens))
        print(" \\->", after_remix)

        print("#########")

    tokens, matcher, result = make_template(knowledge_base, text, parsed)
    print("T:", tokens)
    print("M:", matcher)
    print("R:", result)
    print()
    return tokens, matcher, result


def apply_remix(tokens, remix):
    """Return the elements of *tokens* picked in the order given by *remix*."""
    return [tokens[i] for i in remix]


def build_remix_matrix(knowledge_base, text, atom, similar):
    """Choose a remix for *atom* against the *similar* trained entry.

    *similar* is a 4-tuple as returned by ``get_similar_tree``; only its first
    element (the trained matcher) is used here.

    Returns ``(chosen_remix, (start_bounds, end_bounds))``. The bounds are the
    leading and trailing ``(index, element)`` pairs of the atom's matcher that
    do not appear in the trained matcher; the remix is computed on what
    remains after removing them.
    """
    tokens, matcher, result = make_template(knowledge_base, text, atom)
    similar_matcher, similar_result, similar_result_resolved, _ = similar

    start_bounds, end_bounds = find_bounds(matcher, similar_matcher)

    # Indexes are removed from highest to lowest so earlier pops do not shift
    # the positions still pending.
    for i, element in (end_bounds + start_bounds[::-1]):
        matcher.pop(i)
        tokens.pop(i)

    if not matcher:
        # Everything was trimmed as bounds: the only possible remix is empty.
        return [], (start_bounds, end_bounds)

    possible_remixes = get_possible_remixes(matcher, similar_matcher)
    chosen_remix = possible_remixes[0]

    return chosen_remix, (start_bounds, end_bounds)


def get_possible_remixes(matcher, similar_matcher):
    """List candidate index combinations mapping *matcher* to *similar_matcher*.

    For each element of *matcher* the positions where it occurs in
    *similar_matcher* are collected; the k-th combination takes the k-th
    position of every element. Every element of *matcher* is asserted to be
    present in *similar_matcher*.
    """
    matrix = []
    for element in matcher:
        assert(element in similar_matcher)
        matrix.append(all_indexes(similar_matcher, element))

    # TODO: do some scoring to find the most "interesting combination"
    return [list(combination) for combination in zip(*matrix)]


def all_indexes(collection, element):
    """Return every index of *collection* whose item equals *element*."""
    return [i for i, candidate in enumerate(collection)
            if candidate == element]


def find_bounds(matcher, similar_matcher):
    """Find the leading and trailing elements missing from *similar_matcher*.

    Returns ``(start_bounds, end_bounds)``, both lists of ``(index, element)``
    pairs. ``start_bounds`` holds the leading run in ascending index order and
    ``end_bounds`` the trailing run in descending index order. An index is
    never reported in both lists: when no element of *matcher* is found in
    *similar_matcher*, all of them are reported in ``start_bounds``.
    """
    start_bounds = []
    for i, element in enumerate(matcher):
        if element in similar_matcher:
            break
        start_bounds.append((i, element))

    end_bounds = []
    # Stop the backwards scan where the leading run ended, so the two runs
    # cannot overlap.
    for i in range(len(matcher) - 1, len(start_bounds) - 1, -1):
        element = matcher[i]
        if element in similar_matcher:
            break
        end_bounds.append((i, element))

    return start_bounds, end_bounds


def get_similar_tree(knowledge_base, atom):
    """Pick the trained bottom-level tree that best resembles *atom*.

    Candidates are the ``(entry, tree)`` pairs of ``knowledge_base.trained``
    whose tree has no nested elements and whose first element equals
    ``atom[0]``.

    Returns ``(entry, tree, resolved, score)`` for the highest-scoring
    candidate (the earliest one on ties), or ``None`` when there is none.
    """
    possibilities = []

    # Find matching possibilities
    for entry, tree in knowledge_base.trained:
        if not is_bottom_level(tree):
            continue
        if tree[0] == atom[0]:
            possibilities.append((entry, tree))

    # Sort by more matching elements
    scored_possibilities = []
    for raw, possibility in possibilities:
        resolved = []
        for element in atom:
            if isinstance(element, str):
                resolved.append(element)
            else:
                resolved.append(knowledge_evaluation.resolve(
                    knowledge_base.knowledge,
                    element,
                    raw))

        # TODO: Probably should take into account the categories of the elements in the "intake" ([0]) element
        # NOTE(review): `resolved` is derived from `atom`, so string elements
        # always count as matches here; comparing against `possibility` may
        # have been intended -- confirm.
        score = sum([resolved[i] == atom[i]
                     for i in range(min(len(resolved), len(atom)))])
        scored_possibilities.append((raw, possibility, resolved, score))

    # The sort is stable, so ties keep their original relative order.
    scored_possibilities.sort(key=lambda p: p[3], reverse=True)
    if not scored_possibilities:
        return None
    return scored_possibilities[0]


def get_matching(sample, other):
    """Generalize the matcher of *sample* using compatible entries of *other*.

    Only the first item (the matcher) of each entry is inspected. An entry of
    *other* is compatible when its matcher has the same length and, at every
    position where the sample holds a ``{'groups': ...}`` element, it also
    holds a non-string element sharing at least one group.

    Returns a list with the sample's strings kept as-is and each group element
    narrowed to the groups shared with all compatible entries, or ``[]`` when
    no compatible entry exists.
    """
    pattern = sample[0]
    size = len(pattern)
    candidates = [entry for entry in other if len(entry[0]) == size]

    for i, element in enumerate(pattern):
        if not candidates:
            return []

        if not isinstance(element, str):
            candidates = [
                entry for entry in candidates
                if not isinstance(entry[0][i], str)
                and len(entry[0][i]['groups'] & element['groups']) > 0
            ]

    if not candidates:
        # The filter on the last position may also discard every candidate;
        # without this check the intersection below had nothing to reduce.
        return []

    matching = []
    for i, element in enumerate(pattern):
        if isinstance(element, str):
            matching.append(element)
        else:
            shared = reduce(lambda a, b: a & b,
                            (entry[0][i]['groups'] for entry in candidates),
                            element['groups'])
            matching.append({'groups': shared})
    return matching


def reprocess_language_knowledge(knowledge_base, examples):
    """Recompute generalized patterns from stored plus new *examples*.

    Every example (``knowledge_base.examples`` followed by *examples*) is
    matched against all the others with ``get_matching``. Progress is printed
    along the way.

    Returns the list of ``(match, example[1])`` pairs for the examples that
    produced a non-empty match.
    """
    examples = knowledge_base.examples + examples

    print('\n'.join(map(str, knowledge_base.examples)))
    print("--")

    pattern_examples = []
    for i, sample in enumerate(examples):
        other = examples[:i] + examples[i + 1:]
        match = get_matching(sample, other)
        print("->", match)
        # NOTE(review): the original formatting was lost; this assumes only
        # matched samples are kept (and followed by a blank line) -- confirm.
        if len(match) > 0:
            sample = (match, sample[1],)
            pattern_examples.append(sample)
            print()
    print("\x1b[7m--\x1b[0m")
    return pattern_examples


def get_fit(knowledge, row):
    """Find the first trained sample that fits the words of *row*.

    *row* is lower-cased and split on whitespace. A sample fits when it has
    the same number of elements and each of its string elements equals the
    word at the same position; non-string elements accept any word.

    Returns ``(words, sample, ast)`` for the first fit in
    ``knowledge.trained``, or ``None`` when no sample fits.
    """
    row = row.lower().split()
    for sample, ast in knowledge.trained:
        if len(sample) != len(row):
            continue

        if all(not isinstance(expected, str) or expected == word
               for expected, word in zip(sample, row)):
            return row, sample, ast
    return None