diff --git a/.gitignore b/.gitignore
index e9d4714..3c698f6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,5 @@
 *#*
 *~
-.vscode
 *.ba?k
 *.pyc
 __pycache__
-treeNLU-*session*.org
diff --git a/naive-nlu/cli.py b/naive-nlu/cli.py
deleted file mode 100644
index b268191..0000000
--- a/naive-nlu/cli.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from tree_nlu import cli
-
-if __name__ == '__main__':
-    cli.main()
diff --git a/naive-nlu/requirements.txt b/naive-nlu/requirements.txt
index 9891d55..4e6b173 100644
--- a/naive-nlu/requirements.txt
+++ b/naive-nlu/requirements.txt
@@ -1 +1,2 @@
 jsondiff
+hy
diff --git a/naive-nlu/setup.py b/naive-nlu/setup.py
index 8fdc33b..18db1cd 100644
--- a/naive-nlu/setup.py
+++ b/naive-nlu/setup.py
@@ -11,5 +11,6 @@ setup(name='tree_nlu',
       include_package_data=True,
       install_requires = [
           'jsondiff',
+          'hy',
       ],
       zip_safe=False)
diff --git a/naive-nlu/tree_nlu/atoms.py b/naive-nlu/tree_nlu/atoms.py
deleted file mode 100644
index d1de20a..0000000
--- a/naive-nlu/tree_nlu/atoms.py
+++ /dev/null
@@ -1,23 +0,0 @@
-'''
-Analogous to erlang ones.
-
-"An atom is a literal, a constant with name."
-'''
-
-from collections import namedtuple
-
-Atom = namedtuple('Atom', field_names='name')
-
-def is_atom(element, name=None):
-    '''Check if an element is an atom with a specific name.'''
-    if not isinstance(element, Atom):
-        return False
-
-    if name is None:
-        return True
-
-    return element.name == name
-
-def a(name):
-    '''Build an atom with a given name.'''
-    return Atom(name)
diff --git a/naive-nlu/tree_nlu/cli.py b/naive-nlu/tree_nlu/cli.py
deleted file mode 100644
index 7434d12..0000000
--- a/naive-nlu/tree_nlu/cli.py
+++ /dev/null
@@ -1,65 +0,0 @@
-import logging
-import datetime
-from .session.org_mode import (
-    global_session as session,
-    create_global_session,
-)
-from .knowledge_base import KnowledgeBase
-from .visualization import (
-    show_knowledge,
-    show_samples,
-)
-from .tests import gac_100
-from .modifiable_property import (
-    ModifiableProperty,
-    ModifiablePropertyWithAst,
-    is_modifiable_property,
-)
-
-
-bye_phrases = ['bye', 'exit']
-
-
-def gen_session_name():
-    now = datetime.datetime.utcnow()
-    return "treeNLU-cli-session-{}.org".format(
-        now.strftime("%y_%m_%d %H:%M:%S_%f"))
-
-
-def main():
-    create_global_session(gen_session_name())
-    logging.getLogger().setLevel(logging.INFO)
-    knowledge = gac_100.main()
-    logging.getLogger().setLevel(logging.DEBUG)
-    while True:
-        try:
-            data = input("> ").strip()
-        except EOFError:
-            print("bye")
-            break
-        if data.lower() in bye_phrases:
-            break
-        if not data:
-            continue
-
-        if data == '/show':
-            show_knowledge(knowledge)
-            continue
-        elif data == '/samples':
-            show_samples(knowledge)
-            continue
-
-        with session().log(data):
-            ret = knowledge.process(data)
-            if ret:
-                result, _, _ = ret
-                if not is_modifiable_property(result):
-                    print("<", result)
-                else:
-                    result.setter()
-                    print("OK")
-            elif ret is None:
-                print("- Couldn't understand that, oops... -")
-            else:
-                print("Unhandled response:", ret)
-    print("< Bye!")
diff --git a/naive-nlu/tree_nlu/knowledge_base.py b/naive-nlu/tree_nlu/knowledge_base.py
index f33b39f..4c27700 100644
--- a/naive-nlu/tree_nlu/knowledge_base.py
+++ b/naive-nlu/tree_nlu/knowledge_base.py
@@ -1,93 +1,70 @@
 import copy
+
 import logging
-from .session.org_mode import global_session as session
-
-from .atoms import Atom
-from . import layered_model
+from . import parsing
 from . import knowledge_evaluation
 from .modifiable_property import is_modifiable_property
-import random
+
 
 def diff_knowledge(before, after):
     import jsondiff
     return jsondiff.diff(before, after)
 
-
 class KnowledgeBase(object):
-    def __init__(self, knowledge={}, examples=[], trained=[]):
+    def __init__(self, knowledge, examples=[], trained=[]):
         self.knowledge = copy.copy(knowledge)
-        self.originals = []
         self.examples = copy.copy(examples)
         self.trained = copy.copy(trained)
-        self.layers = layered_model.BaseModel(self)
 
-    ## Parsing
     def train(self, examples):
         knowledge_before = copy.deepcopy(self.knowledge)
-        with session().log('Train'):
-            # Parse everything
-            for example in examples:
-                # If there's parsed data, leverage it ASAP
-                if 'parsed' in example and isinstance(example['parsed'], tuple):
-                    with session().log('parsed information integration'):
-                        result = knowledge_evaluation.integrate_information(self.knowledge, {
-                            "parsed": example['parsed'],
-                        })
-                        self.act_upon(result)
-                with session().log("language integration"):
-                    for tokens, decomposition, inferred_tree in self.layers.integrate(self, example):
-                        session().annotate("Tokens: {}".format(tokens))
-                        session().annotate("Inferred tree: {}".format(inferred_tree))
+        # Parse everything
+        parsed_examples = []
+        for example in examples:
+            logging.info("\x1b[7;32m> {} \x1b[0m".format(example))
+            tokens, decomposition, inferred_tree = parsing.integrate_language(self, example)
+            logging.info(tokens)
+            result = knowledge_evaluation.integrate_information(self.knowledge, {
+                "elements": tokens,
+                "decomposition": decomposition,
+                "parsed": inferred_tree,
+            })
 
-                with session().log("full information integration"):
-                    tokens = self.layers.tokenization.tokenize(example['text'], return_one=True)
-                    result = knowledge_evaluation.integrate_information(self.knowledge, {
-                        "elements": tokens,
-                        "decomposition": decomposition,
-                        "parsed": inferred_tree,
-                    })
+            logging.info("\x1b[7;33m< {} \x1b[0m".format(self.get_value(result)))
+            self.act_upon(result)
+            logging.info("\x1b[7;34m> set: {} \x1b[0m".format(self.get_value(result)))
+            self.examples.append((decomposition, inferred_tree))
 
-                    session().annotate("Result: {}".format(self.get_value(result)))
-                    self.act_upon(result)
-                    session().annotate("Set: {}".format(self.get_value(result)))
-                    self.examples.append((decomposition, inferred_tree))
-                    self.originals.append(example['text'])
+        # Reduce values
+        self.trained = parsing.reprocess_language_knowledge(self, self.examples)
 
-            # Reduce values
-            with session().log("reprocessing"):
-                res = self.layers.reprocess(self.examples)
-                self.trained = res
-
-        knowledge_after = copy.deepcopy(self.knowledge)
-        knowledge_diff_getter = lambda: diff_knowledge(knowledge_before,
+        knowledge_after = copy.deepcopy(self.knowledge)
+        knowledge_diff_getter = lambda: diff_knowledge(knowledge_before,
                                                        knowledge_after)
 
-        return knowledge_diff_getter
+        return knowledge_diff_getter
+
 
     def process(self, row):
         knowledge_before = copy.deepcopy(self.knowledge)
-        with session().log("Process: {}".format(row)):
-            fit = list(self.layers.process(self, row))
-            if len(fit) == 0:
-                return None
+        logging.info("\x1b[7;32m> {} \x1b[0m".format(row))
+        tokens = parsing.to_tokens(row)
+        tokens, inferred_tree = parsing.get_fit(self, tokens)
+        result = knowledge_evaluation.integrate_information(self.knowledge,
+                                                            {
+                                                                "elements": tokens,
+                                                                "parsed": inferred_tree,
+                                                            })
+        self.act_upon(result)
 
-            tokens, inferred_tree = fit[0]
-            result = knowledge_evaluation.integrate_information(self.knowledge,
-                                                                {
-                                                                    "elements": tokens,
"parsed": inferred_tree, - }) - self.act_upon(result) - session().annotate("Result: {}".format(result)) + knowledge_after = copy.deepcopy(self.knowledge) + knowledge_diff_getter = lambda: diff_knowledge(knowledge_before, + knowledge_after) - knowledge_after = copy.deepcopy(self.knowledge) - knowledge_diff_getter = lambda: diff_knowledge(knowledge_before, - knowledge_after) - - return result, inferred_tree, knowledge_diff_getter + return result, inferred_tree, knowledge_diff_getter def get_value(self, result): if is_modifiable_property(result): diff --git a/naive-nlu/tree_nlu/knowledge_evaluation.py b/naive-nlu/tree_nlu/knowledge_evaluation.py index e2704f9..a24c07d 100644 --- a/naive-nlu/tree_nlu/knowledge_evaluation.py +++ b/naive-nlu/tree_nlu/knowledge_evaluation.py @@ -1,5 +1,3 @@ -from .session.org_mode import global_session as session - from .modifiable_property import ( ModifiableProperty, ModifiablePropertyWithAst, @@ -11,7 +9,6 @@ def resolve(knowledge_base, elements, value): if isinstance(value, int): return elements[value] elif isinstance(value, tuple) or isinstance(value, list): - session().annotate("V: {} {}".format(value, elements)) return integrate_information(knowledge_base, { "elements": elements, "parsed": value, @@ -44,43 +41,17 @@ def get_subquery_type(knowledge_base, atom): def property_for_value(knowledge_base, value): - if value in knowledge_base: - # Annotate the property as property - groups = knowledge_base[value].get('groups', {'property'}) - groups.add('property') - knowledge_base[value]['groups'] = groups - - # And find the property "name" - if 'as_property' in knowledge_base[value]: - return knowledge_base[value]['as_property'] - - return knowledge_base[value].get('groups', {'property'}) - else: - # Consider that any property is... 
a property - knowledge_base[value] = {'groups': {'property'}} - return {'property'} + return knowledge_base[value]['as_property'] def modifiable_property_from_property(prop, path, value): def getter(): nonlocal prop, path, value - if isinstance(path, set): - # If the property is from a set, it's true if any possible - # path has a element as true - return any(map(lambda possible_path: ((possible_path in prop) - and - (prop[possible_path] == value)), - path)) - else: - return (path in prop) and prop[path] == value + return (path in prop) and prop[path] == value def setter(): nonlocal prop, path, value - if isinstance(path, set): - for possible_path in path: - prop[possible_path] = value - else: - prop[path] = value + prop[path] = value return ModifiableProperty( getter=getter, @@ -103,31 +74,12 @@ def exists_property_with_value(knowledge_base, elements, subj, value): def modifiable_element_for_existance_in_set(container, set_name, element): - session().annotate("-----({} {} {})".format(container, set_name, element)) - def getter(): nonlocal container, set_name, element - session().annotate(" get({} {} {})".format(container, set_name, element)) return (set_name in container) and (element in container[set_name]) def setter(): nonlocal container, set_name, element - session().annotate(" add({} {} {})".format(container, set_name, element)) - return container[set_name].add(element) - - return ModifiableProperty( - getter=getter, - setter=setter, - ) - -def modifiable_element_for_existance_in_group(container, element, backlink, set_name='groups'): - def getter(): - nonlocal container, element, backlink, set_name - return (set_name in container) and (element in container[set_name]) - - def setter(): - nonlocal container, set_name, element - backlink['groups'].add(set_name) return container[set_name].add(element) return ModifiableProperty( @@ -140,23 +92,18 @@ def pertenence_to_group(knowledge_base, elements, subj, group): group = resolve(knowledge_base, elements, group) if subj not in knowledge_base: - knowledge_base[subj] = {'groups': set()} + knowledge_base[subj] = {} if "groups" not in knowledge_base[subj]: knowledge_base[subj]["groups"] = set() - if group not in knowledge_base: - knowledge_base[group] = {'groups': set()} - - if "groups" not in knowledge_base[group]: - knowledge_base[group]["groups"] = set() - - return modifiable_element_for_existance_in_group( + return modifiable_element_for_existance_in_set( container=knowledge_base[subj], - element=group, - backlink=knowledge_base[group], + set_name="groups", + element=group ) + def has_capacity(knowledge_base, elements, subj, capacity): subj = resolve(knowledge_base, elements, subj) capacity = resolve(knowledge_base, elements, capacity) @@ -181,70 +128,12 @@ def question(knowledge_base, elements, subj): return subj.getter() return subj -def implies(knowledge_base, elements, precedent, consequent): - precedent = resolve(knowledge_base, elements, precedent) - consequent = resolve(knowledge_base, elements, consequent) - - if precedent not in knowledge_base: - knowledge_base[precedent] = {'groups': set()} - - if "implications" not in knowledge_base[precedent]: - knowledge_base[precedent]["implications"] = set() - - return modifiable_element_for_existance_in_set( - container=knowledge_base[precedent], - set_name="implications", - element=consequent - ) - - -def property_has_value(knowledge_base, elements, subj, prop, value): - subj = resolve(knowledge_base, elements, subj) - prop = resolve(knowledge_base, elements, prop) - value = 
resolve(knowledge_base, elements, value) - - if subj not in knowledge_base: - knowledge_base[subj] = {'groups': set()} - - if prop not in knowledge_base[subj]: - knowledge_base[subj][prop] = set() - - return modifiable_element_for_existance_in_set( - container=knowledge_base[subj], - set_name=prop, - element=value - ) - -def perform_verb_over_object(knowledge_base, elements, subj, verb, obj): - subj = resolve(knowledge_base, elements, subj) - verb = resolve(knowledge_base, elements, verb) - obj = resolve(knowledge_base, elements, obj) - session().annotate("({} {} {})".format(verb, subj, obj)) - - if subj not in knowledge_base: - knowledge_base[subj] = {'groups': set()} - - if 'performs-over' not in knowledge_base[subj]: - knowledge_base[subj]['performs-over'] = {} - - if verb not in knowledge_base[subj]['performs-over']: - knowledge_base[subj]['performs-over'][verb] = set() - - return modifiable_element_for_existance_in_set( - container=knowledge_base[subj]['performs-over'], - set_name=verb, - element=obj - ) - knowledge_ingestion = { "exists-property-with-value": exists_property_with_value, "pertenence-to-group": pertenence_to_group, "has-capacity": has_capacity, "question": question, - "implies": implies, - "property-has-value": property_has_value, - "perform-verb-over-object": perform_verb_over_object, } @@ -263,29 +152,6 @@ def integrate_information(knowledge_base, example): args = ast[1:] elements = example.get('elements', None) - session().annotate("Integrating:") - session().annotate("AST: {}".format(ast)) - session().annotate("ARG: {}".format(elements)) - session().annotate("------------") - return tagged_with_ast( ast, elements, knowledge_ingestion[method](knowledge_base, elements, *args)) - -def can_be_used_in_place(knowledge, token, minisegment): - if token not in knowledge.knowledge: - return True - - info = knowledge.knowledge[token] - info_groups = info.get('groups', set()) - minisegment_groups = minisegment.get('groups', set()) - - # Common group - if len(info_groups & minisegment_groups) > 0: - return True - - # Neither has a group - elif len(info_groups) == 0 == len(minisegment_groups): - return True - - return False diff --git a/naive-nlu/tree_nlu/layered_model.py b/naive-nlu/tree_nlu/layered_model.py deleted file mode 100644 index 0aee057..0000000 --- a/naive-nlu/tree_nlu/layered_model.py +++ /dev/null @@ -1,49 +0,0 @@ -from .layers import tokenization_layer -from .layers import parsing_layer -from .layers import parsing -from .session.org_mode import global_session as session - - -def make_yield_pipe(layers, knowledge_base, example, func): - if len(layers) < 1: - yield example - return - - input_generator = make_yield_pipe(layers[:-1], knowledge_base, example, func) - for input in input_generator: - session().annotate("[{}] --> {}".format(len(layers), input)) - for d in list(func(layers[-1], input)): - yield d - - -class BaseModel: - def __init__(self, knowledge_base): - self.tokenization = tokenization_layer.TokenizationLayer(knowledge_base) - self.parsing = parsing_layer.ParsingLayer() - - self.layers = [ - self.tokenization, - self.parsing, - ] - - def reprocess(self, examples): - pattern_examples = [] - for i, sample in enumerate(examples): - other = examples[:i] + examples[i + 1:] - match = parsing.get_matching(sample, other) - if len(match) > 0: - sample = (match, sample[1],) - pattern_examples.append(sample) - - return pattern_examples - - def integrate(self, knowledge_base, example): - yield from make_yield_pipe(self.layers, knowledge_base, - example, lambda l, 
-                                   example, lambda l, i: l.integrate(knowledge_base, i))
-
-    def process(self, knowledge_base, example):
-        yield from make_yield_pipe(self.layers, knowledge_base,
-                                   example, lambda l, i: l.process(knowledge_base, i))
-
-    def tokenize(self, row, return_one=True):
-        return self.tokenization.to_tokens(row)
diff --git a/naive-nlu/tree_nlu/layers/parsing.py b/naive-nlu/tree_nlu/layers/parsing.py
deleted file mode 100644
index 69215d0..0000000
--- a/naive-nlu/tree_nlu/layers/parsing.py
+++ /dev/null
@@ -1,500 +0,0 @@
-#!/usr/bin/env python
-
-from ..session.org_mode import global_session as session
-import re
-import copy
-
-from functools import reduce
-from typing import List, Dict
-from ..modifiable_property import ModifiableProperty
-from .. import parameters
-from ..atoms import Atom, a, is_atom
-from .. import knowledge_evaluation
-
-def make_template(knowledge_base, tokens, parsed):
-    matcher = list(tokens)
-    template = list(parsed)
-    session().annotate(" -- MK TEMPLATE --")
-    session().annotate("MATCHR: {}".format(matcher))
-    session().annotate("TEMPLT: {}".format(template))
-    for i in range(len(matcher)):
-        word = matcher[i]
-        if word in template:
-            template[template.index(word)] = i
-        matcher[i] = {
-            'groups': set(knowledge_base.knowledge.get(word, {}).get('groups', set())),
-        }
-    return tokens, matcher, template
-
-
-def is_bottom_level(tree):
-    for element in tree:
-        if isinstance(element, list) or isinstance(element, tuple):
-            return False
-    return True
-
-
-def get_lower_levels(parsed):
-    lower = []
-    def aux(subtree, path):
-        nonlocal lower
-        deeper = len(path) == 0
-        for i, element in enumerate(subtree):
-            if isinstance(element, list) or isinstance(element, tuple):
-                aux(element, path + (i,))
-                deeper = True
-
-        if not deeper:
-            lower.append((path, subtree))
-
-    aux(parsed, path=())
-    return lower
-
-
-# TODO: probably optimize this, it creates lots of unnecessary tuples
-def replace_position(tree, position, new_element):
-    session().annotate("REPLACE POSITIONS:")
-    session().annotate("  TREE  : {}".format(tree))
-    session().annotate("POSITION: {}".format(position))
-    session().annotate("NEW ELEM: {}".format(new_element))
-    session().annotate("------------------")
-
-    def aux(current_tree, remaining_route):
-        if len(remaining_route) == 0:
-            return new_element
-
-        else:
-            step = remaining_route[0]
-            return (
-                tree[:step]
-                + (aux(tree[step], remaining_route[1:]),)
-                + tree[step + 2:]
-            )
-
-    result = aux(tree, position)
-    session().annotate("-RESULT: {}".format(result))
-    return result
-
-
-def integrate_language(knowledge_base, example):
-    text = example["text"].lower()
-    parsed = example["parsed"]
-
-    tokens = example['tokens']
-    resolved_parsed = copy.deepcopy(parsed)
-
-    while True:
-        session().annotate("P: {}".format(resolved_parsed))
-        lower_levels = get_lower_levels(resolved_parsed)
-        session().annotate("Lower: {}".format(lower_levels))
-        if len(lower_levels) == 0:
-            break
-
-        for position, atom in lower_levels:
-            with session().log("Atom {}".format(atom)):
-                result = None
-                similars = get_similar_tree(knowledge_base, atom, tokens)
-                for similar in similars:
-                    result = build_remix_matrix(knowledge_base, tokens, atom, similar)
-                    if result is not None:
-                        break
-                else:
-                    raise Exception('Similar not found')
-
-                remix, (start_bounds, end_bounds) = result
-
-                after_remix = apply_remix(tokens[len(start_bounds):-len(end_bounds)], remix)
-                session().annotate("--FIND MIX--")
-                session().annotate("-MIX- | {}".format(remix))
-                session().annotate("-FRM- | {}".format(tokens))
-                session().annotate("-AFT- | {}".format(after_remix))
-
-                session().annotate("--- TEMPLATE ---")
-
-                _, matcher, result = make_template(knowledge_base, after_remix, atom)
-                session().annotate("Tx: {}".format(after_remix))
-                session().annotate("Mx: {}".format(matcher))
-                session().annotate("Rx: {}".format(result))
-                session().annotate("Sx: {}".format(start_bounds))
-                session().annotate("Ex: {}".format(end_bounds))
-
-
-                assert(len(after_remix) + len(start_bounds) + len(end_bounds) == len(tokens))
-                session().annotate( " +-> {}".format(after_remix))
-                subquery_type = knowledge_evaluation.get_subquery_type(knowledge_base.knowledge, atom)
-                session().annotate(r" \-> <{}>".format(subquery_type))
-
-                # Clean remaining tokens
-                new_tokens = list(tokens)
-                offset = len(start_bounds)
-                for _ in range(len(remix)):
-                    new_tokens.pop(offset)
-
-                # TODO: Get a specific types for... types
-                new_tokens.insert(offset, (subquery_type, remix))
-                tokens = new_tokens
-
-                resolved_parsed = replace_position(resolved_parsed, position, offset)
-                session().annotate("RP: {}".format(resolved_parsed))
-                session().annotate("AT: {}".format(atom))
-                session().annotate("#########")
-
-
-    tokens, matcher, result = make_template(knowledge_base, tokens, resolved_parsed)
-    session().annotate("T: {}".format(tokens))
-    session().annotate("M: {}".format(matcher))
-    session().annotate("R: {}".format(result))
-    session().annotate("---")
-    yield tokens, matcher, result
-
-
-def apply_remix(tokens, remix):
-    rebuilt = []
-    for i in remix:
-        if isinstance(i, int):
-            if i >= len(tokens):
-                return None
-            rebuilt.append(tokens[i])
-        else:
-            assert(isinstance(i, str))
-            rebuilt.append(i)
-    return rebuilt
-
-
-def build_remix_matrix(knowledge_base, tokens, atom, similar):
-    tokens = list(tokens)
-    with session().log("Remix matrix for {} - {}".format(tokens, atom)):
-        tokens, matcher, result = make_template(knowledge_base, tokens, atom)
-        similar_matcher, similar_result, similar_result_resolved, _, _ = similar
-
-        start_bounds, end_bounds = find_bounds(knowledge_base, matcher, similar_matcher)
-
-        for i, element in (end_bounds + start_bounds[::-1]):
-            matcher.pop(i)
-            tokens.pop(i)
-
-        possible_remixes = get_possible_remixes(knowledge_base, matcher, similar_matcher)
-        session().annotate("Possible remixes: {}".format(possible_remixes))
-        if len(possible_remixes) < 1:
-            return None
-
-        chosen_remix = possible_remixes[0]
-
-        return chosen_remix, (start_bounds, end_bounds)
-
-
-def get_possible_remixes(knowledge_base, matcher, similar_matcher):
-
-    matrix = []
-    with session().log("Possible remixes from matcher: {}".format(matcher)):
-        for element in matcher:
-            with session().log("Element `{}`".format(element)):
-                session().annotate("Similar `{}`".format(similar_matcher))
-                if element in similar_matcher or isinstance(element, dict):
-                    if isinstance(element, dict):
-                        indexes = all_matching_indexes(knowledge_base, similar_matcher, element)
-                        session().annotate("Dict element matching: {}".format(indexes))
-                    else:
-                        indexes = all_indexes(similar_matcher, element)
-                        session().annotate("* element matching: {}".format(indexes))
-                    matrix.append(indexes)
-                else:
-                    session().annotate("`else` element matching: [element]")
-                    matrix.append([element])
-
-    # TODO: do some scoring to find the most "interesting combination"
-    return [list(x) for x in list(zip(*matrix))]
-
-
-def all_indexes(collection, element):
-    indexes = []
-    base = 0
-
-    for _ in range(collection.count(element)):
-        i = collection.index(element, base)
-        base = i + 1
-        indexes.append(i)
-
-    return indexes
-
-
-def all_matching_indexes(knowledge_base, collection, element):
-    indexes = []
-
-    with session().log('Matching “{}”'.format(element)):
-        assert("groups" in element)
-        element = element["groups"]
-        for i, instance in enumerate(collection):
-            session().log('Checking “{}”'.format(instance))
-
-            if isinstance(instance, dict):
-                instance = instance["groups"]
-            elif instance in knowledge_base.knowledge:
-                session().log('Knowledge about “{}”: ”{}”'.format(instance, knowledge_base.knowledge[instance]))
-
-                if "groups" not in knowledge_base.knowledge[instance]:
-                    # This means that is only known as token
-                    # so we should try to avoid using it
-                    continue
-
-                instance = knowledge_base.knowledge[instance]["groups"]
-
-            intersection = set(instance) & set(element)
-            if (len(intersection) > 0 or (0 == len(instance) == len(element))):
-                indexes.append((i, intersection))
-
-        return [x[0] for x in sorted(indexes, key=lambda x: len(x[1]), reverse=True)]
-
-
-def element_matches_groups(knowledge, element: Dict, groups):
-    with session().log("Checking if e “{}” matches groups “{}”".format(element, groups)):
-        if isinstance(groups, str) and groups in knowledge:
-            return len(knowledge[groups].get("groups", set()) & element['groups']) > 0
-        elif isinstance(groups, dict):
-            return len(element.get("groups", set()) & element['groups']) > 0
-        return False
-
-
-def find_bounds(knowledge, matcher, similar_matcher):
-    start_bounds = []
-    for i, element in enumerate(matcher):
-        if element in similar_matcher:
-            break
-        else:
-            start_bounds.append((i, element))
-
-    end_bounds = []
-    for i, element in enumerate(matcher[::-1]):
-        in_similar = False
-        if isinstance(element, str):
-            in_similar = element in similar_matcher
-        elif isinstance(element, dict):
-            in_similar = any(map(lambda groups: element_matches_groups(knowledge.knowledge,
-                                                                       element, groups),
-                                 similar_matcher))
-
-        if in_similar:
-            break
-        else:
-            end_bounds.append((len(matcher) - (i + 1), element))
-
-    return start_bounds, end_bounds
-
-
-def get_similar_tree(knowledge_base, atom, tokens):
-    possibilities = []
-
-    # Find matching possibilities
-    for entry, tree in knowledge_base.trained:
-        if not is_bottom_level(tree):
-            continue
-        if tree[0] == atom[0]:
-            possibilities.append((entry, tree))
-
-    # Sort by more matching elements
-    sorted_possibilities = []
-    for (raw, possibility) in possibilities:
-        resolved = []
-        for element in atom:
-            if isinstance(element, str):
-                resolved.append(element)
-            else:
-                resolved.append(knowledge_evaluation.resolve(
-                    knowledge_base.knowledge,
-                    element,
-                    raw))
-
-        # TODO: Probably should take into account the categories of the elements in the "intake" ([0]) element
-        atom_score = sum([resolved[i] == atom[i]
-                          for i
-                          in range(min(len(resolved),
-                                       len(atom)))])
-        token_score = sum([similar_token in tokens
-                           for similar_token
-                           in raw])
-
-        sorted_possibilities.append((raw, possibility, resolved, atom_score, token_score))
-
-    sorted_possibilities = sorted(sorted_possibilities, key=lambda p: p[3] * 100 + p[4], reverse=True)
-    if len(sorted_possibilities) < 1:
-        return []
-
-    for i, possibility in enumerate(sorted_possibilities):
-        similar_matcher, similar_result, similar_result_resolved, _atom_score, _token_score = possibility
-        with session().log("Like {}".format(similar_matcher)):
-            session().annotate('AST: {}'.format(similar_result))
-            session().annotate('Results on: {}'.format(similar_result_resolved))
-            session().annotate('Atom score: {}'.format(_atom_score))
-            session().annotate('Token score: {}'.format(_token_score))
-
-    return sorted_possibilities
-
-
-# TODO: unroll this mess
-def get_matching(sample, other):
-    l = len(sample[0])
-    other = list(filter(lambda x: len(x[0]) == l, other))
-    for i in range(l):
-        if len(other) == 0:
-            return []
-
-        if isinstance(sample[0][i], dict):  # Dictionaries are compared by groups
-            other = list(filter(lambda x: isinstance(x[0][i], dict) and
-                                len(x[0][i]['groups'] & sample[0][i]['groups']) > 0,
-                                other))
-
-        elif isinstance(sample[0][i], tuple):  # Tuples are compared by types [0]
-            other = list(filter(lambda x: isinstance(x[0][i], tuple) and
-                                x[0][i][0] == sample[0][i][0],
-                                other))
-
-    matching = []
-    for x in range(l):  # Generate the combination of this and other(s) matcher
-        first_sample_data = sample[0][x]
-        if isinstance(first_sample_data, str):
-            matching.append(first_sample_data)
-        elif isinstance(first_sample_data, tuple):
-            matching.append(first_sample_data)
-        else:
-            this_groups = sample[0][x]['groups']
-            if len(other) > 0:
-                other_groups = reduce(lambda a, b: a & b,
-                                      map(lambda y: y[0][x]['groups'],
-                                          other))
-                this_groups = this_groups & other_groups
-
-            matching.append({'groups': this_groups})
-    return matching
-
-
-def reverse_remix(tree_section, remix):
-    result_section = []
-    offset = 0
-    for origin in remix:
-        if isinstance(origin, int):
-            if (origin + offset) >= len(tree_section):
-                return None
-
-            result_section.append(copy.deepcopy(tree_section[origin + offset]))
-        else:
-            assert(isinstance(origin, str))
-            offset += 1
-    return result_section + tree_section[len(remix):]
-
-
-def get_fit(knowledge, tokens, remaining_recursions=parameters.MAX_RECURSIONS):
-    results = []
-    for matcher, ast in knowledge.trained:
-        with session().log("{} <- {}".format(matcher, tokens)):
-            result = match_fit(knowledge, tokens, matcher, ast,
-                               remaining_recursions)
-
-            if result is not None:
-                with session().log("Result: {}".format(result)):
-                    results.append(result)
-
-    if len(results) > 0:
-        return results[0]
-
-
-def is_definite_minisegment(minisegment):
-    return isinstance(minisegment, str) or isinstance(minisegment, dict)
-
-
-def match_token(knowledge, next_token, minisegment):
-    if isinstance(minisegment, dict):
-        return knowledge_evaluation.can_be_used_in_place(knowledge, next_token, minisegment)
-    elif isinstance(minisegment, str):
-        # TODO: check if the two elements can be used in each other place
-        return next_token == minisegment
-
-    return False
-
-
-def resolve_fit(knowledge, fit, remaining_recursions):
-    fitted = []
-    for element in fit:
-        if is_definite_minisegment(element):
-            fitted.append(element)
-        else:
-            with session().log("Resolving fit of `{}`".format(element)):
-                ((result_type, remixer), tokens) = element
-                remixed_tokens = reverse_remix(tokens, remixer)
-                if remixed_tokens is None:
-                    return None
-
-                minifit = get_fit(knowledge, remixed_tokens, remaining_recursions - 1)
-                if minifit is None:
-                    return None
-
-                minitokens, miniast = minifit
-                session().annotate(" AST | {}".format(miniast))
-                subproperty = knowledge_evaluation.resolve(knowledge.knowledge, minitokens, miniast)
-                fitted.append(subproperty)
-
-    return fitted
-
-
-def match_fit(knowledge, tokens, matcher, ast, remaining_recursions):
-    segment_possibilities = [([], tokens)]  # Matched tokens, remaining tokens
-    indent = ' ' * (parameters.MAX_RECURSIONS - remaining_recursions)
-    session().annotate(indent + 'T> {}'.format(tokens))
-    session().annotate(indent + 'M> {}'.format(matcher))
-    for minisegment in matcher:
-        with session().log("Minisegment `{}`".format(minisegment)):
-            possibilities_after_round = []
-            for matched_tokens, remaining_tokens in segment_possibilities:
-                if len(remaining_tokens) < 1:
-                    continue
-
-                session().annotate(indent + "RT {}".format(remaining_tokens[0]))
-                session().annotate(indent + "DEF {}".format(is_definite_minisegment(minisegment)))
-                if is_definite_minisegment(minisegment):
-                    # What if not match -----<
-                    if match_token(knowledge, remaining_tokens[0], minisegment):
-                        possibilities_after_round.append((
-                            matched_tokens + [remaining_tokens[0]],
-                            remaining_tokens[1:]
-                        ))
-                else:
-                    # What if not match!!!!!!-----<
-                    # TODO: optimize this with a look ahead
-                    for i in range(1, len(tokens)):
-                        possibilities_after_round.append((
-                            matched_tokens + [(minisegment, remaining_tokens[:i])],
-                            remaining_tokens[i:]
-                        ))
-                session().annotate(indent + "## PA {}".format(possibilities_after_round))
-            else:
-                segment_possibilities = possibilities_after_round
-                for possibility in segment_possibilities:
-                    with session().log("Possibility: `{}`".format(possibility)):
-                        pass
-                if len(segment_possibilities) < 1:
-                    with session().log("NO POSSIBLE"):
-                        pass
-
-    fully_matched_segments = [(matched, remaining)
-                              for (matched, remaining)
-                              in segment_possibilities
-                              if len(remaining) == 0]
-
-    resolved_fits = []
-    with session().log("Full matches"):
-        for fit, _ in fully_matched_segments:
-            with session().log(fit):  # REMIXES HAVE TO BE APPLIED BEFORE!!!
-                pass
-
-    with session().log("Resolutions"):
-        for fit, _ in fully_matched_segments:
-            with session().log("Resolving {}".format(fit)):  # REMIXES HAVE TO BE APPLIED BEFORE!!!
-                resolved_fit = resolve_fit(knowledge, fit, remaining_recursions)
-                if resolved_fit is not None:
-                    resolved_fits.append(resolved_fit)
-                else:
-                    session().annotate("Not resolved")
-
-    if len(resolved_fits) == 0:
-        return None
-
-    return resolved_fits[0], ast
diff --git a/naive-nlu/tree_nlu/layers/parsing_layer.py b/naive-nlu/tree_nlu/layers/parsing_layer.py
deleted file mode 100644
index 2bfda2a..0000000
--- a/naive-nlu/tree_nlu/layers/parsing_layer.py
+++ /dev/null
@@ -1,16 +0,0 @@
-from . import parsing
-
-class ParsingLayer:
-    def __init__(self):
-        pass
-
-    def integrate(self, knowledge_base, example):
-        yield from parsing.integrate_language(knowledge_base, example)
-
-    def train(self, knowledge_base, example):
-        assert False
-
-    def process(self, knowledge_base, input):
-        fit = parsing.get_fit(knowledge_base, input)
-        if fit is not None:
-            yield fit
\ No newline at end of file
diff --git a/naive-nlu/tree_nlu/layers/tokenization.py b/naive-nlu/tree_nlu/layers/tokenization.py
deleted file mode 100644
index ec3f0a8..0000000
--- a/naive-nlu/tree_nlu/layers/tokenization.py
+++ /dev/null
@@ -1,186 +0,0 @@
-from ..session.org_mode import global_session as session
-from ..atoms import Atom, a, is_atom
-
-def lookahead_for_tokens_or_strucutral_elements(knowledge_base, remaining):
-    for se in knowledge_base.structural_elements:
-        found_position = remaining.find(se)
-        found = found_position >= 0
-        session().annotate('Looking for structure with “{}”, found? {}'.format(se, found))
-        if found:
-            return [
-                (remaining[:found_position], se, remaining[found_position + len(se):])
-            ]
-
-    for token in knowledge_base.knowledge.keys():
-        found_position = remaining.find(token)
-        found = found_position >= 0
-        session().annotate('Looking for token “{}”, found? {}'.format(token, found))
-        if found:
-            return [
-                (remaining[:found_position], token, remaining[found_position + len(token):])
-            ]
-
-    return None
-
-
-
-def to_tokens(knowledge_base, text, precedent=None):
-    if len(text) == 0:
-        session().annotate("No text remaining")
-        yield ['']
-        return
-
-    with session().log("Tokenizing {}".format(text)):
-        for option in knowledge_base.expected_token_after_precedent(precedent):
-            with session().log("Next: “{}”".format(option)):
-                with session().log("Matching “{}” on “{}”".format(option, text)):
-                    for token_match in tokenization_match(option, text, knowledge_base):
-                        if token_match is None:
-                            session().annotate("No match")
-
-                        match, remaining = token_match
-                        if len(remaining) == len(text):
-                            raise Exception('No text consumed in match')
-
-                        session().annotate('Match: “{}”'.format(match))
-                        with session().log('Remaining “{}”'.format(remaining)):
-                            for sublevel in to_tokens(knowledge_base, remaining, match):
-                                candidate = list(filter(lambda x: x != '', [match] + sublevel))
-                                session().annotate('Yielding candidate “{}”'.format(candidate))
-                                yield candidate
-
-
-def tokenization_match(element, text, knowledge_base):
-    # Constant/structural string matching
-    if isinstance(element, str):
-        if text.find(element) == 0:
-            # This match comes from a structuring element
-            # It doesn't appear on the tokenization
-            # So we should return it as an empty string
-            yield ('', text[len(element):])
-            return
-        else:
-            # No match found
-            return
-
-    elif is_atom(element, 'token'):
-        yield from match_single_token(text, knowledge_base)
-        return
-    raise NotImplementedError()
-
-
-def match_single_token(text, knowledge_base):
-    found_token = False
-    for token in knowledge_base.knowledge.keys():
-        if text.find(token) == 0:
-            yield token, text[len(token):]
-            found_token = True
-
-    if found_token:
-        return
-
-    session().annotate('No token found at the start of ”{}”'.format(text))
-    session().annotate('using structural elements to infer it')
-    # TODO: review this when multiple structural elements are available
-    for se in knowledge_base.structural_elements:
-        session().annotate('Looking for se “{}” in “{}”'.format(se, text))
-        position = text.find(se, 0)
-        found = position > 0  # 0 is not considered a valid position for this kind of split
-        if found:
-            session().annotate('Found ”{}”, inferring “{}”'.format(se, text[:position]))
-            yield text[:position], text[position:]
-
-    session().annotate('No structural element or token found, inferring only token remaining')
-    yield text, ''
-
-    # Using other tokens for cutoff
-    for token in knowledge_base.knowledge.keys():
-        session().annotate('Looking for token “{}” in “{}”'.format(token, text))
-        position = text.find(token)
-        found = position >= 0
-        if found:
-            session().annotate('Found ”{}”, in position ”{}”'.format(token, position))
-            yield text[:position], text[position:]
-
-
-def integrate_tokenization(knowledge_base, example):
-    text = example['text']
-    tokens = example['tokens']
-    meaning = example.get('meaning')
-
-    return integrate_token_to_text_matching(knowledge_base, text, tokens)
-
-
-def integrate_token_to_text_matching(knowledge_base, text, tokens):
-    texts = [text]
-
-    # Convert to tokens
-    for token_id, token in enumerate(tokens):
-        # Look for token in texts
-        for i, text in enumerate(texts):
-            if isinstance(text, int):
-                continue
-
-            if token in text:
-                before, after = text.split(token, maxsplit=1)
-                texts = (texts[:i] + [before]
-                         + [a('token')]
-                         + [after] + texts[i + 1:])
-                break
-        else:
-            raise Exception('Token not found')
-
-    # Remove leftovers from splits
-    texts = list(filter(lambda x: x != '', texts))
-    session().log("Tokenized as {} over {}".format(texts, tokens))
-
-    for i, element in enumerate(texts[:-1]):
-        learn_token_pair(element, texts[i + 1], knowledge_base)
-
-    return tokens
-
-def learn_token_pair(precedent, consequent, knowledge_base):
-    knowledge_base.add_token_pair(precedent, consequent)
-
-
-def pick_one_tokenization(options, knowledge_base):
-    '''
-    Heuristic function to pick the most probable tokenization.
-
-    Just pick the one with more results.
-    '''
-    options = list(options)
-    with session().log("Picking among: {} options".format(len(options))):
-        session().log("Options: \n{}".format('\n'.join(map(str, options))))
-        return pick_by_score(options,
-                             [
-                                 # By number of splits without structuring elements
-                                 lambda tokenization: sum(map(
-                                     lambda split: sum(map(
-                                         lambda se: se in split, knowledge_base.structural_elements
-                                     )), tokenization)),
-
-                                 # By number of unknown tokens
-                                 lambda tokenization: len(list(filter(lambda token:
-                                                                      (token not in knowledge_base.knowledge.keys()) and
-                                                                      (token not in knowledge_base.structural_elements),
-                                                                      tokenization))),
-
-                                 # By number of splits
-                                 lambda tokenization: -len(tokenization),
-                             ])
-
-def pick_by_score(options, heuristics):
-    for heuristic in heuristics:
-        assert(len(options) > 0)
-        options = list(map(lambda opt: (heuristic(opt), opt), options))
-        sorted_options = sorted(options, key=lambda x: x[0], reverse=False)
-
-        heuristic_cutoff = sorted_options[0][0]
-        session().annotate(sorted_options)
-        pass_heuristic = [opt for (score, opt) in sorted_options if score <= heuristic_cutoff]
-        options = pass_heuristic
-
-    session().log("{} finalists: \n{}".format(len(options), '\n'.join(map(str, options))))
-    return options[0]
-
diff --git a/naive-nlu/tree_nlu/layers/tokenization_layer.py b/naive-nlu/tree_nlu/layers/tokenization_layer.py
deleted file mode 100644
index 28852fc..0000000
--- a/naive-nlu/tree_nlu/layers/tokenization_layer.py
+++ /dev/null
@@ -1,90 +0,0 @@
-from ..session.org_mode import global_session as session
-from ..atoms import Atom
-from . import tokenization
-import random
-import copy
-
-def randomized_weighted_list(elements):
-    # Randomized
-    randomized = list(elements)
-    random.shuffle(randomized)
-
-    # And return only once
-    already_returned = set()
-    for e in randomized:
-        if e in already_returned:
-            continue
-
-        yield e
-        already_returned.add(e)
-
-class TokenizationLayer:
-    def __init__(self, knowledge_base):
-        self.structural_elements = set()
-        self.token_chains = {}
-        self.tokens = set()
-        self.knowledge_base = knowledge_base
-        self.knowledge = knowledge_base.knowledge
-
-    def integrate(self, knowledge_base, data):
-        assert knowledge_base is self.knowledge_base
-
-        assert 'text' in data
-        tokens = self.tokenize(data['text'])
-        data_with_row = copy.copy(data)
-        data_with_row['tokens'] = tokens
-        yield data_with_row
-
-        # with session().log("Tokenize: {}".format(data['text'])):
-        #     for tokens in tokenization.to_tokens(self, data['text']):
-        #         data_with_row = copy.copy(data)
-        #         data_with_row['tokens'] = tokens
-        #         yield data_with_row
-
-    def process(self, knowledge_base, row):
-        yield self.tokenize(row)
-
-
-    def tokenize(self, row, return_one=True):
-        row = row.lower()
-        with session().log("Tokenize: {}".format(row)):
-            options = list(tokenization.to_tokens(self, row))
-            session().log("Results:\n{}".format('\n'.join(map(str, options))))
-
-            if return_one:
-                chosen = tokenization.pick_one_tokenization(options, self)
-                session().log("Chosen: “{}”".format(chosen))
-                self.train({'text': row, 'tokens': chosen})
-                return chosen
-            return options
-
-    ## Tokenization
-    def add_token_pair(self, precedent, consequent):
-        self.add_token(precedent)
-        self.add_token(consequent)
-
-        if precedent not in self.token_chains:
-            self.token_chains[precedent] = []
-        self.token_chains[precedent].append(consequent)
-
-    def add_token(self, token):
-        self.tokens.add(token)
-        if (not isinstance(token, Atom)) and (token not in self.structural_elements):
-            session().annotate('Found new structural element “{}”'.format(token))
-            self.structural_elements.add(token)
-
-    def expected_token_after_precedent(self, precedent=None):
-        if precedent not in self.token_chains:  # If there's no known precedent, just return all tokens
-            return randomized_weighted_list(self.tokens)
-
-        return randomized_weighted_list(self.token_chains[precedent])
-
-    def train(self, example):
-        with session().log('Training tokenizer'):
-            session().annotate("Example: {}".format(example))
-            tokens = tokenization.integrate_tokenization(self, example)
-
-            # Integrate knowledge of concept
-            for token in tokens:
-                if not token in self.knowledge:
-                    self.knowledge[token] = {}
\ No newline at end of file
diff --git a/naive-nlu/tree_nlu/parsing.py b/naive-nlu/tree_nlu/parsing.py
new file mode 100644
index 0000000..fa16a33
--- /dev/null
+++ b/naive-nlu/tree_nlu/parsing.py
@@ -0,0 +1,384 @@
+#!/usr/bin/env python
+
+from . import knowledge_evaluation
+
+from . import depth_meter
+import logging
+import re
+import copy
+
+from functools import reduce
+from typing import List
+from .modifiable_property import ModifiableProperty
+from . import parameters
+
+# TODO: more flexible tokenization
+def to_tokens(text):
+    return re.findall(r'(\w+|[^\s])', text)
+
+
+def make_template(knowledge_base, tokens, parsed):
+    matcher = list(tokens)
+    template = list(parsed)
+    for i in range(len(matcher)):
+        word = matcher[i]
+        if word in template:
+            template[template.index(word)] = i
+            matcher[i] = {
+                'groups': set(knowledge_base.knowledge[word]['groups'])
+            }
+    return tokens, matcher, template
+
+
+def is_bottom_level(tree):
+    for element in tree:
+        if isinstance(element, list) or isinstance(element, tuple):
+            return False
+    return True
+
+
+def get_lower_levels(parsed):
+    lower = []
+    def aux(subtree, path):
+        nonlocal lower
+        deeper = len(path) == 0
+        for i, element in enumerate(subtree):
+            if isinstance(element, list) or isinstance(element, tuple):
+                aux(element, path + (i,))
+                deeper = True
+
+        if not deeper:
+            lower.append((path, subtree))
+
+    aux(parsed, path=())
+    return lower
+
+
+# TODO: probably optimize this, it creates lots of unnecessary tuples
+def replace_position(tree, position, new_element):
+
+    def aux(current_tree, remaining_route):
+        if len(remaining_route) == 0:
+            return new_element
+
+        else:
+            step = remaining_route[0]
+            return (
+                tree[:step]
+                + (aux(tree[step], remaining_route[1:]),)
+                + tree[step + 2:]
+            )
+
+    return aux(tree, position)
+
+
+def integrate_language(knowledge_base, example):
+    text = example["text"].lower()
+    parsed = example["parsed"]
+
+    resolved_parsed = copy.deepcopy(parsed)
+    tokens = to_tokens(text)
+
+    while True:
+        logging.debug("P: {}".format(resolved_parsed))
+        lower_levels = get_lower_levels(resolved_parsed)
+        logging.debug("Lower: {}".format(lower_levels))
+        if len(lower_levels) == 0:
+            break
+
+        for position, atom in lower_levels:
+            logging.debug("\x1b[1mSelecting\x1b[0m: {}".format(atom))
+            similar = get_similar_tree(knowledge_base, atom)
+            remix, (start_bounds, end_bounds) = build_remix_matrix(knowledge_base, tokens, atom, similar)
+            _, matcher, result = make_template(knowledge_base, tokens, atom)
+            logging.debug("Tx: {}".format(tokens))
+            logging.debug("Mx: {}".format(matcher))
+            logging.debug("Rx: {}".format(result))
+            logging.debug("Remix: {}".format(remix))
+
+            after_remix = apply_remix(tokens[len(start_bounds):-len(end_bounds)], remix)
+            assert(len(after_remix) + len(start_bounds) + len(end_bounds) == len(tokens))
+            logging.debug( " +-> {}".format(after_remix))
+            subquery_type = knowledge_evaluation.get_subquery_type(knowledge_base.knowledge, atom)
+            logging.debug(r" \-> <{}>".format(subquery_type))
+
+            # Clean remaining tokens
+            new_tokens = list(tokens)
+            offset = len(start_bounds)
+            for _ in range(len(remix)):
+                new_tokens.pop(offset)
+
+            # TODO: Get a specific types for... types
+            new_tokens.insert(offset, (subquery_type, remix))
+            tokens = new_tokens
+
+            resolved_parsed = replace_position(resolved_parsed, position, offset)
+            logging.debug("#########")
+
+
+    tokens, matcher, result = make_template(knowledge_base, tokens, resolved_parsed)
+    logging.debug("T: {}".format(tokens))
+    logging.debug("M: {}".format(matcher))
+    logging.debug("R: {}".format(result))
+    logging.debug("---")
+    return tokens, matcher, result
+
+
+def apply_remix(tokens, remix):
+    rebuilt = []
+    for i in remix:
+        rebuilt.append(tokens[i])
+    return rebuilt
+
+
+def build_remix_matrix(knowledge_base, tokens, atom, similar):
+    tokens = list(tokens)
+    tokens, matcher, result = make_template(knowledge_base, tokens, atom)
+    similar_matcher, similar_result, similar_result_resolved, _ = similar
+
+    start_bounds, end_bounds = find_bounds(matcher, similar_matcher)
+
+    for i, element in (end_bounds + start_bounds[::-1]):
+        matcher.pop(i)
+        tokens.pop(i)
+
+    possible_remixes = get_possible_remixes(knowledge_base, matcher, similar_matcher)
+    chosen_remix = possible_remixes[0]
+
+    return chosen_remix, (start_bounds, end_bounds)
+
+
+def get_possible_remixes(knowledge_base, matcher, similar_matcher):
+
+    matrix = []
+    for element in matcher:
+        logging.debug("- {}".format(element))
+        logging.debug("+ {}".format(similar_matcher))
+        assert(element in similar_matcher or isinstance(element, dict))
+
+        if isinstance(element, dict):
+            indexes = all_matching_indexes(knowledge_base, similar_matcher, element)
+        else:
+            indexes = all_indexes(similar_matcher, element)
+        matrix.append(indexes)
+
+    # TODO: do some scoring to find the most "interesting combination"
+    return [list(x) for x in list(zip(*matrix))]
+
+
+def all_indexes(collection, element):
+    indexes = []
+    base = 0
+
+    for _ in range(collection.count(element)):
+        i = collection.index(element, base)
+        base = i + 1
+        indexes.append(i)
+
+    return indexes
+
+
+def all_matching_indexes(knowledge_base, collection, element):
+    indexes = []
+
+    assert("groups" in element)
+    element = element["groups"]
+    for i, instance in enumerate(collection):
+        if isinstance(instance, dict):
+            instance = instance["groups"]
+        elif instance in knowledge_base.knowledge:
+            instance = knowledge_base.knowledge[instance]["groups"]
+
+        intersection = set(instance) & set(element)
+        if len(intersection) > 0:
+            indexes.append((i, intersection))
+
+    return [x[0] for x in sorted(indexes, key=lambda x: len(x[1]), reverse=True)]
+
+
+def find_bounds(matcher, similar_matcher):
+    start_bounds = []
+    for i, element in enumerate(matcher):
+        if element in similar_matcher:
+            break
+        else:
+            start_bounds.append((i, element))
+
+    end_bounds = []
+    for i, element in enumerate(matcher[::-1]):
+        if element in similar_matcher:
+            break
+        else:
+            end_bounds.append((len(matcher) - (i + 1), element))
+
+    return start_bounds, end_bounds
+
+
+def get_similar_tree(knowledge_base, atom):
+    possibilities = []
+
+    # Find matching possibilities
+    for entry, tree in knowledge_base.trained:
+        if not is_bottom_level(tree):
+            continue
+        if tree[0] == atom[0]:
+            possibilities.append((entry, tree))
+
+    # Sort by more matching elements
+    sorted_possibilities = []
+    for (raw, possibility) in possibilities:
+        resolved = []
+        for element in atom:
+            if isinstance(element, str):
+                resolved.append(element)
+            else:
+                resolved.append(knowledge_evaluation.resolve(
+                    knowledge_base.knowledge,
+                    element,
+                    raw))
+
+        # TODO: Probably should take into account the categories of the elements in the "intake" ([0]) element
+        score = sum([resolved[i] == atom[i]
+                     for i
+                     in range(min(len(resolved),
+                                  len(atom)))])
+        sorted_possibilities.append((raw, possibility, resolved, score))
+    sorted_possibilities = sorted(sorted_possibilities, key=lambda p: p[3], reverse=True)
+    if len(sorted_possibilities) < 1:
+        return None
+
+    return sorted_possibilities[0]
+
+
+# TODO: unroll this mess
+def get_matching(sample, other):
+    l = len(sample[0])
+    other = list(filter(lambda x: len(x[0]) == l, other))
+    for i in range(l):
+        if len(other) == 0:
+            return []
+
+        if isinstance(sample[0][i], dict):  # Dictionaries are compared by groups
+            other = list(filter(lambda x: isinstance(x[0][i], dict) and
+                                len(x[0][i]['groups'] & sample[0][i]['groups']) > 0,
+                                other))
+
+        elif isinstance(sample[0][i], tuple):  # Tuples are compared by types [0]
+            other = list(filter(lambda x: isinstance(x[0][i], tuple) and
+                                x[0][i][0] == sample[0][i][0],
+                                other))
+
+    return [sample[0][x] if isinstance(sample[0][x], str)
+            else
+            sample[0][x] if isinstance(sample[0][x], tuple)
+            else {'groups': sample[0][x]['groups'] & reduce(lambda a, b: a & b,
+                                                            map(lambda y: y[0][x]['groups'],
+                                                                other))}
+            for x
+            in range(l)]
+
+
+def reprocess_language_knowledge(knowledge_base, examples):
+    examples = knowledge_base.examples + examples
+
+    pattern_examples = []
+    for i, sample in enumerate(examples):
+        other = examples[:i] + examples[i + 1:]
+        match = get_matching(sample, other)
+        if len(match) > 0:
+            sample = (match, sample[1],)
+            pattern_examples.append(sample)
+
+    return pattern_examples
+
+
+def reverse_remix(tree_section, remix):
+    result_section = []
+    for origin in remix:
+        result_section.append(copy.deepcopy(tree_section[origin]))
+    return result_section + tree_section[len(remix):]
+
+
+def get_fit(knowledge, tokens, remaining_recursions=parameters.MAX_RECURSIONS):
+    for matcher, ast in knowledge.trained:
+        result = match_fit(knowledge, tokens, matcher, ast,
+                           remaining_recursions)
+        if result is not None:
+            return result
+
+    return None
+
+
+def is_definite_minisegment(minisegment):
+    return isinstance(minisegment, str) or isinstance(minisegment, dict)
+
+
+def match_token(knowledge, next_token, minisegment):
+    if isinstance(minisegment, dict):
+        # TODO: check if the dictionary matches the values
+        return True
+    elif isinstance(minisegment, str):
+        # TODO: check if the two elements can be used in each other place
+        return next_token == minisegment
+
+    return False
+
+
+def resolve_fit(knowledge, fit, remaining_recursions):
+    fitted = []
+    for element in fit:
+        if is_definite_minisegment(element):
+            fitted.append(element)
+        else:
+            ((result_type, remixer), tokens) = element
+            remixed_tokens = reverse_remix(tokens, remixer)
+            minifit = get_fit(knowledge, remixed_tokens, remaining_recursions - 1)
+            if minifit is None:
+                return None
+
+            minitokens, miniast = minifit
+            subproperty = knowledge_evaluation.resolve(knowledge.knowledge, minitokens, miniast)
+            fitted.append(subproperty)
+
+    return fitted
+
+
+def match_fit(knowledge, tokens, matcher, ast, remaining_recursions):
+    segment_possibilities = [([], tokens)]  # Matched tokens, remaining tokens
+    for minisegment in matcher:
+        possibilities_after_round = []
+        for matched_tokens, remaining_tokens in segment_possibilities:
+            if len(remaining_tokens) < 1:
+                continue
+
+            if is_definite_minisegment(minisegment):
+                if match_token(knowledge, remaining_tokens[0], minisegment):
+                    possibilities_after_round.append((
+                        matched_tokens + [remaining_tokens[0]],
+                        remaining_tokens[1:]
+                    ))
+            else:
+                # TODO: optimize this with a look ahead
+                for i in range(1, len(tokens)):
+                    possibilities_after_round.append((
+                        matched_tokens + [(minisegment, remaining_tokens[:i])],
+                        remaining_tokens[i:]
+                    ))
+        else:
+            segment_possibilities = possibilities_after_round
+
+    fully_matched_segments = [(matched, remaining)
+                              for (matched, remaining)
+                              in segment_possibilities
+                              if len(remaining) == 0]
+
+    resolved_fits = []
+    for fit, _ in fully_matched_segments:
+        resolved_fit = resolve_fit(knowledge, fit, remaining_recursions)
+        if resolved_fit is not None:
+            resolved_fits.append(resolved_fit)
+
+    if len(resolved_fits) == 0:
+        return None
+
+    return resolved_fits[0], ast
diff --git a/naive-nlu/tree_nlu/session/org_mode.py b/naive-nlu/tree_nlu/session/org_mode.py
deleted file mode 100644
index 3258d82..0000000
--- a/naive-nlu/tree_nlu/session/org_mode.py
+++ /dev/null
@@ -1,79 +0,0 @@
-import logging
-import datetime
-
-SESSION = None
-
-def __gen_session_name__():
-    now = datetime.datetime.utcnow()
-    return "treeNLU-session-{}.org".format(
-        now.strftime("%y_%m_%d %H:%M:%S_%f"))
-
-
-def create_global_session(fname):
-    global SESSION
-    SESSION = OrgModeSession(fname)
-
-
-def global_session():
-    if SESSION is None:
-        session_name = __gen_session_name__()
-        logging.warn("Session not created, saved on {}".format(session_name))
-        create_global_session(session_name)
-
-    assert(SESSION is not None)
-    return SESSION
-
-
-def get_header():
-    now = datetime.datetime.utcnow()
-    return ("# Ran on {}\n".format(
-        now.strftime("%y/%m/%d %H:%M:%S.%f")))
-
-class LevelContext:
-    def __init__(self, increaser, decreaser):
-        self.increaser = increaser
-        self.decreaser = decreaser
-
-    def __enter__(self):
-        self.increaser()
-
-    def __exit__(self, _type, _value, _traceback):
-        self.decreaser()
-
-
-class OrgModeSession:
-    def __init__(self, fname):
-        self.f = open(fname, 'wt')
-        self.level = 0
-        self.dirty = False
-
-        self.f.write(get_header())
-
-    def annotate(self, annotation):
-        if self.dirty:
-            self.f.write("{indentation} {data}\n".format(
-                indentation='*' * (self.level + 1),
-                data="---"))
-            self.dirty = False
-
-        self.f.write("{indentation} {data}\n".format(
-            indentation=' ' * (self.level + 2 + 1),
-            data=annotation))
-
-    def log(self, string):
-        self.f.write("{indentation} {data}\n".format(
-            indentation='*' * (self.level + 1),
-            data=string))
-        self.dirty = False
-
-        return LevelContext(self.inc_level, self.dec_level)
-
-    def inc_level(self):
-        self.level += 1
-
-    def dec_level(self):
-        self.level -= 1
-        self.dirty = True
-
-    def close(self):
-        self.f.close()
diff --git a/naive-nlu/tree_nlu/test.py b/naive-nlu/tree_nlu/test.py
index 683f85e..bbda1a2 100644
--- a/naive-nlu/tree_nlu/test.py
+++ b/naive-nlu/tree_nlu/test.py
@@ -1,50 +1,61 @@
-import traceback
+import json
 import logging
-from .session import org_mode
-from .tests import tokenization
-from .tests import basic
-from .tests import gac_100
-from .tests import gac_extension
+logging.getLogger().setLevel(logging.INFO)
 
-logging.getLogger().setLevel(logging.ERROR)
+from .knowledge_base import KnowledgeBase
+from .modifiable_property import is_modifiable_property
 
-tests = (
-    ("tokenization", tokenization),
-    ("basic", basic),
-    ("gac 100", gac_100),
-    ("gac+", gac_extension),
-)
+import hy
+from .tests import base
 
+def test_assumption(expectedResponse, knowledge, query):
+    logging.info("Query: {}".format(query['text']))
+    logging.info("Expected: {}".format(expectedResponse))
 
-def gen_session_name():
-    return "treeNLU-test-session.org"
knowledge.process(query['text']) + end_result = result.getter() if is_modifiable_property(result) else result + + logging.info("\x1b[0;3{}mResult: {}\x1b[0m".format("1" if end_result != expectedResponse else "2", end_result)) + assert(end_result == expectedResponse) def main(): - org_mode.create_global_session(gen_session_name()) - failed = False - for test_name, test_module in tests: - try: - with org_mode.global_session().log(test_name): - test_module.main() - print(" \x1b[1;32m✓\x1b[0m {}".format(test_name)) - except AssertionError as ae: - print(" \x1b[1;31m✗\x1b[0m {}{}".format(test_name, - ('\n [Assertion] {}'.format(ae.args[0])) if len(ae.args) > 0 - else '')) - traceback.print_exc() - failed = True + base.run_tests() + knowledge = KnowledgeBase( + knowledge=base_knowledge, + ) - except Exception as e: - print(" \x1b[1;7;31m!\x1b[0m {}\n [Exception] {}".format(test_name, e)) - failed = True - traceback.print_exc() - raise - org_mode.global_session().close() + differences = knowledge.train(examples) - if failed: - exit(1) + logging.info("----") + logging.info(differences()) + logging.info("----") + + test_assumption(True, knowledge, {'text': 'earth is a planet'}) + test_assumption(True, knowledge, {'text': 'is lava dangerous?'}) + for test in [{'text': 'a bus can run'}, {'text': 'io is a moon'}]: + row = test['text'] + result, inferred_tree, differences = knowledge.process(row) + + logging.info("result:", result) + logging.info(differences()) + logging.info("---") + logging.info('-----') + logging.info(json.dumps(sorted(knowledge.knowledge.keys()), indent=4)) + logging.info('-----') + + queryTrue = { + "text": "is io a moon?", + "parsed": ("question", ("pertenence-to-group", "io", "moon")) + } + queryFalse = { + "text": "is io a planet?", + "parsed": ("question", ("pertenence-to-group", "io", "planet")) + } + + test_assumption(False, knowledge, queryFalse) + test_assumption(True, knowledge, queryTrue) if __name__ == '__main__': main() diff --git a/naive-nlu/tree_nlu/tests/base.hy b/naive-nlu/tree_nlu/tests/base.hy new file mode 100644 index 0000000..65f37fb --- /dev/null +++ b/naive-nlu/tree_nlu/tests/base.hy @@ -0,0 +1,62 @@ +(import [..knowledge_base [KnowledgeBase]]) + +(setv knowledge-base + { + "icecream" { "groups" (set ["noun" "object" "comestible" "sweet"]) } + + "lava" { "groups" (set ["noun" "object"]) } + "earth" { "groups" (set ["noun" "object" "planet"]) } + "io" { "groups" (set ["noun" "object"]) } + "green" { "groups" (set ["noun" "color" "concept"]) } + "plane" { "groups" (set ["noun" "object" "vehicle" "fast"]) } + "car" { "groups" (set ["noun" "object" "vehicle" "slow-ish"]) } + "wale" { "groups" (set ["noun" "object" "living-being"]) } + "cold" { "groups" (set ["property" "temperature"]) "as_property" "temperature" } + "dangerous" { "groups" (set ["property"]) "as_property" "safety" } + "planet" { "groups" (set ["noun" "group"]) } + "moon" { "groups" (set ["noun" "group"]) } + "color" { "groups" (set ["property" "group"]) } + "fly" { "groups" (set ["verb"]) } + "swim" { "groups" (set ["verb"]) } + } + ) + +(setv examples + [ + { "text" "icecream is cold" + "parsed" '(exists-property-with-value icecream cold) } + { "text" "is icecream cold?" + "parsed" '(question (exists-property-with-value icecream cold)) } + { "text" "lava is dangerous" + "parsed" '(exists-property-with-value lava dangerous) } + { "text" "is lava dangerous?" 
+ "parsed" '(question (exists-property-with-value lava dangerous)) } + { "text" "earth is a planet" + "parsed" '(pertenence-to-group earth planet) } + { "text" "io is a moon" + "parsed" '(pertenence-to-group io moon) } + { "text" "is earth a moon?" + "parsed" '(question (pertenence-to-group earth moon)) } + { "text" "Green is a color" + "parsed" '(pertenence-to-group green color) } + { "text" "a plane can fly" + "parsed" '(has-capacity plane fly) } + { "text" "a wale can swim" + "parsed" '(has-capacity wale swim) } + { + "text" "if earth is a planet it is big" + "parsed" '(implies + (pertenence-to-group earth planet) + (exists-property-with-value earth big)) } + + ] + ) + +(defn run_tests [] + [ + (setv knowledge (KnowledgeBase + knowledge=base_knowledge, + ) + ) + ] + ) diff --git a/naive-nlu/tree_nlu/tests/basic.py b/naive-nlu/tree_nlu/tests/basic.py deleted file mode 100644 index bda8261..0000000 --- a/naive-nlu/tree_nlu/tests/basic.py +++ /dev/null @@ -1,166 +0,0 @@ -from ..session.org_mode import global_session as session -import json - -from ..knowledge_base import KnowledgeBase -from ..modifiable_property import is_modifiable_property -from ..utils.tokenization import train_basic_tokenization - -examples = [ - { - "text": "icecream is cold", - "parsed": ("exists-property-with-value", 'icecream', 'cold'), - }, - { - "text": "is icecream cold?", - "parsed": ("question", ("exists-property-with-value", 'icecream', 'cold')) - }, - { - "text": "lava is dangerous", - "parsed": ("exists-property-with-value", 'lava', 'dangerous') - }, - { - "text": "is lava dangerous?", - "parsed": ("question", ("exists-property-with-value", 'lava', 'dangerous')), - }, - { - "text": "earth is a planet", - "parsed": ("pertenence-to-group", 'earth', 'planet'), - }, - { - "text": "io is a moon", - "parsed": ("pertenence-to-group", 'io', 'moon'), - }, - { - "text": "is earth a moon?", - "parsed": ("question", ("pertenence-to-group", 'earth', 'moon')), - }, - { - "text": "Green is a color", - "parsed": ("pertenence-to-group", 'green', 'color'), - }, - { - "text": "a plane can fly", - "parsed": ("has-capacity", 'plane', 'fly') - }, - { - "text": "a wale can swim", - "parsed": ("has-capacity", 'wale', 'swim') - }, - # { - # "text": "if earth is a planet, it is big", - # "parsed": ("implies", - # ("pertenence-to-group", 'earth', 'planet'), - # ("exists-property-with-value", 'earth', 'big')), - # }, -] - -base_knowledge = { - 'icecream': { - "groups": {'noun', 'object', 'comestible', 'sweet'}, - }, - 'lava': { - "groups": {'noun', 'object'}, - }, - 'earth': { - "groups": {'noun', 'object', 'planet'}, - }, - 'io': { - "groups": {'noun', 'object'}, - }, - 'green': { - "groups": {'noun', 'color', 'concept'}, - }, - 'plane': { - "groups": {'noun', 'object', 'vehicle', 'fast'}, - }, - 'car': { - "groups": {'noun', 'object', 'vehicle', 'slow-ish'}, - }, - 'wale': { - "groups": {'noun', 'object', 'living-being'}, - }, - 'cold': { - "groups": {'property', 'temperature'}, - "as_property": "temperature", - }, - 'dangerous': { - "groups": {'property'}, - "as_property": "safety", - }, - 'planet': { - "groups": {'noun', 'group'}, - }, - 'moon': { - "groups": {'noun', 'group'}, - }, - 'color': { - "groups": {'property', 'group'}, - }, - 'fly': { - "groups": {'verb'}, - }, - 'bus': { - "groups": {'noun'}, - }, - 'run': { - "groups": {'verb'}, - }, - 'swim': { - "groups": {'verb'}, - }, - 'planet': { - 'groups': {'noun'} - } -} - -def test_assumption(expectedResponse, knowledge, query): - with session().log(query['text']): - 
session().annotate("Expected: {}".format(expectedResponse)) - - result, abstract_tree, diff = knowledge.process(query['text']) - end_result = result.getter() if is_modifiable_property(result) else result - - session().annotate("Result: {}".format(end_result)) - if end_result != expectedResponse: - raise AssertionError('{} is not {}'.format(end_result, expectedResponse)) - -def main(): - knowledge = KnowledgeBase( - knowledge=base_knowledge, - ) - - train_basic_tokenization(knowledge) - - for example in examples: - with session().log(example['text']): - differences = knowledge.train([example]) - - session().annotate("----") - session().annotate(differences()) - session().annotate("----") - - test_assumption(True, knowledge, {'text': 'earth is a planet'}) - test_assumption(True, knowledge, {'text': 'is lava dangerous?'}) - for test in [{'text': 'a bus can run'}, {'text': 'io is a moon'}]: - row = test['text'] - result, inferred_tree, differences = knowledge.process(row) - - session().annotate("result: {}".format(result)) - session().annotate(differences()) - session().annotate("---") - session().annotate('-----') - session().annotate(json.dumps(sorted(knowledge.knowledge.keys()), indent=4)) - session().annotate('-----') - - queryTrue = { - "text": "is io a moon?", - "parsed": ("question", ("pertenence-to-group", "io", "moon")) - } - queryFalse = { - "text": "is io a planet?", - "parsed": ("question", ("pertenence-to-group", "io", "planet")) - } - - test_assumption(False, knowledge, queryFalse) - test_assumption(True, knowledge, queryTrue) - return knowledge diff --git a/naive-nlu/tree_nlu/tests/gac_100.py b/naive-nlu/tree_nlu/tests/gac_100.py deleted file mode 100644 index 71469ac..0000000 --- a/naive-nlu/tree_nlu/tests/gac_100.py +++ /dev/null @@ -1,736 +0,0 @@ -from ..session.org_mode import global_session as session -from ..knowledge_base import KnowledgeBase -from ..utils.visuals import show_progbar -from ..visualization import show_knowledge -from ..utils.tokenization import train_basic_tokenization - -def _assert(args): - assert(args) - -def _assert_msg(args, msg): - assert args, msg - -examples = [ - ('full_example', - { - "text": "is icecream cold?", - "affirmation": "icecream is cold", - "parsed": ("question", - ("exists-property-with-value", 'icecream', 'cold')), - "answer": True, - "after_execution": [( - lambda knowledge: _assert('cold' in knowledge.knowledge['icecream']['property']) - ),], - }), - ('full_example', - { - "text": "is earth a planet?", - "affirmation": "earth is a planet", - "parsed": ("question", - ("pertenence-to-group", 'earth', 'planet')), - "answer": True, - "after_execution": [( - lambda knowledge: _assert('planet' in knowledge.knowledge['earth']['groups']) - ),], - }), - ('full_example', - { - "text": "Is green a color?", - "affirmation": "green is a color", - "parsed": ("question", - ("pertenence-to-group", 'green', 'color')), - "answer": True, - "after_execution": [( - lambda knowledge: _assert('color' in knowledge.knowledge['green']['groups']) - ),], - }), - ('full_example', - { - "text": "do airplanes fly?", - "affirmation": "airplanes fly", - "parsed": ("question", - ("has-capacity", 'plane', 'fly')), - "answer": True, - "after_execution": [( - lambda knowledge: _assert('fly' in knowledge.knowledge['plane']['capacities']) - ),], - }), - ('full_example', - { - "text": "Is it hot during the summer?", - "affirmation": "it is hot during summer", - "parsed": ("question", - ("implies", 'summer', 'hot')), - "answer": True, - "after_execution": [( - lambda 
knowledge: _assert('hot' in knowledge.knowledge['summer']['implications']) - ),], - }), - ('full_example', - { - "text": "is chile in south america ?", - "affirmation": "chile is in south america", - "parsed": ("question", - ("property-has-value", 'chile', 'location', 'south america')), - "answer": True, - "after_execution": [( - lambda knowledge: _assert('south america' in knowledge.knowledge['chile']['location']) - ),], - }), - ('full_example', - { - "text": "Was Socrates a man?", - "affirmation": "Socrates was a man", - "parsed": ("question", - ("pertenence-to-group", 'socrates', 'man')), - "answer": True, - "after_execution": [( - lambda knowledge: _assert('man' in knowledge.knowledge['socrates']['groups']) - ),], - }), - ('full_example', - { - "text": "Computers use electricity?", - "affirmation": "Computers use electricity", - "parsed": ("question", - ('perform-verb-over-object', 'computers', 'use', 'electricity')), - "answer": True, - "after_execution": [( - lambda knowledge: _assert('electricity' in knowledge.knowledge['computers']['performs-over']['use']) - ),], - }), - # ('full_example', - # { - # "text": "The dominant language in france is french?", - # "affirmation": "The dominant language in france is french", - # "parsed": ("question", - # ("property-has-value", "france", "dominant-language", "french")), - # "answer": True, - # }), - # { - # "text": "was abraham lincoln once president of the united states?", - # "affirmation": "was abraham lincoln once president of the united states?", - # "parsed": (), - # "answer": None, - # }, - ('text_example', - { - "question": "is milk white?", - "affirmation": "milk is white", - "answer": True, - }), - # { - # "text": "do people have emotions?", - # "affirmation": "do people have emotions?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "do objects appear smaller as they move away from you?", - # "affirmation": "do objects appear smaller as they move away from you?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Does the human species have a male and female gender?", - # "affirmation": "Does the human species have a male and female gender?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Is a mountain mostly made of rock?", - # "affirmation": "Is a mountain mostly made of rock?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "is sun microsystems a computer company?", - # "affirmation": "is sun microsystems a computer company?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Do you see with your eyes and smell with your nose?", - # "affirmation": "Do you see with your eyes and smell with your nose?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Is smoking bad for your health?", - # "affirmation": "Is smoking bad for your health?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Does a dog have four legs?", - # "affirmation": "Does a dog have four legs?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Do mammals have hearts?", - # "affirmation": "Do mammals have hearts?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "is the Earth a planet?", - # "affirmation": "is the Earth a planet?", - # "parsed": (), - # "answer": None, - # }, - # ('text_example', - # { - # "question": "is water a liquid?", - # "affirmation": "water is a liquid", - # "answer": True, - # }), - # { - # "text": "Is Bugs Bunny a cartoon character?", - # "affirmation": "Is Bugs Bunny a cartoon character?", - # "parsed": 
(), - # "answer": None, - # }, - # { - # "text": "Do Humans communicate by Telephone?", - # "affirmation": "Do Humans communicate by Telephone?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "is beer a drink ?", - # "affirmation": "is beer a drink ?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "are there 12 months in a year?", - # "affirmation": "are there 12 months in a year?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "does the sun hurt your eyes when you look at it?", - # "affirmation": "does the sun hurt your eyes when you look at it?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Do most cars have doors?", - # "affirmation": "Do most cars have doors?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "is orange both a fruit and a colour?", - # "affirmation": "is orange both a fruit and a colour?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Is water a necessity?", - # "affirmation": "Is water a necessity?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Do CDs have better quality sound than Cassettes?", - # "affirmation": "Do CDs have better quality sound than Cassettes?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "do animals die?", - # "affirmation": "do animals die?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Is the arctic cold?", - # "affirmation": "Is the arctic cold?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Do people have 2 eyes?", - # "affirmation": "Do people have 2 eyes?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "does a person have a brain?", - # "affirmation": "does a person have a brain?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Is the rain wet?", - # "affirmation": "Is the rain wet?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Is division a mathematical operation?", - # "affirmation": "Is division a mathematical operation?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "is 400 greater than 399?", - # "affirmation": "is 400 greater than 399?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "is magenta a color?", - # "affirmation": "is magenta a color?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Are books educational?", - # "affirmation": "Are books educational?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Was the Great Wall of China built by humans?", - # "affirmation": "Was the Great Wall of China built by humans?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Are pianos musical instruments?", - # "affirmation": "Are pianos musical instruments?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Has Bill Clinton been President of the United States?", - # "affirmation": "Has Bill Clinton been President of the United States?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Is a whale a mammal?", - # "affirmation": "Is a whale a mammal?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Are lemons yellow?", - # "affirmation": "Are lemons yellow?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Is the South Pole cold?", - # "affirmation": "Is the South Pole cold?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Is Africa warm?", - # "affirmation": "Is Africa warm?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Is Antarctica 
cold?", - # "affirmation": "Is Antarctica cold?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Is rock is generally harder than wood?", - # "affirmation": "Is rock is generally harder than wood?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Do dogs chase cats?", - # "affirmation": "Do dogs chase cats?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "can humans die from cold temperatures?", - # "affirmation": "can humans die from cold temperatures?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "do people enjoy conversation?", - # "affirmation": "do people enjoy conversation?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Is Bill Clinton the President of the United States?", - # "affirmation": "Is Bill Clinton the President of the United States?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Are books a good source of information?", - # "affirmation": "Are books a good source of information?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "are friends different than enemies?", - # "affirmation": "are friends different than enemies?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "are people alive?", - # "affirmation": "are people alive?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Do triangles have 3 sides?", - # "affirmation": "Do triangles have 3 sides?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Is Ice cream cold?", - # "affirmation": "Is Ice cream cold?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Are all sides of a square the same length?", - # "affirmation": "Are all sides of a square the same length?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Do all people eat food?", - # "affirmation": "Do all people eat food?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "do dentists repair teeth?", - # "affirmation": "do dentists repair teeth?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Is America bigger than Japan?", - # "affirmation": "Is America bigger than Japan?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Do all triangles have three sides?", - # "affirmation": "Do all triangles have three sides?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "A grocery store sales food?", - # "affirmation": "A grocery store sales food?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Does a sunburn cause pain?", - # "affirmation": "Does a sunburn cause pain?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Is a computer an invention?", - # "affirmation": "Is a computer an invention?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "have humans visited the moon?", - # "affirmation": "have humans visited the moon?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Are there people in India?", - # "affirmation": "Are there people in India?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Was Einstein a genius?", - # "affirmation": "Was Einstein a genius?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Are we on the planet earth?", - # "affirmation": "Are we on the planet earth?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "do people comb their hair in the morning?", - # "affirmation": "do people comb their hair in the morning?", - # "parsed": (), - # "answer": None, - # }, - # { - # 
"text": "Does it hurt to lose a friend?", - # "affirmation": "Does it hurt to lose a friend?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Are there people on the earth?", - # "affirmation": "Are there people on the earth?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Was George Washington a president of the United States of America?", - # "affirmation": "Was George Washington a president of the United States of America?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Does an ocean have salt water in it?", - # "affirmation": "Does an ocean have salt water in it?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Is night darker than day?", - # "affirmation": "Is night darker than day?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Does a triangle have three sides?", - # "affirmation": "Does a triangle have three sides?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Are peaches fruit?", - # "affirmation": "Are peaches fruit?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Do people urinate?", - # "affirmation": "Do people urinate?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Is Germany located in Europe?", - # "affirmation": "Is Germany located in Europe?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Do mirrors reflect light?", - # "affirmation": "Do mirrors reflect light?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Are people born naked?", - # "affirmation": "Are people born naked?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Is it hot near the equator?", - # "affirmation": "Is it hot near the equator?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "is paper made from trees?", - # "affirmation": "is paper made from trees?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Can a female have children?", - # "affirmation": "Can a female have children?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Are people born every day?", - # "affirmation": "Are people born every day?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Are shoes worn on the feet?", - # "affirmation": "Are shoes worn on the feet?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "does it get wet when it rains?", - # "affirmation": "does it get wet when it rains?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Are there plants and insects in the rainforest which have no names?", - # "affirmation": "Are there plants and insects in the rainforest which have no names?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Do people eat pigs?", - # "affirmation": "Do people eat pigs?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Do businessmen wear ties?", - # "affirmation": "Do businessmen wear ties?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Is New York in the United States?", - # "affirmation": "Is New York in the United States?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Are humans more intelligent than ants?", - # "affirmation": "Are humans more intelligent than ants?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Are ravens black?", - # "affirmation": "Are ravens black?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Are there rats on ships?", - # "affirmation": "Are there rats on ships?", - # "parsed": 
(), - # "answer": None, - # }, - # { - # "text": "are lions animals?", - # "affirmation": "are lions animals?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "6 is greater than 5?", - # "affirmation": "6 is greater than 5?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Is water made of hydrogen and oxygen?", - # "affirmation": "Is water made of hydrogen and oxygen?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "is the sky blue on a clear day?", - # "affirmation": "is the sky blue on a clear day?", - # "parsed": (), - # "answer": None, - # }, - # { - # "text": "Do most people work during the day?", - # "affirmation": "Do most people work during the day?", - # "parsed": (), - # "answer": None, - # }, -] - -base_knowledge = { - 'summer': { - "groups": {'epoch'}, - }, - 'fly': { - "groups": {'verb'}, - }, - 'use': { - "groups": {'verb'}, - }, - 'electricity': { - "groups": {'power'}, - }, - 'airplanes': {}, - 'white': { - 'groups': {'property'}, - } -} - -def main(): - knowledge = KnowledgeBase( - knowledge=base_knowledge, - ) - - train_basic_tokenization(knowledge) - - total = len(examples) - - for i, (example_type, data) in enumerate(examples): - if example_type == 'full_example': - affirmation = { - 'text': data['affirmation'], - 'parsed': data['parsed'][1], - } - question = data - - with session().log(data['affirmation']): - show_progbar(i, total, data['affirmation']) - differences = knowledge.train([affirmation]) - - with session().log(data['text']): - show_progbar(i, total, data['text']) - differences = knowledge.train([question]) - session().annotate(differences()) - - result, _, _ = knowledge.process(data['text']) - - if "after_execution" in data: - for f in data["after_execution"]: - f(knowledge) - - if result != data['answer']: - raise AssertionError('{} is not {}'.format(result, data['answer'])) - - elif example_type == 'text_example': - with session().log(data['affirmation']): - show_progbar(i, total, data['affirmation']) - affirmation = data['affirmation'] - session().annotate("Processing affirmation: {}".format(affirmation)) - _, _, _ = knowledge.process(affirmation) - - with session().log(data['question']): - show_progbar(i, total, data['question']) - question = data['question'] - session().annotate("Processing question : {}".format(question)) - result, _, _ = knowledge.process(question) - - if result != data['answer']: - raise AssertionError('{} is not {}'.format(result, data['answer'])) - - else: - raise NotImplementedError('Example type: {}'.format(example_type)) - - print("\r\x1b[K", end='') - return knowledge - - -if __name__ == '__main__': - show_knowledge(main()) diff --git a/naive-nlu/tree_nlu/tests/gac_extension.py b/naive-nlu/tree_nlu/tests/gac_extension.py deleted file mode 100644 index abb87ba..0000000 --- a/naive-nlu/tree_nlu/tests/gac_extension.py +++ /dev/null @@ -1,26 +0,0 @@ -from ..knowledge_base import KnowledgeBase -from ..session.org_mode import global_session as session - -from . 
import gac_100 - - -def ask_then_learn_test(knowledge: KnowledgeBase): - with session().log("is icecream blue?"): - ret, _, _ = knowledge.process("is icecream blue?") - assert(ret is False) - - with session().log("icecream is blue"): - ret, _, _ = knowledge.process("icecream is blue") - - with session().log("is icecream blue?"): - ret, _, _ = knowledge.process("is icecream blue?") - assert(ret is True) - - return knowledge - - -def main(): - knowledge = gac_100.main() - - knowledge.knowledge['blue'] = {'groups': {'property'}} - knowledge = ask_then_learn_test(knowledge) diff --git a/naive-nlu/tree_nlu/tests/tokenization.py b/naive-nlu/tree_nlu/tests/tokenization.py deleted file mode 100644 index 9e32588..0000000 --- a/naive-nlu/tree_nlu/tests/tokenization.py +++ /dev/null @@ -1,80 +0,0 @@ -from ..session.org_mode import global_session as session -from ..knowledge_base import KnowledgeBase -from ..utils.visuals import show_progbar -from ..visualization import show_knowledge - - -def _assert(args): - assert(args) - - -def _assert_msg(args, msg): - assert args, msg - - -EXAMPLES = [ - ('example', { - "text": 'cat', - "tokens": ['cat'], - }), - ('example', { - "text": 'cats', - "tokens": ['cats'], - "meaning": { 'cats': ('add-modifier', 'cat', 'plural') }, - }), - ('example', { - "text": 'text separated by spaces', - "tokens": ['text', 'separated', 'by', 'spaces'], - }), - ('example', { - "text": 'is earth a planet?', - "tokens": ['is', 'earth', 'a', 'planet', '?'], - }), - ('test', { - "text": 'plane', - "tokens": ['plane'], - }), - # ('test', { - # "text": 'planes', - # "tokens": ['planes'], - # "meaning": { 'planes': ('add-modifier', 'plane', 'plural') }, - # }), - ('test', { - "text": 'some other text', - "tokens": ['some', 'other', 'text'], - }), - ('test', { - "text": 'is the sun a star?', - "tokens": ['is', 'the', 'sun', 'a', 'star', '?'], - }), - ('test', { - "text": 'sometextnotseparatedbyspaces', - "tokens": ['some', 'text', 'not', 'separated', 'by', 'spaces'], - }) -] - - -def main(): - knowledge = KnowledgeBase() - - total = len(EXAMPLES) - - for i, (case_type, example) in enumerate(EXAMPLES): - show_progbar(i, total, example['text']) - if case_type == 'example': - with session().log(example['text']): - knowledge.layers.tokenization.train(example) - - elif case_type == 'test': - with session().log(example['text']): - tokens = list(knowledge.layers.tokenization.tokenize(example['text'])) - - session().log('Expected “{}”, found “{}”' - .format(example['tokens'], tokens)) - assert example['tokens'] == tokens - - else: - raise Exception('Not implemented case {}'.format(case_type)) - - print("\r\x1b[K", end='') - return knowledge diff --git a/naive-nlu/tree_nlu/utils/json_dumper.py b/naive-nlu/tree_nlu/utils/json_dumper.py deleted file mode 100644 index 061dd68..0000000 --- a/naive-nlu/tree_nlu/utils/json_dumper.py +++ /dev/null @@ -1,4 +0,0 @@ -def dumper(obj): - if isinstance(obj, set): - return list(obj) - return obj diff --git a/naive-nlu/tree_nlu/utils/tokenization.py b/naive-nlu/tree_nlu/utils/tokenization.py deleted file mode 100644 index f13c798..0000000 --- a/naive-nlu/tree_nlu/utils/tokenization.py +++ /dev/null @@ -1,29 +0,0 @@ -from ..session.org_mode import ( - global_session as session, -) - -BASIC_TOKENIZATION_EXAMPLES = ( - ({ - "text": 'cat', - "tokens": ['cat'], - }), - ({ - "text": 'cats', - "tokens": ['cats'], - "meaning": { 'cats': ('add-modifier', 'cat', 'plural') }, - }), - ({ - "text": 'text separated by spaces', - "tokens": ['text', 'separated', 'by', 'spaces'], 
- }), - ({ - "text": 'is earth a planet?', - "tokens": ['is', 'earth', 'a', 'planet', '?'], - }), -) - - -def train_basic_tokenization(knowledge_base): - with session().log('Training basic tokenization'): - for example in BASIC_TOKENIZATION_EXAMPLES: - knowledge_base.layers.tokenization.train(example) diff --git a/naive-nlu/tree_nlu/utils/visuals.py b/naive-nlu/tree_nlu/utils/visuals.py deleted file mode 100644 index a6dd611..0000000 --- a/naive-nlu/tree_nlu/utils/visuals.py +++ /dev/null @@ -1,15 +0,0 @@ -def show_progbar(done, total, msg=''): - total_blocks = 10 - blocks_done = (done * total_blocks) // total - blocks_to_go = total_blocks - blocks_done - - print('\r\x1b[K' # Go to the start of the line - '\x1b[0m' # Restart the "style" - '|' # Put the first "|" - + blocks_done * '█' # Completed blocks - + blocks_to_go * ' ' # Uncompleted blocks - + '\x1b[7m|\x1b[0m' # End the bar - + ' ' - + msg # Add message - + '\r' # Go back to the start - , end='') diff --git a/naive-nlu/tree_nlu/visualization.py b/naive-nlu/tree_nlu/visualization.py deleted file mode 100644 index 6f07325..0000000 --- a/naive-nlu/tree_nlu/visualization.py +++ /dev/null @@ -1,8 +0,0 @@ -def show_knowledge(knowledge): - for key in knowledge.knowledge: - print("\x1b[1m{}\x1b[0m {}".format(key, knowledge.knowledge[key])) - - -def show_samples(knowledge): - for example in knowledge.originals: - print("{}".format(example))
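
A note on the Python/Hy seam this patch creates (not part of the diff itself; a minimal
sketch, assuming the `tree_nlu` package is installed and a Hy of this era is importable):
once `import hy` has run, `.hy` files import like ordinary Python modules, and Hy mangles
dashed identifiers to underscores, which is how test.py reaches the bindings defined in
tests/base.hy. The standalone driver below is hypothetical; only `base.run_tests`,
`base.knowledge_base` and `base.examples` come from the patch.

    import hy                        # registers the .hy import hook; must run before the next import
    from tree_nlu.tests import base  # compiles tree_nlu/tests/base.hy on first import

    # Hy mangles dashes to underscores, so the `knowledge-base` binding in
    # base.hy is reachable from Python as `base.knowledge_base`.
    kb = base.run_tests()  # run_tests (as fixed above) returns the KnowledgeBase it builds
    assert set(kb.knowledge) == set(base.knowledge_base)

    # Quoted forms such as '(pertenence-to-group io moon) arrive as Hy
    # expression/symbol objects (list and str subclasses), not plain tuples.
    for example in base.examples:
        print(example["text"], "->", tuple(example["parsed"]))

One wrinkle worth flagging: `tuple(...)` above only converts the top level; nested forms
stay Hy expressions, so downstream code that pattern-matches on nested tuples (as the old
tuple-literal examples did) would need a recursive conversion first.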