Compare commits


1 Commit

Author SHA1 Message Date
fe7b550cdb Try to move the tests to hy. 2017-05-23 20:40:09 +02:00
25 changed files with 541 additions and 2317 deletions

.gitignore
View File

@@ -1,7 +1,5 @@
 *#*
 *~
-.vscode
 *.ba?k
 *.pyc
 __pycache__
-treeNLU-*session*.org

View File

@@ -1,4 +0,0 @@
from tree_nlu import cli
if __name__ == '__main__':
    cli.main()

View File

@@ -1 +1,2 @@
 jsondiff
+hy

View File

@@ -11,5 +11,6 @@ setup(name='tree_nlu',
       include_package_data=True,
       install_requires = [
           'jsondiff',
+          'hy',
       ],
       zip_safe=False)

View File

@@ -1,23 +0,0 @@
'''
Analogous to erlang ones.
"An atom is a literal, a constant with name."
'''
from collections import namedtuple
Atom = namedtuple('Atom', field_names='name')
def is_atom(element, name=None):
'''Check if an element is an atom with a specific name.'''
if not isinstance(element, Atom):
return False
if name is None:
return True
return element.name == name
def a(name):
'''Build an atom with a given name.'''
return Atom(name)

View File

@@ -1,65 +0,0 @@
import logging
import datetime
from .session.org_mode import (
global_session as session,
create_global_session,
)
from .knowledge_base import KnowledgeBase
from .visualization import (
show_knowledge,
show_samples,
)
from .tests import gac_100
from .modifiable_property import (
ModifiableProperty,
ModifiablePropertyWithAst,
is_modifiable_property,
)
bye_phrases = ['bye', 'exit']
def gen_session_name():
now = datetime.datetime.utcnow()
return "treeNLU-cli-session-{}.org".format(
now.strftime("%y_%m_%d %H:%M:%S_%f"))
def main():
create_global_session(gen_session_name())
logging.getLogger().setLevel(logging.INFO)
knowledge = gac_100.main()
logging.getLogger().setLevel(logging.DEBUG)
while True:
try:
data = input("> ").strip()
except EOFError:
print("bye")
break
if data.lower() in bye_phrases:
break
if not data:
continue
if data == '/show':
show_knowledge(knowledge)
continue
elif data == '/samples':
show_samples(knowledge)
continue
with session().log(data):
ret = knowledge.process(data)
if ret:
result, _, _ = ret
if not is_modifiable_property(result):
print("<", result)
else:
result.setter()
print("OK")
elif ret is None:
print("- Couldn't understand that, oops... -")
else:
print("Unhandled response:", ret)
print("< Bye!")

View File

@@ -1,65 +1,45 @@
 import copy
 import logging
-from .session.org_mode import global_session as session
-from .atoms import Atom
-from . import layered_model
+from . import parsing
 from . import knowledge_evaluation
 from .modifiable_property import is_modifiable_property
+import random
 def diff_knowledge(before, after):
     import jsondiff
     return jsondiff.diff(before, after)
 class KnowledgeBase(object):
-    def __init__(self, knowledge={}, examples=[], trained=[]):
+    def __init__(self, knowledge, examples=[], trained=[]):
         self.knowledge = copy.copy(knowledge)
-        self.originals = []
         self.examples = copy.copy(examples)
         self.trained = copy.copy(trained)
-        self.layers = layered_model.BaseModel(self)
-    ## Parsing
     def train(self, examples):
         knowledge_before = copy.deepcopy(self.knowledge)
-        with session().log('Train'):
         # Parse everything
-        parsed_examples = []
         for example in examples:
-            # If there's parsed data, leverage it ASAP
-            if 'parsed' in example and isinstance(example['parsed'], tuple):
-                with session().log('parsed information integration'):
-                    result = knowledge_evaluation.integrate_information(self.knowledge, {
-                        "parsed": example['parsed'],
-                    })
-                    self.act_upon(result)
-            with session().log("language integration"):
-                for tokens, decomposition, inferred_tree in self.layers.integrate(self, example):
-                    session().annotate("Tokens: {}".format(tokens))
-                    session().annotate("Inferred tree: {}".format(inferred_tree))
-            with session().log("full information integration"):
-                tokens = self.layers.tokenization.tokenize(example['text'], return_one=True)
+            logging.info("\x1b[7;32m> {} \x1b[0m".format(example))
+            tokens, decomposition, inferred_tree = parsing.integrate_language(self, example)
+            logging.info(tokens)
             result = knowledge_evaluation.integrate_information(self.knowledge, {
                 "elements": tokens,
                 "decomposition": decomposition,
                 "parsed": inferred_tree,
             })
-            session().annotate("Result: {}".format(self.get_value(result)))
+            logging.info("\x1b[7;33m< {} \x1b[0m".format(self.get_value(result)))
             self.act_upon(result)
-            session().annotate("Set: {}".format(self.get_value(result)))
+            logging.info("\x1b[7;34m> set: {} \x1b[0m".format(self.get_value(result)))
             self.examples.append((decomposition, inferred_tree))
-            self.originals.append(example['text'])
         # Reduce values
-        with session().log("reprocessing"):
-            res = self.layers.reprocess(self.examples)
-            self.trained = res
+        self.trained = parsing.reprocess_language_knowledge(self, self.examples)
         knowledge_after = copy.deepcopy(self.knowledge)
         knowledge_diff_getter = lambda: diff_knowledge(knowledge_before,
@@ -67,21 +47,18 @@ class KnowledgeBase(object):
         return knowledge_diff_getter
     def process(self, row):
         knowledge_before = copy.deepcopy(self.knowledge)
-        with session().log("Process: {}".format(row)):
-            fit = list(self.layers.process(self, row))
-            if len(fit) == 0:
-                return None
-            tokens, inferred_tree = fit[0]
+        logging.info("\x1b[7;32m> {} \x1b[0m".format(row))
+        tokens = parsing.to_tokens(row)
+        tokens, inferred_tree = parsing.get_fit(self, tokens)
         result = knowledge_evaluation.integrate_information(self.knowledge,
                                                             {
             "elements": tokens,
             "parsed": inferred_tree,
         })
         self.act_upon(result)
-        session().annotate("Result: {}".format(result))
         knowledge_after = copy.deepcopy(self.knowledge)
         knowledge_diff_getter = lambda: diff_knowledge(knowledge_before,

View File

@@ -1,5 +1,3 @@
from .session.org_mode import global_session as session
from .modifiable_property import ( from .modifiable_property import (
ModifiableProperty, ModifiableProperty,
ModifiablePropertyWithAst, ModifiablePropertyWithAst,
@@ -11,7 +9,6 @@ def resolve(knowledge_base, elements, value):
if isinstance(value, int): if isinstance(value, int):
return elements[value] return elements[value]
elif isinstance(value, tuple) or isinstance(value, list): elif isinstance(value, tuple) or isinstance(value, list):
session().annotate("V: {} {}".format(value, elements))
return integrate_information(knowledge_base, { return integrate_information(knowledge_base, {
"elements": elements, "elements": elements,
"parsed": value, "parsed": value,
@@ -44,42 +41,16 @@ def get_subquery_type(knowledge_base, atom):
def property_for_value(knowledge_base, value): def property_for_value(knowledge_base, value):
if value in knowledge_base:
# Annotate the property as property
groups = knowledge_base[value].get('groups', {'property'})
groups.add('property')
knowledge_base[value]['groups'] = groups
# And find the property "name"
if 'as_property' in knowledge_base[value]:
return knowledge_base[value]['as_property'] return knowledge_base[value]['as_property']
return knowledge_base[value].get('groups', {'property'})
else:
# Consider that any property is... a property
knowledge_base[value] = {'groups': {'property'}}
return {'property'}
def modifiable_property_from_property(prop, path, value): def modifiable_property_from_property(prop, path, value):
def getter(): def getter():
nonlocal prop, path, value nonlocal prop, path, value
if isinstance(path, set):
# If the property is from a set, it's true if any possible
# path has a element as true
return any(map(lambda possible_path: ((possible_path in prop)
and
(prop[possible_path] == value)),
path))
else:
return (path in prop) and prop[path] == value return (path in prop) and prop[path] == value
def setter(): def setter():
nonlocal prop, path, value nonlocal prop, path, value
if isinstance(path, set):
for possible_path in path:
prop[possible_path] = value
else:
prop[path] = value prop[path] = value
return ModifiableProperty( return ModifiableProperty(
@@ -103,31 +74,12 @@ def exists_property_with_value(knowledge_base, elements, subj, value):
def modifiable_element_for_existance_in_set(container, set_name, element): def modifiable_element_for_existance_in_set(container, set_name, element):
session().annotate("-----({} {} {})".format(container, set_name, element))
def getter(): def getter():
nonlocal container, set_name, element nonlocal container, set_name, element
session().annotate(" get({} {} {})".format(container, set_name, element))
return (set_name in container) and (element in container[set_name]) return (set_name in container) and (element in container[set_name])
def setter(): def setter():
nonlocal container, set_name, element nonlocal container, set_name, element
session().annotate(" add({} {} {})".format(container, set_name, element))
return container[set_name].add(element)
return ModifiableProperty(
getter=getter,
setter=setter,
)
def modifiable_element_for_existance_in_group(container, element, backlink, set_name='groups'):
def getter():
nonlocal container, element, backlink, set_name
return (set_name in container) and (element in container[set_name])
def setter():
nonlocal container, set_name, element
backlink['groups'].add(set_name)
return container[set_name].add(element) return container[set_name].add(element)
return ModifiableProperty( return ModifiableProperty(
@@ -140,23 +92,18 @@ def pertenence_to_group(knowledge_base, elements, subj, group):
group = resolve(knowledge_base, elements, group) group = resolve(knowledge_base, elements, group)
if subj not in knowledge_base: if subj not in knowledge_base:
knowledge_base[subj] = {'groups': set()} knowledge_base[subj] = {}
if "groups" not in knowledge_base[subj]: if "groups" not in knowledge_base[subj]:
knowledge_base[subj]["groups"] = set() knowledge_base[subj]["groups"] = set()
if group not in knowledge_base: return modifiable_element_for_existance_in_set(
knowledge_base[group] = {'groups': set()}
if "groups" not in knowledge_base[group]:
knowledge_base[group]["groups"] = set()
return modifiable_element_for_existance_in_group(
container=knowledge_base[subj], container=knowledge_base[subj],
element=group, set_name="groups",
backlink=knowledge_base[group], element=group
) )
def has_capacity(knowledge_base, elements, subj, capacity): def has_capacity(knowledge_base, elements, subj, capacity):
subj = resolve(knowledge_base, elements, subj) subj = resolve(knowledge_base, elements, subj)
capacity = resolve(knowledge_base, elements, capacity) capacity = resolve(knowledge_base, elements, capacity)
@@ -181,70 +128,12 @@ def question(knowledge_base, elements, subj):
return subj.getter() return subj.getter()
return subj return subj
def implies(knowledge_base, elements, precedent, consequent):
precedent = resolve(knowledge_base, elements, precedent)
consequent = resolve(knowledge_base, elements, consequent)
if precedent not in knowledge_base:
knowledge_base[precedent] = {'groups': set()}
if "implications" not in knowledge_base[precedent]:
knowledge_base[precedent]["implications"] = set()
return modifiable_element_for_existance_in_set(
container=knowledge_base[precedent],
set_name="implications",
element=consequent
)
def property_has_value(knowledge_base, elements, subj, prop, value):
subj = resolve(knowledge_base, elements, subj)
prop = resolve(knowledge_base, elements, prop)
value = resolve(knowledge_base, elements, value)
if subj not in knowledge_base:
knowledge_base[subj] = {'groups': set()}
if prop not in knowledge_base[subj]:
knowledge_base[subj][prop] = set()
return modifiable_element_for_existance_in_set(
container=knowledge_base[subj],
set_name=prop,
element=value
)
def perform_verb_over_object(knowledge_base, elements, subj, verb, obj):
subj = resolve(knowledge_base, elements, subj)
verb = resolve(knowledge_base, elements, verb)
obj = resolve(knowledge_base, elements, obj)
session().annotate("({} {} {})".format(verb, subj, obj))
if subj not in knowledge_base:
knowledge_base[subj] = {'groups': set()}
if 'performs-over' not in knowledge_base[subj]:
knowledge_base[subj]['performs-over'] = {}
if verb not in knowledge_base[subj]['performs-over']:
knowledge_base[subj]['performs-over'][verb] = set()
return modifiable_element_for_existance_in_set(
container=knowledge_base[subj]['performs-over'],
set_name=verb,
element=obj
)
knowledge_ingestion = { knowledge_ingestion = {
"exists-property-with-value": exists_property_with_value, "exists-property-with-value": exists_property_with_value,
"pertenence-to-group": pertenence_to_group, "pertenence-to-group": pertenence_to_group,
"has-capacity": has_capacity, "has-capacity": has_capacity,
"question": question, "question": question,
"implies": implies,
"property-has-value": property_has_value,
"perform-verb-over-object": perform_verb_over_object,
} }
@@ -263,29 +152,6 @@ def integrate_information(knowledge_base, example):
args = ast[1:] args = ast[1:]
elements = example.get('elements', None) elements = example.get('elements', None)
session().annotate("Integrating:")
session().annotate("AST: {}".format(ast))
session().annotate("ARG: {}".format(elements))
session().annotate("------------")
return tagged_with_ast( return tagged_with_ast(
ast, elements, ast, elements,
knowledge_ingestion[method](knowledge_base, elements, *args)) knowledge_ingestion[method](knowledge_base, elements, *args))
def can_be_used_in_place(knowledge, token, minisegment):
if token not in knowledge.knowledge:
return True
info = knowledge.knowledge[token]
info_groups = info.get('groups', set())
minisegment_groups = minisegment.get('groups', set())
# Common group
if len(info_groups & minisegment_groups) > 0:
return True
# Neither has a group
elif len(info_groups) == 0 == len(minisegment_groups):
return True
return False

View File

@@ -1,49 +0,0 @@
from .layers import tokenization_layer
from .layers import parsing_layer
from .layers import parsing
from .session.org_mode import global_session as session
def make_yield_pipe(layers, knowledge_base, example, func):
if len(layers) < 1:
yield example
return
input_generator = make_yield_pipe(layers[:-1], knowledge_base, example, func)
for input in input_generator:
session().annotate("[{}] --> {}".format(len(layers), input))
for d in list(func(layers[-1], input)):
yield d
class BaseModel:
def __init__(self, knowledge_base):
self.tokenization = tokenization_layer.TokenizationLayer(knowledge_base)
self.parsing = parsing_layer.ParsingLayer()
self.layers = [
self.tokenization,
self.parsing,
]
def reprocess(self, examples):
pattern_examples = []
for i, sample in enumerate(examples):
other = examples[:i] + examples[i + 1:]
match = parsing.get_matching(sample, other)
if len(match) > 0:
sample = (match, sample[1],)
pattern_examples.append(sample)
return pattern_examples
def integrate(self, knowledge_base, example):
yield from make_yield_pipe(self.layers, knowledge_base,
example, lambda l, i: l.integrate(knowledge_base, i))
def process(self, knowledge_base, example):
yield from make_yield_pipe(self.layers, knowledge_base,
example, lambda l, i: l.process(knowledge_base, i))
def tokenize(self, row, return_one=True):
return self.tokenization.to_tokens(row)

View File

@@ -1,500 +0,0 @@
#!/usr/bin/env python
from ..session.org_mode import global_session as session
import re
import copy
from functools import reduce
from typing import List, Dict
from ..modifiable_property import ModifiableProperty
from .. import parameters
from ..atoms import Atom, a, is_atom
from .. import knowledge_evaluation
def make_template(knowledge_base, tokens, parsed):
matcher = list(tokens)
template = list(parsed)
session().annotate(" -- MK TEMPLATE --")
session().annotate("MATCHR: {}".format(matcher))
session().annotate("TEMPLT: {}".format(template))
for i in range(len(matcher)):
word = matcher[i]
if word in template:
template[template.index(word)] = i
matcher[i] = {
'groups': set(knowledge_base.knowledge.get(word, {}).get('groups', set())),
}
return tokens, matcher, template
def is_bottom_level(tree):
for element in tree:
if isinstance(element, list) or isinstance(element, tuple):
return False
return True
def get_lower_levels(parsed):
lower = []
def aux(subtree, path):
nonlocal lower
deeper = len(path) == 0
for i, element in enumerate(subtree):
if isinstance(element, list) or isinstance(element, tuple):
aux(element, path + (i,))
deeper = True
if not deeper:
lower.append((path, subtree))
aux(parsed, path=())
return lower
# TODO: probably optimize this, it creates lots of unnecessary tuples
def replace_position(tree, position, new_element):
session().annotate("REPLACE POSITIONS:")
session().annotate(" TREE : {}".format(tree))
session().annotate("POSITION: {}".format(position))
session().annotate("NEW ELEM: {}".format(new_element))
session().annotate("------------------")
def aux(current_tree, remaining_route):
if len(remaining_route) == 0:
return new_element
else:
step = remaining_route[0]
return (
tree[:step]
+ (aux(tree[step], remaining_route[1:]),)
+ tree[step + 2:]
)
result = aux(tree, position)
session().annotate("-RESULT: {}".format(result))
return result
def integrate_language(knowledge_base, example):
text = example["text"].lower()
parsed = example["parsed"]
tokens = example['tokens']
resolved_parsed = copy.deepcopy(parsed)
while True:
session().annotate("P: {}".format(resolved_parsed))
lower_levels = get_lower_levels(resolved_parsed)
session().annotate("Lower: {}".format(lower_levels))
if len(lower_levels) == 0:
break
for position, atom in lower_levels:
with session().log("Atom {}".format(atom)):
result = None
similars = get_similar_tree(knowledge_base, atom, tokens)
for similar in similars:
result = build_remix_matrix(knowledge_base, tokens, atom, similar)
if result is not None:
break
else:
raise Exception('Similar not found')
remix, (start_bounds, end_bounds) = result
after_remix = apply_remix(tokens[len(start_bounds):-len(end_bounds)], remix)
session().annotate("--FIND MIX--")
session().annotate("-MIX- | {}".format(remix))
session().annotate("-FRM- | {}".format(tokens))
session().annotate("-AFT- | {}".format(after_remix))
session().annotate("--- TEMPLATE ---")
_, matcher, result = make_template(knowledge_base, after_remix, atom)
session().annotate("Tx: {}".format(after_remix))
session().annotate("Mx: {}".format(matcher))
session().annotate("Rx: {}".format(result))
session().annotate("Sx: {}".format(start_bounds))
session().annotate("Ex: {}".format(end_bounds))
assert(len(after_remix) + len(start_bounds) + len(end_bounds) == len(tokens))
session().annotate( " +-> {}".format(after_remix))
subquery_type = knowledge_evaluation.get_subquery_type(knowledge_base.knowledge, atom)
session().annotate(r" \-> <{}>".format(subquery_type))
# Clean remaining tokens
new_tokens = list(tokens)
offset = len(start_bounds)
for _ in range(len(remix)):
new_tokens.pop(offset)
# TODO: Get a specific types for... types
new_tokens.insert(offset, (subquery_type, remix))
tokens = new_tokens
resolved_parsed = replace_position(resolved_parsed, position, offset)
session().annotate("RP: {}".format(resolved_parsed))
session().annotate("AT: {}".format(atom))
session().annotate("#########")
tokens, matcher, result = make_template(knowledge_base, tokens, resolved_parsed)
session().annotate("T: {}".format(tokens))
session().annotate("M: {}".format(matcher))
session().annotate("R: {}".format(result))
session().annotate("---")
yield tokens, matcher, result
def apply_remix(tokens, remix):
rebuilt = []
for i in remix:
if isinstance(i, int):
if i >= len(tokens):
return None
rebuilt.append(tokens[i])
else:
assert(isinstance(i, str))
rebuilt.append(i)
return rebuilt
def build_remix_matrix(knowledge_base, tokens, atom, similar):
tokens = list(tokens)
with session().log("Remix matrix for {} - {}".format(tokens, atom)):
tokens, matcher, result = make_template(knowledge_base, tokens, atom)
similar_matcher, similar_result, similar_result_resolved, _, _ = similar
start_bounds, end_bounds = find_bounds(knowledge_base, matcher, similar_matcher)
for i, element in (end_bounds + start_bounds[::-1]):
matcher.pop(i)
tokens.pop(i)
possible_remixes = get_possible_remixes(knowledge_base, matcher, similar_matcher)
session().annotate("Possible remixes: {}".format(possible_remixes))
if len(possible_remixes) < 1:
return None
chosen_remix = possible_remixes[0]
return chosen_remix, (start_bounds, end_bounds)
def get_possible_remixes(knowledge_base, matcher, similar_matcher):
matrix = []
with session().log("Possible remixes from matcher: {}".format(matcher)):
for element in matcher:
with session().log("Element `{}`".format(element)):
session().annotate("Similar `{}`".format(similar_matcher))
if element in similar_matcher or isinstance(element, dict):
if isinstance(element, dict):
indexes = all_matching_indexes(knowledge_base, similar_matcher, element)
session().annotate("Dict element matching: {}".format(indexes))
else:
indexes = all_indexes(similar_matcher, element)
session().annotate("* element matching: {}".format(indexes))
matrix.append(indexes)
else:
session().annotate("`else` element matching: [element]")
matrix.append([element])
# TODO: do some scoring to find the most "interesting combination"
return [list(x) for x in list(zip(*matrix))]
def all_indexes(collection, element):
indexes = []
base = 0
for _ in range(collection.count(element)):
i = collection.index(element, base)
base = i + 1
indexes.append(i)
return indexes
def all_matching_indexes(knowledge_base, collection, element):
indexes = []
with session().log('Matching “{}'.format(element)):
assert("groups" in element)
element = element["groups"]
for i, instance in enumerate(collection):
session().log('Checking “{}'.format(instance))
if isinstance(instance, dict):
instance = instance["groups"]
elif instance in knowledge_base.knowledge:
session().log('Knowledge about “{}”: ”{}'.format(instance, knowledge_base.knowledge[instance]))
if "groups" not in knowledge_base.knowledge[instance]:
# This means that is only known as token
# so we should try to avoid using it
continue
instance = knowledge_base.knowledge[instance]["groups"]
intersection = set(instance) & set(element)
if (len(intersection) > 0 or (0 == len(instance) == len(element))):
indexes.append((i, intersection))
return [x[0] for x in sorted(indexes, key=lambda x: len(x[1]), reverse=True)]
def element_matches_groups(knowledge, element: Dict, groups):
with session().log("Checking if e “{}” matches groups “{}".format(element, groups)):
if isinstance(groups, str) and groups in knowledge:
return len(knowledge[groups].get("groups", set()) & element['groups']) > 0
elif isinstance(groups, dict):
return len(element.get("groups", set()) & element['groups']) > 0
return False
def find_bounds(knowledge, matcher, similar_matcher):
start_bounds = []
for i, element in enumerate(matcher):
if element in similar_matcher:
break
else:
start_bounds.append((i, element))
end_bounds = []
for i, element in enumerate(matcher[::-1]):
in_similar = False
if isinstance(element, str):
in_similar = element in similar_matcher
elif isinstance(element, dict):
in_similar = any(map(lambda groups: element_matches_groups(knowledge.knowledge,
element, groups),
similar_matcher))
if in_similar:
break
else:
end_bounds.append((len(matcher) - (i + 1), element))
return start_bounds, end_bounds
def get_similar_tree(knowledge_base, atom, tokens):
possibilities = []
# Find matching possibilities
for entry, tree in knowledge_base.trained:
if not is_bottom_level(tree):
continue
if tree[0] == atom[0]:
possibilities.append((entry, tree))
# Sort by more matching elements
sorted_possibilities = []
for (raw, possibility) in possibilities:
resolved = []
for element in atom:
if isinstance(element, str):
resolved.append(element)
else:
resolved.append(knowledge_evaluation.resolve(
knowledge_base.knowledge,
element,
raw))
# TODO: Probably should take into account the categories of the elements in the "intake" ([0]) element
atom_score = sum([resolved[i] == atom[i]
for i
in range(min(len(resolved),
len(atom)))])
token_score = sum([similar_token in tokens
for similar_token
in raw])
sorted_possibilities.append((raw, possibility, resolved, atom_score, token_score))
sorted_possibilities = sorted(sorted_possibilities, key=lambda p: p[3] * 100 + p[4], reverse=True)
if len(sorted_possibilities) < 1:
return []
for i, possibility in enumerate(sorted_possibilities):
similar_matcher, similar_result, similar_result_resolved, _atom_score, _token_score = possibility
with session().log("Like {}".format(similar_matcher)):
session().annotate('AST: {}'.format(similar_result))
session().annotate('Results on: {}'.format(similar_result_resolved))
session().annotate('Atom score: {}'.format(_atom_score))
session().annotate('Token score: {}'.format(_token_score))
return sorted_possibilities
# TODO: unroll this mess
def get_matching(sample, other):
l = len(sample[0])
other = list(filter(lambda x: len(x[0]) == l, other))
for i in range(l):
if len(other) == 0:
return []
if isinstance(sample[0][i], dict): # Dictionaries are compared by groups
other = list(filter(lambda x: isinstance(x[0][i], dict) and
len(x[0][i]['groups'] & sample[0][i]['groups']) > 0,
other))
elif isinstance(sample[0][i], tuple): # Tuples are compared by types [0]
other = list(filter(lambda x: isinstance(x[0][i], tuple) and
x[0][i][0] == sample[0][i][0],
other))
matching = []
for x in range(l): # Generate the combination of this and other(s) matcher
first_sample_data = sample[0][x]
if isinstance(first_sample_data, str):
matching.append(first_sample_data)
elif isinstance(first_sample_data, tuple):
matching.append(first_sample_data)
else:
this_groups = sample[0][x]['groups']
if len(other) > 0:
other_groups = reduce(lambda a, b: a & b,
map(lambda y: y[0][x]['groups'],
other))
this_groups = this_groups & other_groups
matching.append({'groups': this_groups})
return matching
def reverse_remix(tree_section, remix):
result_section = []
offset = 0
for origin in remix:
if isinstance(origin, int):
if (origin + offset) >= len(tree_section):
return None
result_section.append(copy.deepcopy(tree_section[origin + offset]))
else:
assert(isinstance(origin, str))
offset += 1
return result_section + tree_section[len(remix):]
def get_fit(knowledge, tokens, remaining_recursions=parameters.MAX_RECURSIONS):
results = []
for matcher, ast in knowledge.trained:
with session().log("{} <- {}".format(matcher, tokens)):
result = match_fit(knowledge, tokens, matcher, ast,
remaining_recursions)
if result is not None:
with session().log("Result: {}".format(result)):
results.append(result)
if len(results) > 0:
return results[0]
def is_definite_minisegment(minisegment):
return isinstance(minisegment, str) or isinstance(minisegment, dict)
def match_token(knowledge, next_token, minisegment):
if isinstance(minisegment, dict):
return knowledge_evaluation.can_be_used_in_place(knowledge, next_token, minisegment)
elif isinstance(minisegment, str):
# TODO: check if the two elements can be used in each other place
return next_token == minisegment
return False
def resolve_fit(knowledge, fit, remaining_recursions):
fitted = []
for element in fit:
if is_definite_minisegment(element):
fitted.append(element)
else:
with session().log("Resolving fit of `{}`".format(element)):
((result_type, remixer), tokens) = element
remixed_tokens = reverse_remix(tokens, remixer)
if remixed_tokens is None:
return None
minifit = get_fit(knowledge, remixed_tokens, remaining_recursions - 1)
if minifit is None:
return None
minitokens, miniast = minifit
session().annotate(" AST | {}".format(miniast))
subproperty = knowledge_evaluation.resolve(knowledge.knowledge, minitokens, miniast)
fitted.append(subproperty)
return fitted
def match_fit(knowledge, tokens, matcher, ast, remaining_recursions):
segment_possibilities = [([], tokens)] # Matched tokens, remaining tokens
indent = ' ' * (parameters.MAX_RECURSIONS - remaining_recursions)
session().annotate(indent + 'T> {}'.format(tokens))
session().annotate(indent + 'M> {}'.format(matcher))
for minisegment in matcher:
with session().log("Minisegment `{}`".format(minisegment)):
possibilities_after_round = []
for matched_tokens, remaining_tokens in segment_possibilities:
if len(remaining_tokens) < 1:
continue
session().annotate(indent + "RT {}".format(remaining_tokens[0]))
session().annotate(indent + "DEF {}".format(is_definite_minisegment(minisegment)))
if is_definite_minisegment(minisegment):
# What if not match -----<
if match_token(knowledge, remaining_tokens[0], minisegment):
possibilities_after_round.append((
matched_tokens + [remaining_tokens[0]],
remaining_tokens[1:]
))
else:
# What if not match!!!!!!-----<
# TODO: optimize this with a look ahead
for i in range(1, len(tokens)):
possibilities_after_round.append((
matched_tokens + [(minisegment, remaining_tokens[:i])],
remaining_tokens[i:]
))
session().annotate(indent + "## PA {}".format(possibilities_after_round))
else:
segment_possibilities = possibilities_after_round
for possibility in segment_possibilities:
with session().log("Possibility: `{}`".format(possibility)):
pass
if len(segment_possibilities) < 1:
with session().log("NO POSSIBLE"):
pass
fully_matched_segments = [(matched, remaining)
for (matched, remaining)
in segment_possibilities
if len(remaining) == 0]
resolved_fits = []
with session().log("Full matches"):
for fit, _ in fully_matched_segments:
with session().log(fit): # REMIXES HAVE TO BE APPLIED BEFORE!!!
pass
with session().log("Resolutions"):
for fit, _ in fully_matched_segments:
with session().log("Resolving {}".format(fit)): # REMIXES HAVE TO BE APPLIED BEFORE!!!
resolved_fit = resolve_fit(knowledge, fit, remaining_recursions)
if resolved_fit is not None:
resolved_fits.append(resolved_fit)
else:
session().annotate("Not resolved")
if len(resolved_fits) == 0:
return None
return resolved_fits[0], ast

View File

@@ -1,16 +0,0 @@
from . import parsing
class ParsingLayer:
def __init__(self):
pass
def integrate(self, knowledge_base, example):
yield from parsing.integrate_language(knowledge_base, example)
def train(self, knowledge_base, example):
assert False
def process(self, knowledge_base, input):
fit = parsing.get_fit(knowledge_base, input)
if fit is not None:
yield fit

View File

@@ -1,186 +0,0 @@
from ..session.org_mode import global_session as session
from ..atoms import Atom, a, is_atom
def lookahead_for_tokens_or_strucutral_elements(knowledge_base, remaining):
for se in knowledge_base.structural_elements:
found_position = remaining.find(se)
found = found_position >= 0
session().annotate('Looking for structure with “{}”, found? {}'.format(se, found))
if found:
return [
(remaining[:found_position], se, remaining[found_position + len(se):])
]
for token in knowledge_base.knowledge.keys():
found_position = remaining.find(token)
found = found_position >= 0
session().annotate('Looking for token “{}”, found? {}'.format(token, found))
if found:
return [
(remaining[:found_position], token, remaining[found_position + len(token):])
]
return None
def to_tokens(knowledge_base, text, precedent=None):
if len(text) == 0:
session().annotate("No text remaining")
yield ['']
return
with session().log("Tokenizing {}".format(text)):
for option in knowledge_base.expected_token_after_precedent(precedent):
with session().log("Next: “{}".format(option)):
with session().log("Matching “{}” on “{}".format(option, text)):
for token_match in tokenization_match(option, text, knowledge_base):
if token_match is None:
session().annotate("No match")
match, remaining = token_match
if len(remaining) == len(text):
raise Exception('No text consumed in match')
session().annotate('Match: “{}'.format(match))
with session().log('Remaining “{}'.format(remaining)):
for sublevel in to_tokens(knowledge_base, remaining, match):
candidate = list(filter(lambda x: x != '', [match] + sublevel))
session().annotate('Yielding candidate “{}'.format(candidate))
yield candidate
def tokenization_match(element, text, knowledge_base):
# Constant/structural string matching
if isinstance(element, str):
if text.find(element) == 0:
# This match comes from a structuring element
# It doesn't appear on the tokenization
# So we should return it as an empty string
yield ('', text[len(element):])
return
else:
# No match found
return
elif is_atom(element, 'token'):
yield from match_single_token(text, knowledge_base)
return
raise NotImplementedError()
def match_single_token(text, knowledge_base):
found_token = False
for token in knowledge_base.knowledge.keys():
if text.find(token) == 0:
yield token, text[len(token):]
found_token = True
if found_token:
return
session().annotate('No token found at the start of ”{}'.format(text))
session().annotate('using structural elements to infer it')
# TODO: review this when multiple structural elements are available
for se in knowledge_base.structural_elements:
session().annotate('Looking for se “{}” in “{}'.format(se, text))
position = text.find(se, 0)
found = position > 0 # 0 is not considered a valid position for this kind of split
if found:
session().annotate('Found ”{}”, inferring “{}'.format(se, text[:position]))
yield text[:position], text[position:]
session().annotate('No structural element or token found, inferring only token remaining')
yield text, ''
# Using other tokens for cutoff
for token in knowledge_base.knowledge.keys():
session().annotate('Looking for token “{}” in “{}'.format(token, text))
position = text.find(token)
found = position >= 0
if found:
session().annotate('Found ”{}”, in position ”{}'.format(token, position))
yield text[:position], text[position:]
def integrate_tokenization(knowledge_base, example):
text = example['text']
tokens = example['tokens']
meaning = example.get('meaning')
return integrate_token_to_text_matching(knowledge_base, text, tokens)
def integrate_token_to_text_matching(knowledge_base, text, tokens):
texts = [text]
# Convert to tokens
for token_id, token in enumerate(tokens):
# Look for token in texts
for i, text in enumerate(texts):
if isinstance(text, int):
continue
if token in text:
before, after = text.split(token, maxsplit=1)
texts = (texts[:i] + [before]
+ [a('token')]
+ [after] + texts[i + 1:])
break
else:
raise Exception('Token not found')
# Remove leftovers from splits
texts = list(filter(lambda x: x != '', texts))
session().log("Tokenized as {} over {}".format(texts, tokens))
for i, element in enumerate(texts[:-1]):
learn_token_pair(element, texts[i + 1], knowledge_base)
return tokens
def learn_token_pair(precedent, consequent, knowledge_base):
knowledge_base.add_token_pair(precedent, consequent)
def pick_one_tokenization(options, knowledge_base):
'''
Heuristic function to pick the most probable tokenization.
Just pick the one with more results.
'''
options = list(options)
with session().log("Picking among: {} options".format(len(options))):
session().log("Options: \n{}".format('\n'.join(map(str, options))))
return pick_by_score(options,
[
# By number of splits without structuring elements
lambda tokenization: sum(map(
lambda split: sum(map(
lambda se: se in split, knowledge_base.structural_elements
)), tokenization)),
# By number of unknown tokens
lambda tokenization: len(list(filter(lambda token:
(token not in knowledge_base.knowledge.keys()) and
(token not in knowledge_base.structural_elements),
tokenization))),
# By number of splits
lambda tokenization: -len(tokenization),
])
def pick_by_score(options, heuristics):
for heuristic in heuristics:
assert(len(options) > 0)
options = list(map(lambda opt: (heuristic(opt), opt), options))
sorted_options = sorted(options, key=lambda x: x[0], reverse=False)
heuristic_cutoff = sorted_options[0][0]
session().annotate(sorted_options)
pass_heuristic = [opt for (score, opt) in sorted_options if score <= heuristic_cutoff]
options = pass_heuristic
session().log("{} finalists: \n{}".format(len(options), '\n'.join(map(str, options))))
return options[0]

View File

@@ -1,90 +0,0 @@
from ..session.org_mode import global_session as session
from ..atoms import Atom
from . import tokenization
import random
import copy
def randomized_weighted_list(elements):
# Randomized
randomized = list(elements)
random.shuffle(randomized)
# And return only once
already_returned = set()
for e in randomized:
if e in already_returned:
continue
yield e
already_returned.add(e)
class TokenizationLayer:
def __init__(self, knowledge_base):
self.structural_elements = set()
self.token_chains = {}
self.tokens = set()
self.knowledge_base = knowledge_base
self.knowledge = knowledge_base.knowledge
def integrate(self, knowledge_base, data):
assert knowledge_base is self.knowledge_base
assert 'text' in data
tokens = self.tokenize(data['text'])
data_with_row = copy.copy(data)
data_with_row['tokens'] = tokens
yield data_with_row
# with session().log("Tokenize: {}".format(data['text'])):
# for tokens in tokenization.to_tokens(self, data['text']):
# data_with_row = copy.copy(data)
# data_with_row['tokens'] = tokens
# yield data_with_row
def process(self, knowledge_base, row):
yield self.tokenize(row)
def tokenize(self, row, return_one=True):
row = row.lower()
with session().log("Tokenize: {}".format(row)):
options = list(tokenization.to_tokens(self, row))
session().log("Results:\n{}".format('\n'.join(map(str, options))))
if return_one:
chosen = tokenization.pick_one_tokenization(options, self)
session().log("Chosen: “{}".format(chosen))
self.train({'text': row, 'tokens': chosen})
return chosen
return options
## Tokenization
def add_token_pair(self, precedent, consequent):
self.add_token(precedent)
self.add_token(consequent)
if precedent not in self.token_chains:
self.token_chains[precedent] = []
self.token_chains[precedent].append(consequent)
def add_token(self, token):
self.tokens.add(token)
if (not isinstance(token, Atom)) and (token not in self.structural_elements):
session().annotate('Found new structural element “{}'.format(token))
self.structural_elements.add(token)
def expected_token_after_precedent(self, precedent=None):
if precedent not in self.token_chains: # If there's no known precedent, just return all tokens
return randomized_weighted_list(self.tokens)
return randomized_weighted_list(self.token_chains[precedent])
def train(self, example):
with session().log('Training tokenizer'):
session().annotate("Example: {}".format(example))
tokens = tokenization.integrate_tokenization(self, example)
# Integrate knowledge of concept
for token in tokens:
if not token in self.knowledge:
self.knowledge[token] = {}

View File

@@ -0,0 +1,384 @@
#!/usr/bin/env python
from . import knowledge_evaluation
from . import depth_meter
import logging
import re
import copy
from functools import reduce
from typing import List
from .modifiable_property import ModifiableProperty
from . import parameters
# TODO: more flexible tokenization
def to_tokens(text):
return re.findall(r'(\w+|[^\s])', text)
def make_template(knowledge_base, tokens, parsed):
matcher = list(tokens)
template = list(parsed)
for i in range(len(matcher)):
word = matcher[i]
if word in template:
template[template.index(word)] = i
matcher[i] = {
'groups': set(knowledge_base.knowledge[word]['groups'])
}
return tokens, matcher, template
def is_bottom_level(tree):
for element in tree:
if isinstance(element, list) or isinstance(element, tuple):
return False
return True
def get_lower_levels(parsed):
lower = []
def aux(subtree, path):
nonlocal lower
deeper = len(path) == 0
for i, element in enumerate(subtree):
if isinstance(element, list) or isinstance(element, tuple):
aux(element, path + (i,))
deeper = True
if not deeper:
lower.append((path, subtree))
aux(parsed, path=())
return lower
# TODO: probably optimize this, it creates lots of unnecessary tuples
def replace_position(tree, position, new_element):
def aux(current_tree, remaining_route):
if len(remaining_route) == 0:
return new_element
else:
step = remaining_route[0]
return (
tree[:step]
+ (aux(tree[step], remaining_route[1:]),)
+ tree[step + 2:]
)
return aux(tree, position)
def integrate_language(knowledge_base, example):
text = example["text"].lower()
parsed = example["parsed"]
resolved_parsed = copy.deepcopy(parsed)
tokens = to_tokens(text)
while True:
logging.debug("P: {}".format(resolved_parsed))
lower_levels = get_lower_levels(resolved_parsed)
logging.debug("Lower: {}".format(lower_levels))
if len(lower_levels) == 0:
break
for position, atom in lower_levels:
logging.debug("\x1b[1mSelecting\x1b[0m: {}".format(atom))
similar = get_similar_tree(knowledge_base, atom)
remix, (start_bounds, end_bounds) = build_remix_matrix(knowledge_base, tokens, atom, similar)
_, matcher, result = make_template(knowledge_base, tokens, atom)
logging.debug("Tx: {}".format(tokens))
logging.debug("Mx: {}".format(matcher))
logging.debug("Rx: {}".format(result))
logging.debug("Remix: {}".format(remix))
after_remix = apply_remix(tokens[len(start_bounds):-len(end_bounds)], remix)
assert(len(after_remix) + len(start_bounds) + len(end_bounds) == len(tokens))
logging.debug( " +-> {}".format(after_remix))
subquery_type = knowledge_evaluation.get_subquery_type(knowledge_base.knowledge, atom)
logging.debug(r" \-> <{}>".format(subquery_type))
# Clean remaining tokens
new_tokens = list(tokens)
offset = len(start_bounds)
for _ in range(len(remix)):
new_tokens.pop(offset)
# TODO: Get a specific types for... types
new_tokens.insert(offset, (subquery_type, remix))
tokens = new_tokens
resolved_parsed = replace_position(resolved_parsed, position, offset)
logging.debug("#########")
tokens, matcher, result = make_template(knowledge_base, tokens, resolved_parsed)
logging.debug("T: {}".format(tokens))
logging.debug("M: {}".format(matcher))
logging.debug("R: {}".format(result))
logging.debug("---")
return tokens, matcher, result
def apply_remix(tokens, remix):
rebuilt = []
for i in remix:
rebuilt.append(tokens[i])
return rebuilt
def build_remix_matrix(knowledge_base, tokens, atom, similar):
tokens = list(tokens)
tokens, matcher, result = make_template(knowledge_base, tokens, atom)
similar_matcher, similar_result, similar_result_resolved, _ = similar
start_bounds, end_bounds = find_bounds(matcher, similar_matcher)
for i, element in (end_bounds + start_bounds[::-1]):
matcher.pop(i)
tokens.pop(i)
possible_remixes = get_possible_remixes(knowledge_base, matcher, similar_matcher)
chosen_remix = possible_remixes[0]
return chosen_remix, (start_bounds, end_bounds)
def get_possible_remixes(knowledge_base, matcher, similar_matcher):
matrix = []
for element in matcher:
logging.debug("- {}".format(element))
logging.debug("+ {}".format(similar_matcher))
assert(element in similar_matcher or isinstance(element, dict))
if isinstance(element, dict):
indexes = all_matching_indexes(knowledge_base, similar_matcher, element)
else:
indexes = all_indexes(similar_matcher, element)
matrix.append(indexes)
# TODO: do some scoring to find the most "interesting combination"
return [list(x) for x in list(zip(*matrix))]
def all_indexes(collection, element):
indexes = []
base = 0
for _ in range(collection.count(element)):
i = collection.index(element, base)
base = i + 1
indexes.append(i)
return indexes
def all_matching_indexes(knowledge_base, collection, element):
indexes = []
assert("groups" in element)
element = element["groups"]
for i, instance in enumerate(collection):
if isinstance(instance, dict):
instance = instance["groups"]
elif instance in knowledge_base.knowledge:
instance = knowledge_base.knowledge[instance]["groups"]
intersection = set(instance) & set(element)
if len(intersection) > 0:
indexes.append((i, intersection))
return [x[0] for x in sorted(indexes, key=lambda x: len(x[1]), reverse=True)]
def find_bounds(matcher, similar_matcher):
start_bounds = []
for i, element in enumerate(matcher):
if element in similar_matcher:
break
else:
start_bounds.append((i, element))
end_bounds = []
for i, element in enumerate(matcher[::-1]):
if element in similar_matcher:
break
else:
end_bounds.append((len(matcher) - (i + 1), element))
return start_bounds, end_bounds
def get_similar_tree(knowledge_base, atom):
possibilities = []
# Find matching possibilities
for entry, tree in knowledge_base.trained:
if not is_bottom_level(tree):
continue
if tree[0] == atom[0]:
possibilities.append((entry, tree))
# Sort by more matching elements
sorted_possibilities = []
for (raw, possibility) in possibilities:
resolved = []
for element in atom:
if isinstance(element, str):
resolved.append(element)
else:
resolved.append(knowledge_evaluation.resolve(
knowledge_base.knowledge,
element,
raw))
# TODO: Probably should take into account the categories of the elements in the "intake" ([0]) element
score = sum([resolved[i] == atom[i]
for i
in range(min(len(resolved),
len(atom)))])
sorted_possibilities.append((raw, possibility, resolved, score))
sorted_possibilities = sorted(sorted_possibilities, key=lambda p: p[3], reverse=True)
if len(sorted_possibilities) < 1:
return None
return sorted_possibilities[0]
# TODO: unroll this mess
def get_matching(sample, other):
l = len(sample[0])
other = list(filter(lambda x: len(x[0]) == l, other))
for i in range(l):
if len(other) == 0:
return []
if isinstance(sample[0][i], dict): # Dictionaries are compared by groups
other = list(filter(lambda x: isinstance(x[0][i], dict) and
len(x[0][i]['groups'] & sample[0][i]['groups']) > 0,
other))
elif isinstance(sample[0][i], tuple): # Tuples are compared by types [0]
other = list(filter(lambda x: isinstance(x[0][i], tuple) and
x[0][i][0] == sample[0][i][0],
other))
return [sample[0][x] if isinstance(sample[0][x], str)
else
sample[0][x] if isinstance(sample[0][x], tuple)
else {'groups': sample[0][x]['groups'] & reduce(lambda a, b: a & b,
map(lambda y: y[0][x]['groups'],
other))}
for x
in range(l)]
def reprocess_language_knowledge(knowledge_base, examples):
examples = knowledge_base.examples + examples
pattern_examples = []
for i, sample in enumerate(examples):
other = examples[:i] + examples[i + 1:]
match = get_matching(sample, other)
if len(match) > 0:
sample = (match, sample[1],)
pattern_examples.append(sample)
return pattern_examples
def reverse_remix(tree_section, remix):
result_section = []
for origin in remix:
result_section.append(copy.deepcopy(tree_section[origin]))
return result_section + tree_section[len(remix):]
def get_fit(knowledge, tokens, remaining_recursions=parameters.MAX_RECURSIONS):
for matcher, ast in knowledge.trained:
result = match_fit(knowledge, tokens, matcher, ast,
remaining_recursions)
if result is not None:
return result
return None
def is_definite_minisegment(minisegment):
return isinstance(minisegment, str) or isinstance(minisegment, dict)
def match_token(knowledge, next_token, minisegment):
if isinstance(minisegment, dict):
# TODO: check if the dictionary matches the values
return True
elif isinstance(minisegment, str):
# TODO: check if the two elements can be used in each other place
return next_token == minisegment
return False
def resolve_fit(knowledge, fit, remaining_recursions):
fitted = []
for element in fit:
if is_definite_minisegment(element):
fitted.append(element)
else:
((result_type, remixer), tokens) = element
remixed_tokens = reverse_remix(tokens, remixer)
minifit = get_fit(knowledge, remixed_tokens, remaining_recursions - 1)
if minifit is None:
return None
minitokens, miniast = minifit
subproperty = knowledge_evaluation.resolve(knowledge.knowledge, minitokens, miniast)
fitted.append(subproperty)
return fitted
def match_fit(knowledge, tokens, matcher, ast, remaining_recursions):
segment_possibilities = [([], tokens)] # Matched tokens, remaining tokens
for minisegment in matcher:
possibilities_after_round = []
for matched_tokens, remaining_tokens in segment_possibilities:
if len(remaining_tokens) < 1:
continue
if is_definite_minisegment(minisegment):
if match_token(knowledge, remaining_tokens[0], minisegment):
possibilities_after_round.append((
matched_tokens + [remaining_tokens[0]],
remaining_tokens[1:]
))
else:
# TODO: optimize this with a look ahead
for i in range(1, len(tokens)):
possibilities_after_round.append((
matched_tokens + [(minisegment, remaining_tokens[:i])],
remaining_tokens[i:]
))
else:
segment_possibilities = possibilities_after_round
fully_matched_segments = [(matched, remaining)
for (matched, remaining)
in segment_possibilities
if len(remaining) == 0]
resolved_fits = []
for fit, _ in fully_matched_segments:
resolved_fit = resolve_fit(knowledge, fit, remaining_recursions)
if resolved_fit is not None:
resolved_fits.append(resolved_fit)
if len(resolved_fits) == 0:
return None
return resolved_fits[0], ast
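To make the tokenizer in the new parsing module above concrete (a small illustrative snippet, not part of this commit), the (\w+|[^\s]) pattern yields word runs plus each remaining non-whitespace character as its own token:

import re

def to_tokens(text):
    # Word runs, or any single non-whitespace character (e.g. punctuation).
    return re.findall(r'(\w+|[^\s])', text)

print(to_tokens("is icecream cold?"))
# -> ['is', 'icecream', 'cold', '?']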

View File

@@ -1,79 +0,0 @@
import logging
import datetime
SESSION = None
def __gen_session_name__():
now = datetime.datetime.utcnow()
return "treeNLU-session-{}.org".format(
now.strftime("%y_%m_%d %H:%M:%S_%f"))
def create_global_session(fname):
global SESSION
SESSION = OrgModeSession(fname)
def global_session():
if SESSION is None:
session_name = __gen_session_name__()
logging.warn("Session not created, saved on {}".format(session_name))
create_global_session(session_name)
assert(SESSION is not None)
return SESSION
def get_header():
now = datetime.datetime.utcnow()
return ("# Ran on {}\n".format(
now.strftime("%y/%m/%d %H:%M:%S.%f")))
class LevelContext:
def __init__(self, increaser, decreaser):
self.increaser = increaser
self.decreaser = decreaser
def __enter__(self):
self.increaser()
def __exit__(self, _type, _value, _traceback):
self.decreaser()
class OrgModeSession:
def __init__(self, fname):
self.f = open(fname, 'wt')
self.level = 0
self.dirty = False
self.f.write(get_header())
def annotate(self, annotation):
if self.dirty:
self.f.write("{indentation} {data}\n".format(
indentation='*' * (self.level + 1),
data="---"))
self.dirty = False
self.f.write("{indentation} {data}\n".format(
indentation=' ' * (self.level + 2 + 1),
data=annotation))
def log(self, string):
self.f.write("{indentation} {data}\n".format(
indentation='*' * (self.level + 1),
data=string))
self.dirty = False
return LevelContext(self.inc_level, self.dec_level)
def inc_level(self):
self.level += 1
def dec_level(self):
self.level -= 1
self.dirty = True
def close(self):
self.f.close()

View File

@@ -1,50 +1,61 @@
-import traceback
+import json
 import logging
-from .session import org_mode
-from .tests import tokenization
-from .tests import basic
-from .tests import gac_100
-from .tests import gac_extension
-logging.getLogger().setLevel(logging.ERROR)
-tests = (
-    ("tokenization", tokenization),
-    ("basic", basic),
-    ("gac 100", gac_100),
-    ("gac+", gac_extension),
-)
-def gen_session_name():
-    return "treeNLU-test-session.org"
+logging.getLogger().setLevel(logging.INFO)
+from .knowledge_base import KnowledgeBase
+from .modifiable_property import is_modifiable_property
+import hy
+from .tests import base
+def test_assumption(expectedResponse, knowledge, query):
+    logging.info("Query: {}".format(query['text']))
+    logging.info("Expected: {}".format(expectedResponse))
+    result, abstract_tree, diff = knowledge.process(query['text'])
+    end_result = result.getter() if is_modifiable_property(result) else result
+    logging.info("\x1b[0;3{}mResult: {}\x1b[0m".format("1" if end_result != expectedResponse else "2", end_result))
+    assert(end_result == expectedResponse)
 def main():
-    org_mode.create_global_session(gen_session_name())
-    failed = False
-    for test_name, test_module in tests:
-        try:
-            with org_mode.global_session().log(test_name):
-                test_module.main()
-                print(" \x1b[1;32m✓\x1b[0m {}".format(test_name))
-        except AssertionError as ae:
-            print(" \x1b[1;31m✗\x1b[0m {}{}".format(test_name,
-                  ('\n [Assertion] {}'.format(ae.args[0])) if len(ae.args) > 0
-                  else ''))
-            traceback.print_exc()
-            failed = True
-        except Exception as e:
-            print(" \x1b[1;7;31m!\x1b[0m {}\n [Exception] {}".format(test_name, e))
-            failed = True
-            traceback.print_exc()
-            raise
-    org_mode.global_session().close()
-    if failed:
-        exit(1)
+    base.run_tests()
+    knowledge = KnowledgeBase(
+        knowledge=base_knowledge,
+    )
+    differences = knowledge.train(examples)
+    logging.info("----")
+    logging.info(differences())
+    logging.info("----")
+    test_assumption(True, knowledge, {'text': 'earth is a planet'})
+    test_assumption(True, knowledge, {'text': 'is lava dangerous?'})
+    for test in [{'text': 'a bus can run'}, {'text': 'io is a moon'}]:
+        row = test['text']
+        result, inferred_tree, differences = knowledge.process(row)
+        logging.info("result:", result)
+        logging.info(differences())
+        logging.info("---")
+    logging.info('-----')
+    logging.info(json.dumps(sorted(knowledge.knowledge.keys()), indent=4))
+    logging.info('-----')
+    queryTrue = {
+        "text": "is io a moon?",
+        "parsed": ("question", ("pertenence-to-group", "io", "moon"))
+    }
+    queryFalse = {
+        "text": "is io a planet?",
+        "parsed": ("question", ("pertenence-to-group", "io", "planet"))
+    }
+    test_assumption(False, knowledge, queryFalse)
+    test_assumption(True, knowledge, queryTrue)
 if __name__ == '__main__':
     main()
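A note on the import hy line above (illustrative sketch only, not part of this commit): importing hy registers an import hook, after which a .hy module such as tests/base.hy can be imported and used from plain Python. Module paths below assume the tree_nlu package layout shown in this diff.

import hy  # imported for its side effect: registers the .hy import hook

from tree_nlu.tests import base  # loaded from tests/base.hy by the hook

base.run_tests()  # the entry point the new test runner above calls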

View File

@@ -0,0 +1,62 @@
(import [..knowledge_base [KnowledgeBase]])
(setv knowledge-base
{
"icecream" { "groups" (set ["noun" "object" "comestible" "sweet"]) }
"lava" { "groups" (set ["noun" "object"]) }
"earth" { "groups" (set ["noun" "object" "planet"]) }
"io" { "groups" (set ["noun" "object"]) }
"green" { "groups" (set ["noun" "color" "concept"]) }
"plane" { "groups" (set ["noun" "object" "vehicle" "fast"]) }
"car" { "groups" (set ["noun" "object" "vehicle" "slow-ish"]) }
"wale" { "groups" (set ["noun" "object" "living-being"]) }
"cold" { "groups" (set ["property" "temperature"]) "as_property" "temperature" }
"dangerous" { "groups" (set ["property"]) "as_property" "safety" }
"planet" { "groups" (set ["noun" "group"]) }
"moon" { "groups" (set ["noun" "group"]) }
"color" { "groups" (set ["property" "group"]) }
"fly" { "groups" (set ["verb"]) }
"swim" { "groups" (set ["verb"]) }
}
)
(setv examples
[
{ "text" "icecream is cold"
"parsed" '(exists-property-with-value icecream cold) }
{ "text" "is icecream cold?"
"parsed" '(question (exists-property-with-value icecream cold)) }
{ "text" "lava is dangerous"
"parsed" '(exists-property-with-value lava dangerous) }
{ "text" "is lava dangerous?"
"parsed" '(question (exists-property-with-value lava dangerous)) }
{ "text" "earth is a planet"
"parsed" '(pertenence-to-group earth planet) }
{ "text" "io is a moon"
"parsed" '(pertenence-to-group io moon) }
{ "text" "is earth a moon?"
"parsed" '(question (pertenence-to-group earth moon)) }
{ "text" "Green is a color"
"parsed" '(pertenence-to-group green color) }
{ "text" "a plane can fly"
"parsed" '(has-capacity plane fly) }
{ "text" "a wale can swim"
"parsed" '(has-capacity wale swim) }
{
"text" "if earth is a planet it is big"
"parsed" '(implies
(pertenence-to-group earth planet)
(exists-property-with-value earth big)) }
]
)
(defn run_tests []
[
(setv knowledge (KnowledgeBase
knowledge=base_knowledge,
)
)
]
)

View File

@@ -1,166 +0,0 @@
from ..session.org_mode import global_session as session
import json
from ..knowledge_base import KnowledgeBase
from ..modifiable_property import is_modifiable_property
from ..utils.tokenization import train_basic_tokenization
examples = [
{
"text": "icecream is cold",
"parsed": ("exists-property-with-value", 'icecream', 'cold'),
},
{
"text": "is icecream cold?",
"parsed": ("question", ("exists-property-with-value", 'icecream', 'cold'))
},
{
"text": "lava is dangerous",
"parsed": ("exists-property-with-value", 'lava', 'dangerous')
},
{
"text": "is lava dangerous?",
"parsed": ("question", ("exists-property-with-value", 'lava', 'dangerous')),
},
{
"text": "earth is a planet",
"parsed": ("pertenence-to-group", 'earth', 'planet'),
},
{
"text": "io is a moon",
"parsed": ("pertenence-to-group", 'io', 'moon'),
},
{
"text": "is earth a moon?",
"parsed": ("question", ("pertenence-to-group", 'earth', 'moon')),
},
{
"text": "Green is a color",
"parsed": ("pertenence-to-group", 'green', 'color'),
},
{
"text": "a plane can fly",
"parsed": ("has-capacity", 'plane', 'fly')
},
{
"text": "a wale can swim",
"parsed": ("has-capacity", 'wale', 'swim')
},
# {
# "text": "if earth is a planet, it is big",
# "parsed": ("implies",
# ("pertenence-to-group", 'earth', 'planet'),
# ("exists-property-with-value", 'earth', 'big')),
# },
]
base_knowledge = {
'icecream': {
"groups": {'noun', 'object', 'comestible', 'sweet'},
},
'lava': {
"groups": {'noun', 'object'},
},
'earth': {
"groups": {'noun', 'object', 'planet'},
},
'io': {
"groups": {'noun', 'object'},
},
'green': {
"groups": {'noun', 'color', 'concept'},
},
'plane': {
"groups": {'noun', 'object', 'vehicle', 'fast'},
},
'car': {
"groups": {'noun', 'object', 'vehicle', 'slow-ish'},
},
'wale': {
"groups": {'noun', 'object', 'living-being'},
},
'cold': {
"groups": {'property', 'temperature'},
"as_property": "temperature",
},
'dangerous': {
"groups": {'property'},
"as_property": "safety",
},
'planet': {
"groups": {'noun', 'group'},
},
'moon': {
"groups": {'noun', 'group'},
},
'color': {
"groups": {'property', 'group'},
},
'fly': {
"groups": {'verb'},
},
'bus': {
"groups": {'noun'},
},
'run': {
"groups": {'verb'},
},
'swim': {
"groups": {'verb'},
},
}
def test_assumption(expectedResponse, knowledge, query):
with session().log(query['text']):
session().annotate("Expected: {}".format(expectedResponse))
result, abstract_tree, diff = knowledge.process(query['text'])
end_result = result.getter() if is_modifiable_property(result) else result
session().annotate("Result: {}".format(end_result))
if end_result != expectedResponse:
raise AssertionError('{} is not {}'.format(end_result, expectedResponse))
def main():
knowledge = KnowledgeBase(
knowledge=base_knowledge,
)
train_basic_tokenization(knowledge)
for example in examples:
with session().log(example['text']):
differences = knowledge.train([example])
session().annotate("----")
session().annotate(differences())
session().annotate("----")
test_assumption(True, knowledge, {'text': 'earth is a planet'})
test_assumption(True, knowledge, {'text': 'is lava dangerous?'})
for test in [{'text': 'a bus can run'}, {'text': 'io is a moon'}]:
row = test['text']
result, inferred_tree, differences = knowledge.process(row)
session().annotate("result: {}".format(result))
session().annotate(differences())
session().annotate("---")
session().annotate('-----')
session().annotate(json.dumps(sorted(knowledge.knowledge.keys()), indent=4))
session().annotate('-----')
queryTrue = {
"text": "is io a moon?",
"parsed": ("question", ("pertenence-to-group", "io", "moon"))
}
queryFalse = {
"text": "is io a planet?",
"parsed": ("question", ("pertenence-to-group", "io", "planet"))
}
test_assumption(False, knowledge, queryFalse)
test_assumption(True, knowledge, queryTrue)
return knowledge

View File

@ -1,736 +0,0 @@
from ..session.org_mode import global_session as session
from ..knowledge_base import KnowledgeBase
from ..utils.visuals import show_progbar
from ..visualization import show_knowledge
from ..utils.tokenization import train_basic_tokenization
def _assert(args):
assert(args)
def _assert_msg(args, msg):
assert args, msg
examples = [
('full_example',
{
"text": "is icecream cold?",
"affirmation": "icecream is cold",
"parsed": ("question",
("exists-property-with-value", 'icecream', 'cold')),
"answer": True,
"after_execution": [(
lambda knowledge: _assert('cold' in knowledge.knowledge['icecream']['property'])
),],
}),
('full_example',
{
"text": "is earth a planet?",
"affirmation": "earth is a planet",
"parsed": ("question",
("pertenence-to-group", 'earth', 'planet')),
"answer": True,
"after_execution": [(
lambda knowledge: _assert('planet' in knowledge.knowledge['earth']['groups'])
),],
}),
('full_example',
{
"text": "Is green a color?",
"affirmation": "green is a color",
"parsed": ("question",
("pertenence-to-group", 'green', 'color')),
"answer": True,
"after_execution": [(
lambda knowledge: _assert('color' in knowledge.knowledge['green']['groups'])
),],
}),
('full_example',
{
"text": "do airplanes fly?",
"affirmation": "airplanes fly",
"parsed": ("question",
("has-capacity", 'plane', 'fly')),
"answer": True,
"after_execution": [(
lambda knowledge: _assert('fly' in knowledge.knowledge['plane']['capacities'])
),],
}),
('full_example',
{
"text": "Is it hot during the summer?",
"affirmation": "it is hot during summer",
"parsed": ("question",
("implies", 'summer', 'hot')),
"answer": True,
"after_execution": [(
lambda knowledge: _assert('hot' in knowledge.knowledge['summer']['implications'])
),],
}),
('full_example',
{
"text": "is chile in south america ?",
"affirmation": "chile is in south america",
"parsed": ("question",
("property-has-value", 'chile', 'location', 'south america')),
"answer": True,
"after_execution": [(
lambda knowledge: _assert('south america' in knowledge.knowledge['chile']['location'])
),],
}),
('full_example',
{
"text": "Was Socrates a man?",
"affirmation": "Socrates was a man",
"parsed": ("question",
("pertenence-to-group", 'socrates', 'man')),
"answer": True,
"after_execution": [(
lambda knowledge: _assert('man' in knowledge.knowledge['socrates']['groups'])
),],
}),
('full_example',
{
"text": "Computers use electricity?",
"affirmation": "Computers use electricity",
"parsed": ("question",
('perform-verb-over-object', 'computers', 'use', 'electricity')),
"answer": True,
"after_execution": [(
lambda knowledge: _assert('electricity' in knowledge.knowledge['computers']['performs-over']['use'])
),],
}),
# ('full_example',
# {
# "text": "The dominant language in france is french?",
# "affirmation": "The dominant language in france is french",
# "parsed": ("question",
# ("property-has-value", "france", "dominant-language", "french")),
# "answer": True,
# }),
# {
# "text": "was abraham lincoln once president of the united states?",
# "affirmation": "was abraham lincoln once president of the united states?",
# "parsed": (),
# "answer": None,
# },
('text_example',
{
"question": "is milk white?",
"affirmation": "milk is white",
"answer": True,
}),
# {
# "text": "do people have emotions?",
# "affirmation": "do people have emotions?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "do objects appear smaller as they move away from you?",
# "affirmation": "do objects appear smaller as they move away from you?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Does the human species have a male and female gender?",
# "affirmation": "Does the human species have a male and female gender?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Is a mountain mostly made of rock?",
# "affirmation": "Is a mountain mostly made of rock?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "is sun microsystems a computer company?",
# "affirmation": "is sun microsystems a computer company?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Do you see with your eyes and smell with your nose?",
# "affirmation": "Do you see with your eyes and smell with your nose?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Is smoking bad for your health?",
# "affirmation": "Is smoking bad for your health?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Does a dog have four legs?",
# "affirmation": "Does a dog have four legs?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Do mammals have hearts?",
# "affirmation": "Do mammals have hearts?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "is the Earth a planet?",
# "affirmation": "is the Earth a planet?",
# "parsed": (),
# "answer": None,
# },
# ('text_example',
# {
# "question": "is water a liquid?",
# "affirmation": "water is a liquid",
# "answer": True,
# }),
# {
# "text": "Is Bugs Bunny a cartoon character?",
# "affirmation": "Is Bugs Bunny a cartoon character?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Do Humans communicate by Telephone?",
# "affirmation": "Do Humans communicate by Telephone?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "is beer a drink ?",
# "affirmation": "is beer a drink ?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "are there 12 months in a year?",
# "affirmation": "are there 12 months in a year?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "does the sun hurt your eyes when you look at it?",
# "affirmation": "does the sun hurt your eyes when you look at it?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Do most cars have doors?",
# "affirmation": "Do most cars have doors?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "is orange both a fruit and a colour?",
# "affirmation": "is orange both a fruit and a colour?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Is water a necessity?",
# "affirmation": "Is water a necessity?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Do CDs have better quality sound than Cassettes?",
# "affirmation": "Do CDs have better quality sound than Cassettes?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "do animals die?",
# "affirmation": "do animals die?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Is the arctic cold?",
# "affirmation": "Is the arctic cold?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Do people have 2 eyes?",
# "affirmation": "Do people have 2 eyes?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "does a person have a brain?",
# "affirmation": "does a person have a brain?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Is the rain wet?",
# "affirmation": "Is the rain wet?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Is division a mathematical operation?",
# "affirmation": "Is division a mathematical operation?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "is 400 greater than 399?",
# "affirmation": "is 400 greater than 399?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "is magenta a color?",
# "affirmation": "is magenta a color?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Are books educational?",
# "affirmation": "Are books educational?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Was the Great Wall of China built by humans?",
# "affirmation": "Was the Great Wall of China built by humans?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Are pianos musical instruments?",
# "affirmation": "Are pianos musical instruments?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Has Bill Clinton been President of the United States?",
# "affirmation": "Has Bill Clinton been President of the United States?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Is a whale a mammal?",
# "affirmation": "Is a whale a mammal?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Are lemons yellow?",
# "affirmation": "Are lemons yellow?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Is the South Pole cold?",
# "affirmation": "Is the South Pole cold?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Is Africa warm?",
# "affirmation": "Is Africa warm?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Is Antarctica cold?",
# "affirmation": "Is Antarctica cold?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Is rock is generally harder than wood?",
# "affirmation": "Is rock is generally harder than wood?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Do dogs chase cats?",
# "affirmation": "Do dogs chase cats?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "can humans die from cold temperatures?",
# "affirmation": "can humans die from cold temperatures?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "do people enjoy conversation?",
# "affirmation": "do people enjoy conversation?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Is Bill Clinton the President of the United States?",
# "affirmation": "Is Bill Clinton the President of the United States?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Are books a good source of information?",
# "affirmation": "Are books a good source of information?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "are friends different than enemies?",
# "affirmation": "are friends different than enemies?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "are people alive?",
# "affirmation": "are people alive?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Do triangles have 3 sides?",
# "affirmation": "Do triangles have 3 sides?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Is Ice cream cold?",
# "affirmation": "Is Ice cream cold?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Are all sides of a square the same length?",
# "affirmation": "Are all sides of a square the same length?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Do all people eat food?",
# "affirmation": "Do all people eat food?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "do dentists repair teeth?",
# "affirmation": "do dentists repair teeth?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Is America bigger than Japan?",
# "affirmation": "Is America bigger than Japan?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Do all triangles have three sides?",
# "affirmation": "Do all triangles have three sides?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "A grocery store sales food?",
# "affirmation": "A grocery store sales food?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Does a sunburn cause pain?",
# "affirmation": "Does a sunburn cause pain?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Is a computer an invention?",
# "affirmation": "Is a computer an invention?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "have humans visited the moon?",
# "affirmation": "have humans visited the moon?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Are there people in India?",
# "affirmation": "Are there people in India?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Was Einstein a genius?",
# "affirmation": "Was Einstein a genius?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Are we on the planet earth?",
# "affirmation": "Are we on the planet earth?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "do people comb their hair in the morning?",
# "affirmation": "do people comb their hair in the morning?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Does it hurt to lose a friend?",
# "affirmation": "Does it hurt to lose a friend?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Are there people on the earth?",
# "affirmation": "Are there people on the earth?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Was George Washington a president of the United States of America?",
# "affirmation": "Was George Washington a president of the United States of America?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Does an ocean have salt water in it?",
# "affirmation": "Does an ocean have salt water in it?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Is night darker than day?",
# "affirmation": "Is night darker than day?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Does a triangle have three sides?",
# "affirmation": "Does a triangle have three sides?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Are peaches fruit?",
# "affirmation": "Are peaches fruit?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Do people urinate?",
# "affirmation": "Do people urinate?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Is Germany located in Europe?",
# "affirmation": "Is Germany located in Europe?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Do mirrors reflect light?",
# "affirmation": "Do mirrors reflect light?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Are people born naked?",
# "affirmation": "Are people born naked?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Is it hot near the equator?",
# "affirmation": "Is it hot near the equator?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "is paper made from trees?",
# "affirmation": "is paper made from trees?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Can a female have children?",
# "affirmation": "Can a female have children?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Are people born every day?",
# "affirmation": "Are people born every day?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Are shoes worn on the feet?",
# "affirmation": "Are shoes worn on the feet?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "does it get wet when it rains?",
# "affirmation": "does it get wet when it rains?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Are there plants and insects in the rainforest which have no names?",
# "affirmation": "Are there plants and insects in the rainforest which have no names?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Do people eat pigs?",
# "affirmation": "Do people eat pigs?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Do businessmen wear ties?",
# "affirmation": "Do businessmen wear ties?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Is New York in the United States?",
# "affirmation": "Is New York in the United States?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Are humans more intelligent than ants?",
# "affirmation": "Are humans more intelligent than ants?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Are ravens black?",
# "affirmation": "Are ravens black?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Are there rats on ships?",
# "affirmation": "Are there rats on ships?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "are lions animals?",
# "affirmation": "are lions animals?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "6 is greater than 5?",
# "affirmation": "6 is greater than 5?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Is water made of hydrogen and oxygen?",
# "affirmation": "Is water made of hydrogen and oxygen?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "is the sky blue on a clear day?",
# "affirmation": "is the sky blue on a clear day?",
# "parsed": (),
# "answer": None,
# },
# {
# "text": "Do most people work during the day?",
# "affirmation": "Do most people work during the day?",
# "parsed": (),
# "answer": None,
# },
]
base_knowledge = {
'summer': {
"groups": {'epoch'},
},
'fly': {
"groups": {'verb'},
},
'use': {
"groups": {'verb'},
},
'electricity': {
"groups": {'power'},
},
'airplanes': {},
'white': {
'groups': {'property'},
}
}
def main():
knowledge = KnowledgeBase(
knowledge=base_knowledge,
)
train_basic_tokenization(knowledge)
total = len(examples)
for i, (example_type, data) in enumerate(examples):
if example_type == 'full_example':
affirmation = {
'text': data['affirmation'],
'parsed': data['parsed'][1],
}
question = data
with session().log(data['affirmation']):
show_progbar(i, total, data['affirmation'])
differences = knowledge.train([affirmation])
with session().log(data['text']):
show_progbar(i, total, data['text'])
differences = knowledge.train([question])
session().annotate(differences())
result, _, _ = knowledge.process(data['text'])
if "after_execution" in data:
for f in data["after_execution"]:
f(knowledge)
if result != data['answer']:
raise AssertionError('{} is not {}'.format(result, data['answer']))
elif example_type == 'text_example':
with session().log(data['affirmation']):
show_progbar(i, total, data['affirmation'])
affirmation = data['affirmation']
session().annotate("Processing affirmation: {}".format(affirmation))
_, _, _ = knowledge.process(affirmation)
with session().log(data['question']):
show_progbar(i, total, data['question'])
question = data['question']
session().annotate("Processing question : {}".format(question))
result, _, _ = knowledge.process(question)
if result != data['answer']:
raise AssertionError('{} is not {}'.format(result, data['answer']))
else:
raise NotImplementedError('Example type: {}'.format(example_type))
print("\r\x1b[K", end='')
return knowledge
if __name__ == '__main__':
show_knowledge(main())

View File

@ -1,26 +0,0 @@
from ..knowledge_base import KnowledgeBase
from ..session.org_mode import global_session as session
from . import gac_100
def ask_then_learn_test(knowledge: KnowledgeBase):
with session().log("is icecream blue?"):
ret, _, _ = knowledge.process("is icecream blue?")
assert(ret is False)
with session().log("icecream is blue"):
ret, _, _ = knowledge.process("icecream is blue")
with session().log("is icecream blue?"):
ret, _, _ = knowledge.process("is icecream blue?")
assert(ret is True)
return knowledge
def main():
knowledge = gac_100.main()
knowledge.knowledge['blue'] = {'groups': {'property'}}
knowledge = ask_then_learn_test(knowledge)

View File

@ -1,80 +0,0 @@
from ..session.org_mode import global_session as session
from ..knowledge_base import KnowledgeBase
from ..utils.visuals import show_progbar
from ..visualization import show_knowledge
def _assert(args):
assert(args)
def _assert_msg(args, msg):
assert args, msg
EXAMPLES = [
('example', {
"text": 'cat',
"tokens": ['cat'],
}),
('example', {
"text": 'cats',
"tokens": ['cats'],
"meaning": { 'cats': ('add-modifier', 'cat', 'plural') },
}),
('example', {
"text": 'text separated by spaces',
"tokens": ['text', 'separated', 'by', 'spaces'],
}),
('example', {
"text": 'is earth a planet?',
"tokens": ['is', 'earth', 'a', 'planet', '?'],
}),
('test', {
"text": 'plane',
"tokens": ['plane'],
}),
# ('test', {
# "text": 'planes',
# "tokens": ['planes'],
# "meaning": { 'planes': ('add-modifier', 'plane', 'plural') },
# }),
('test', {
"text": 'some other text',
"tokens": ['some', 'other', 'text'],
}),
('test', {
"text": 'is the sun a star?',
"tokens": ['is', 'the', 'sun', 'a', 'star', '?'],
}),
('test', {
"text": 'sometextnotseparatedbyspaces',
"tokens": ['some', 'text', 'not', 'separated', 'by', 'spaces'],
})
]
def main():
knowledge = KnowledgeBase()
total = len(EXAMPLES)
for i, (case_type, example) in enumerate(EXAMPLES):
show_progbar(i, total, example['text'])
if case_type == 'example':
with session().log(example['text']):
knowledge.layers.tokenization.train(example)
elif case_type == 'test':
with session().log(example['text']):
tokens = list(knowledge.layers.tokenization.tokenize(example['text']))
session().log('Expected “{}”, found “{}”'
.format(example['tokens'], tokens))
assert example['tokens'] == tokens
else:
raise Exception('Not implemented case {}'.format(case_type))
print("\r\x1b[K", end='')
return knowledge
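As a rough usage sketch of the tokenization layer exercised above (the sample sentence comes from the 'test' cases):

    knowledge = main()   # trains on the 'example' cases listed above
    tokens = list(knowledge.layers.tokenization.tokenize('is the sun a star?'))
    print(tokens)        # the test above expects ['is', 'the', 'sun', 'a', 'star', '?']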

View File

@ -1,4 +0,0 @@
def dumper(obj):
if isinstance(obj, set):
return list(obj)
return obj
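dumper exists so that the set-valued "groups" entries can be serialized; the intended use is presumably as json's default= hook:

    import json
    # Sets are not JSON-serializable on their own; dumper turns them into lists.
    print(json.dumps({'earth': {'groups': {'noun', 'object', 'planet'}}},
                     default=dumper, indent=4))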

View File

@ -1,29 +0,0 @@
from ..session.org_mode import (
global_session as session,
)
BASIC_TOKENIZATION_EXAMPLES = (
({
"text": 'cat',
"tokens": ['cat'],
}),
({
"text": 'cats',
"tokens": ['cats'],
"meaning": { 'cats': ('add-modifier', 'cat', 'plural') },
}),
({
"text": 'text separated by spaces',
"tokens": ['text', 'separated', 'by', 'spaces'],
}),
({
"text": 'is earth a planet?',
"tokens": ['is', 'earth', 'a', 'planet', '?'],
}),
)
def train_basic_tokenization(knowledge_base):
with session().log('Training basic tokenization'):
for example in BASIC_TOKENIZATION_EXAMPLES:
knowledge_base.layers.tokenization.train(example)

View File

@ -1,15 +0,0 @@
def show_progbar(done, total, msg=''):
total_blocks = 10
blocks_done = (done * total_blocks) // total
blocks_to_go = total_blocks - blocks_done
print('\r\x1b[K' # Go to the start of the line
'\x1b[0m' # Restart the "style"
'|' # Put the first "|"
+ blocks_done * '█' # Completed blocks
+ blocks_to_go * ' ' # Uncompleted blocks
+ '\x1b[7m|\x1b[0m' # End the bar
+ ' '
+ msg # Add message
+ '\r' # Go back to the start
, end='')
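A minimal usage sketch for show_progbar, following the same pattern as the test drivers above (the step names are illustrative only):

    steps = ['tokenize', 'train', 'evaluate']
    for i, step in enumerate(steps):
        show_progbar(i, len(steps), step)   # draws (i * 10) // len(steps) blocks plus the message
        # ... perform the step here ...
    print("\r\x1b[K", end='')               # clear the bar afterwards, as the tests do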

View File

@ -1,8 +0,0 @@
def show_knowledge(knowledge):
for key in knowledge.knowledge:
print("\x1b[1m{}\x1b[0m {}".format(key, knowledge.knowledge[key]))
def show_samples(knowledge):
for example in knowledge.originals:
print("{}".format(example))