Improve the remixing and fitting mechanisms.

This commit is contained in:
kenkeiras 2017-06-04 18:55:45 +02:00
commit b42bf37f77
6 changed files with 120 additions and 20 deletions

View File

@ -22,7 +22,6 @@ class KnowledgeBase(object):
knowledge_before = copy.deepcopy(self.knowledge) knowledge_before = copy.deepcopy(self.knowledge)
# Parse everything # Parse everything
parsed_examples = []
for example in examples: for example in examples:
# If there's parsed data, leverage it ASAP # If there's parsed data, leverage it ASAP
if 'parsed' in example: if 'parsed' in example:

View File

@ -1,3 +1,5 @@
import logging
from .modifiable_property import ( from .modifiable_property import (
ModifiableProperty, ModifiableProperty,
ModifiablePropertyWithAst, ModifiablePropertyWithAst,
@ -9,6 +11,7 @@ def resolve(knowledge_base, elements, value):
if isinstance(value, int): if isinstance(value, int):
return elements[value] return elements[value]
elif isinstance(value, tuple) or isinstance(value, list): elif isinstance(value, tuple) or isinstance(value, list):
print("V:", value, elements)
return integrate_information(knowledge_base, { return integrate_information(knowledge_base, {
"elements": elements, "elements": elements,
"parsed": value, "parsed": value,
@ -100,11 +103,17 @@ def exists_property_with_value(knowledge_base, elements, subj, value):
def modifiable_element_for_existance_in_set(container, set_name, element): def modifiable_element_for_existance_in_set(container, set_name, element):
print("-----({} {} {})".format(container, set_name, element))
import traceback
# traceback.print_stack()
def getter(): def getter():
nonlocal container, set_name, element nonlocal container, set_name, element
print(" get({} {} {})".format(container, set_name, element))
return (set_name in container) and (element in container[set_name]) return (set_name in container) and (element in container[set_name])
def setter(): def setter():
print(" add({} {} {})".format(container, set_name, element))
nonlocal container, set_name, element nonlocal container, set_name, element
return container[set_name].add(element) return container[set_name].add(element)
@ -212,6 +221,7 @@ def perform_verb_over_object(knowledge_base, elements, subj, verb, obj):
subj = resolve(knowledge_base, elements, subj) subj = resolve(knowledge_base, elements, subj)
verb = resolve(knowledge_base, elements, verb) verb = resolve(knowledge_base, elements, verb)
obj = resolve(knowledge_base, elements, obj) obj = resolve(knowledge_base, elements, obj)
logging.debug("({} {} {})".format(verb, subj, obj))
if subj not in knowledge_base: if subj not in knowledge_base:
knowledge_base[subj] = {'groups': set()} knowledge_base[subj] = {'groups': set()}
@ -255,6 +265,29 @@ def integrate_information(knowledge_base, example):
args = ast[1:] args = ast[1:]
elements = example.get('elements', None) elements = example.get('elements', None)
logging.debug("Integrating:")
logging.debug("AST: {}".format(ast))
logging.debug("ARG: {}".format(elements))
logging.debug("------------")
return tagged_with_ast( return tagged_with_ast(
ast, elements, ast, elements,
knowledge_ingestion[method](knowledge_base, elements, *args)) knowledge_ingestion[method](knowledge_base, elements, *args))
def can_be_used_in_place(knowledge, token, minisegment):
    """Tell whether `token` is compatible with the slot described by `minisegment`.

    Compatibility is decided purely from the ``'groups'`` entries of both sides:

    * the token must be a key of ``knowledge.knowledge``; otherwise the
      answer is ``False``;
    * if the token's groups and the minisegment's groups share at least one
      element, the answer is ``True``;
    * if neither side declares any group, the answer is ``True``;
    * in every other case the answer is ``False``.

    Args:
        knowledge: object exposing a ``knowledge`` mapping of
            token -> properties dict.
        token: the candidate token to look up in ``knowledge.knowledge``.
        minisegment: dict describing the slot; its optional ``'groups'``
            entry lists the groups accepted in that position.

    Returns:
        bool: ``True`` when the token can be used in that position.
    """
    if token not in knowledge.knowledge:
        return False

    # Coerce both sides to real sets. A knowledge entry written as
    # ``"groups": {}`` holds an empty *dict*, and ``dict & set`` raises
    # TypeError; after coercion it is simply "no groups".
    info_groups = set(knowledge.knowledge[token].get('groups', ()))
    minisegment_groups = set(minisegment.get('groups', ()))

    # Neither has a group
    if not info_groups and not minisegment_groups:
        return True

    # Common group
    return bool(info_groups & minisegment_groups)

View File

@ -20,6 +20,9 @@ def to_tokens(text):
def make_template(knowledge_base, tokens, parsed): def make_template(knowledge_base, tokens, parsed):
matcher = list(tokens) matcher = list(tokens)
template = list(parsed) template = list(parsed)
logging.debug(" -- MK TEMPLATE --")
logging.debug("MATCHR: {}".format(matcher))
logging.debug("TEMPLT: {}".format(template))
for i in range(len(matcher)): for i in range(len(matcher)):
word = matcher[i] word = matcher[i]
if word in template: if word in template:
@ -56,6 +59,11 @@ def get_lower_levels(parsed):
# TODO: probably optimize this, it creates lots of unnecessary tuples # TODO: probably optimize this, it creates lots of unnecessary tuples
def replace_position(tree, position, new_element): def replace_position(tree, position, new_element):
logging.debug("REPLACE POSITIONS:")
logging.debug(" TREE : {}".format(tree))
logging.debug("POSITION: {}".format(position))
logging.debug("NEW ELEM: {}".format(new_element))
logging.debug("------------------")
def aux(current_tree, remaining_route): def aux(current_tree, remaining_route):
if len(remaining_route) == 0: if len(remaining_route) == 0:
@ -69,7 +77,9 @@ def replace_position(tree, position, new_element):
+ tree[step + 2:] + tree[step + 2:]
) )
return aux(tree, position) result = aux(tree, position)
logging.debug("-RESULT: {}".format(result))
return result
def integrate_language(knowledge_base, example): def integrate_language(knowledge_base, example):
@ -90,15 +100,23 @@ def integrate_language(knowledge_base, example):
logging.debug("\x1b[1mSelecting\x1b[0m: {}".format(atom)) logging.debug("\x1b[1mSelecting\x1b[0m: {}".format(atom))
similar = get_similar_tree(knowledge_base, atom, tokens) similar = get_similar_tree(knowledge_base, atom, tokens)
remix, (start_bounds, end_bounds) = build_remix_matrix(knowledge_base, tokens, atom, similar) remix, (start_bounds, end_bounds) = build_remix_matrix(knowledge_base, tokens, atom, similar)
_, matcher, result = make_template(knowledge_base, tokens, atom)
logging.debug("Tx: {}".format(tokens)) after_remix = apply_remix(tokens[len(start_bounds):-len(end_bounds)], remix)
logging.debug("--FIND MIX--")
logging.debug("-MIX- | {}".format(remix))
logging.debug("-FRM- | {}".format(tokens))
logging.debug("-AFT- | {}".format(after_remix))
print()
_, matcher, result = make_template(knowledge_base, after_remix, atom)
logging.debug("Tx: {}".format(after_remix))
logging.debug("Mx: {}".format(matcher)) logging.debug("Mx: {}".format(matcher))
logging.debug("Rx: {}".format(result)) logging.debug("Rx: {}".format(result))
logging.debug("Remix: {}".format(remix))
logging.debug("Sx: {}".format(start_bounds)) logging.debug("Sx: {}".format(start_bounds))
logging.debug("Ex: {}".format(end_bounds)) logging.debug("Ex: {}".format(end_bounds))
after_remix = apply_remix(tokens[len(start_bounds):-len(end_bounds)], remix)
assert(len(after_remix) + len(start_bounds) + len(end_bounds) == len(tokens)) assert(len(after_remix) + len(start_bounds) + len(end_bounds) == len(tokens))
logging.debug( " +-> {}".format(after_remix)) logging.debug( " +-> {}".format(after_remix))
subquery_type = knowledge_evaluation.get_subquery_type(knowledge_base.knowledge, atom) subquery_type = knowledge_evaluation.get_subquery_type(knowledge_base.knowledge, atom)
@ -115,6 +133,8 @@ def integrate_language(knowledge_base, example):
tokens = new_tokens tokens = new_tokens
resolved_parsed = replace_position(resolved_parsed, position, offset) resolved_parsed = replace_position(resolved_parsed, position, offset)
logging.debug("RP: {}".format(resolved_parsed))
logging.debug("AT: {}".format(atom))
logging.debug("#########") logging.debug("#########")
@ -277,6 +297,14 @@ def get_similar_tree(knowledge_base, atom, tokens):
if len(sorted_possibilities) < 1: if len(sorted_possibilities) < 1:
return None return None
for i, possibility in enumerate(sorted_possibilities):
logging.debug('---- POSSIBILITY #{} ----'.format(i))
similar_matcher, similar_result, similar_result_resolved, _, _ = possibility
logging.debug('AST: {}'.format(similar_result))
logging.debug('Based on: {}'.format(similar_matcher))
logging.debug('Results on: {}'.format(similar_result_resolved))
logging.debug('---------------------')
return sorted_possibilities[0] return sorted_possibilities[0]
@ -336,7 +364,7 @@ def reverse_remix(tree_section, remix):
offset = 0 offset = 0
for origin in remix: for origin in remix:
if isinstance(origin, int): if isinstance(origin, int):
if origin >= len(tree_section): if (origin + offset) >= len(tree_section):
return None return None
result_section.append(copy.deepcopy(tree_section[origin + offset])) result_section.append(copy.deepcopy(tree_section[origin + offset]))
@ -347,13 +375,18 @@ def reverse_remix(tree_section, remix):
def get_fit(knowledge, tokens, remaining_recursions=parameters.MAX_RECURSIONS): def get_fit(knowledge, tokens, remaining_recursions=parameters.MAX_RECURSIONS):
results = []
for matcher, ast in knowledge.trained: for matcher, ast in knowledge.trained:
result = match_fit(knowledge, tokens, matcher, ast, result = match_fit(knowledge, tokens, matcher, ast,
remaining_recursions) remaining_recursions)
if result is not None:
return result
return None if result is not None:
results.append(result)
print("XXX", result)
print(' - ' + '\n - '.join(map(str, results)))
if len(results) > 0:
return results[0]
def is_definite_minisegment(minisegment): def is_definite_minisegment(minisegment):
@ -362,8 +395,7 @@ def is_definite_minisegment(minisegment):
def match_token(knowledge, next_token, minisegment): def match_token(knowledge, next_token, minisegment):
if isinstance(minisegment, dict): if isinstance(minisegment, dict):
# TODO: check if the dictionary matches the values return knowledge_evaluation.can_be_used_in_place(knowledge, next_token, minisegment)
return True
elif isinstance(minisegment, str): elif isinstance(minisegment, str):
# TODO: check if the two elements can be used in each other place # TODO: check if the two elements can be used in each other place
return next_token == minisegment return next_token == minisegment
@ -382,11 +414,20 @@ def resolve_fit(knowledge, fit, remaining_recursions):
if remixed_tokens is None: if remixed_tokens is None:
return None return None
# if len(tokens) == 3 and tokens[2] == 'electricity':
# logging.debug("--UNMIX--")
# logging.debug("-MIX- | {}".format(remixer))
# logging.debug("REMIX | {}".format(tokens))
# logging.debug(" T O | {}".format(remixed_tokens))
# if remixer != [0, 1, 2]:
# return None
minifit = get_fit(knowledge, remixed_tokens, remaining_recursions - 1) minifit = get_fit(knowledge, remixed_tokens, remaining_recursions - 1)
if minifit is None: if minifit is None:
return None return None
minitokens, miniast = minifit minitokens, miniast = minifit
logging.debug(" AST | {}".format(miniast))
subproperty = knowledge_evaluation.resolve(knowledge.knowledge, minitokens, miniast) subproperty = knowledge_evaluation.resolve(knowledge.knowledge, minitokens, miniast)
fitted.append(subproperty) fitted.append(subproperty)
@ -395,6 +436,7 @@ def resolve_fit(knowledge, fit, remaining_recursions):
def match_fit(knowledge, tokens, matcher, ast, remaining_recursions): def match_fit(knowledge, tokens, matcher, ast, remaining_recursions):
segment_possibilities = [([], tokens)] # Matched tokens, remaining tokens segment_possibilities = [([], tokens)] # Matched tokens, remaining tokens
indent = ' ' * (parameters.MAX_RECURSIONS - remaining_recursions)
for minisegment in matcher: for minisegment in matcher:
possibilities_after_round = [] possibilities_after_round = []
for matched_tokens, remaining_tokens in segment_possibilities: for matched_tokens, remaining_tokens in segment_possibilities:
@ -424,6 +466,11 @@ def match_fit(knowledge, tokens, matcher, ast, remaining_recursions):
resolved_fits = [] resolved_fits = []
for fit, _ in fully_matched_segments: for fit, _ in fully_matched_segments:
print(indent + ":::", fit) # REMIXES HAVE TO BE APPLIED BEFORE!!!
print(indent + '*' * 20)
for fit, _ in fully_matched_segments:
print(indent + ":::", fit) # REMIXES HAVE TO BE APPLIED BEFORE!!!
resolved_fit = resolve_fit(knowledge, fit, remaining_recursions) resolved_fit = resolve_fit(knowledge, fit, remaining_recursions)
if resolved_fit is not None: if resolved_fit is not None:
resolved_fits.append(resolved_fit) resolved_fits.append(resolved_fit)

View File

@ -3,7 +3,7 @@ import logging
from .tests import basic from .tests import basic
from .tests import gac_100 from .tests import gac_100
logging.getLogger().setLevel(logging.ERROR) logging.getLogger().setLevel(logging.DEBUG)
tests = ( tests = (
("basic", basic), ("basic", basic),

View File

@ -98,6 +98,12 @@ base_knowledge = {
'fly': { 'fly': {
"groups": {'verb'}, "groups": {'verb'},
}, },
'bus': {
"groups": {'noun'},
},
'run': {
"groups": {'verb'},
},
'swim': { 'swim': {
"groups": {'verb'}, "groups": {'verb'},
}, },

View File

@ -1,3 +1,4 @@
import logging
from ..knowledge_base import KnowledgeBase from ..knowledge_base import KnowledgeBase
from ..utils.visuals import show_progbar from ..utils.visuals import show_progbar
@ -91,6 +92,9 @@ examples = [
"parsed": ("question", "parsed": ("question",
('perform-verb-over-object', 'computers', 'use', 'electricity')), ('perform-verb-over-object', 'computers', 'use', 'electricity')),
"answer": True, "answer": True,
"after_execution": [(
lambda knowledge: print("->", knowledge.knowledge['computers'])
),],
}), }),
# { # {
# "text": "The dominant language in france is french?", # "text": "The dominant language in france is french?",
@ -106,7 +110,7 @@ examples = [
# }, # },
('text_example', ('text_example',
{ {
"question": "Is milk white?", "question": "is milk white?",
"affirmation": "milk is white", "affirmation": "milk is white",
"answer": True, "answer": True,
}), }),
@ -659,17 +663,26 @@ base_knowledge = {
'planet': { 'planet': {
"groups": {'noun', 'group'}, "groups": {'noun', 'group'},
}, },
'white': {
"groups": {'noun', 'color', 'concept', 'property'},
},
'green': { 'green': {
"groups": {'noun', 'color', 'concept'}, "groups": {'noun', 'color', 'concept'},
}, },
'milk': {
"groups": {'noun'},
},
'fly': { 'fly': {
"groups": {'verb'}, "groups": {'verb'},
}, },
'computers': {
"groups": {'object'},
},
'use': { 'use': {
"groups": {'verb'}, "groups": {'verb'},
}, },
'electricity': { 'electricity': {
"groups": {}, "groups": {'power'},
}, },
} }
@ -693,24 +706,26 @@ def main():
show_progbar(i, total, data['text']) show_progbar(i, total, data['text'])
differences = knowledge.train([question]) differences = knowledge.train([question])
print(differences())
result, _, _ = knowledge.process(data['text']) result, _, _ = knowledge.process(data['text'])
if result != data['answer']:
raise AssertionError('{} is not {}'.format(result, data['answer']))
if "after_execution" in data: if "after_execution" in data:
for f in data["after_execution"]: for f in data["after_execution"]:
f(knowledge) f(knowledge)
if result != data['answer']:
raise AssertionError('{} is not {}'.format(result, data['answer']))
elif example_type == 'text_example': elif example_type == 'text_example':
show_progbar(i, total, data['affirmation']) show_progbar(i, total, data['affirmation'])
affirmation = data['affirmation'] affirmation = data['affirmation']
logging.debug("Processing affirmation: {}".format(affirmation))
_, _, _ = knowledge.process(affirmation)
show_progbar(i, total, data['question']) show_progbar(i, total, data['question'])
question = data['question'] question = data['question']
logging.debug("Processing question : {}".format(question))
_, _, _ = knowledge.process(affirmation)
result, _, _ = knowledge.process(question) result, _, _ = knowledge.process(question)
if result != data['answer']: if result != data['answer']: