diff --git a/.gitignore b/.gitignore index 961205f..e9d4714 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ *#* *~ +.vscode *.ba?k *.pyc __pycache__ diff --git a/naive-nlu/tree_nlu/knowledge_base.py b/naive-nlu/tree_nlu/knowledge_base.py index 389a70a..3302ea9 100644 --- a/naive-nlu/tree_nlu/knowledge_base.py +++ b/naive-nlu/tree_nlu/knowledge_base.py @@ -4,8 +4,7 @@ import logging from .session.org_mode import global_session as session from .atoms import Atom -from . import parsing -from . import tokenization +from . import layered_model from . import knowledge_evaluation from .modifiable_property import is_modifiable_property import random @@ -15,21 +14,6 @@ def diff_knowledge(before, after): return jsondiff.diff(before, after) -def randomized_weighted_list(elements): - # Randomized - randomized = list(elements) - random.shuffle(randomized) - - # And return only once - already_returned = set() - for e in randomized: - if e in already_returned: - continue - - yield e - already_returned.add(e) - - class KnowledgeBase(object): def __init__(self, knowledge={}, examples=[], trained=[]): @@ -37,41 +21,9 @@ class KnowledgeBase(object): self.originals = [] self.examples = copy.copy(examples) self.trained = copy.copy(trained) - self.structural_elements = set() - self.token_chains = {} - self.tokens = set() - - def add_token_pair(self, precedent, consequent): - self.add_token(precedent) - self.add_token(consequent) - - if precedent not in self.token_chains: - self.token_chains[precedent] = [] - self.token_chains[precedent].append(consequent) - - def add_token(self, token): - self.tokens.add(token) - if (not isinstance(token, Atom)) and (token not in self.structural_elements): - session().annotate('Found new structural element “{}”'.format(token)) - self.structural_elements.add(token) - - def expected_token_after_precedent(self, precedent=None): - if precedent not in self.token_chains: # If there's no known precedent, just return all tokens - return randomized_weighted_list(self.tokens) - - return randomized_weighted_list(self.token_chains[precedent]) - - def train_tokenizer(self, example): - with session().log('Training tokenizer'): - session().annotate("Example: {}".format(example)) - tokens = tokenization.integrate_tokenization(self, example) - - # Integrate knowledge of concept - for token in tokens: - if not token in self.knowledge: - self.knowledge[token] = {} - + self.layers = layered_model.BaseModel(self) + ## Parsing def train(self, examples): knowledge_before = copy.deepcopy(self.knowledge) with session().log('Train'): @@ -86,11 +38,12 @@ class KnowledgeBase(object): self.act_upon(result) with session().log("language integration"): - tokens, decomposition, inferred_tree = parsing.integrate_language(self, example) - session().annotate("Tokens: {}".format(tokens)) - session().annotate("Inferred tree: {}".format(inferred_tree)) + for tokens, decomposition, inferred_tree in self.layers.integrate(self, example): + session().annotate("Tokens: {}".format(tokens)) + session().annotate("Inferred tree: {}".format(inferred_tree)) with session().log("full information integration"): + tokens = self.layers.tokenization.tokenize(example['text'], return_one=True) result = knowledge_evaluation.integrate_information(self.knowledge, { "elements": tokens, "decomposition": decomposition, @@ -105,7 +58,7 @@ class KnowledgeBase(object): # Reduce values with session().log("reprocessing"): - self.trained = parsing.reprocess_language_knowledge(self, self.examples) + self.layers.reprocess(self.examples) knowledge_after = copy.deepcopy(self.knowledge) knowledge_diff_getter = lambda: diff_knowledge(knowledge_before, @@ -113,19 +66,6 @@ class KnowledgeBase(object): return knowledge_diff_getter - def tokenize(self, row, return_one=True): - row = row.lower() - with session().log("Tokenize: {}".format(row)): - options = list(tokenization.to_tokens(self, row)) - session().log("Results:\n{}".format('\n'.join(map(str, options)))) - - if return_one: - chosen = tokenization.pick_one_tokenization(options, self) - session().log("Chosen: “{}”".format(chosen)) - self.train_tokenizer({'text': row, 'tokens': chosen}) - return chosen - return options - def process(self, row): knowledge_before = copy.deepcopy(self.knowledge) with session().log("Process: {}".format(row)): diff --git a/naive-nlu/tree_nlu/layered_model.py b/naive-nlu/tree_nlu/layered_model.py new file mode 100644 index 0000000..9ecc242 --- /dev/null +++ b/naive-nlu/tree_nlu/layered_model.py @@ -0,0 +1,47 @@ +from .layers import tokenization_layer +from .layers import parsing_layer + + +def make_yield_pipe(layers, knowledge_base, example): + if len(layers) < 1: + yield example + return + + input_generator = make_yield_pipe(layers[:-1], knowledge_base, example) + for input in input_generator: + print("-->", input) + for d in list(layers[-1].integrate(knowledge_base, input)): + yield d + + +class BaseModel: + def __init__(self, knowledge_base): + self.tokenization = tokenization_layer.TokenizationLayer(knowledge_base) + self.parsing = parsing_layer.ParsingLayer() + + self.layers = [ + self.tokenization, + self.parsing, + ] + + def reprocess(self, examples): + for example in examples: + self._reprocess_single(example) + + def _reprocess_single(self, example): + return + pattern_examples = [] + for i, sample in enumerate(examples): + other = examples[:i] + examples[i + 1:] + match = get_matching(sample, other) + if len(match) > 0: + sample = (match, sample[1],) + pattern_examples.append(sample) + + return pattern_examples + + def integrate(self, knowledge_base, example): + yield from make_yield_pipe(self.layers, knowledge_base, example) + + def tokenize(self, row, return_one=True): + return self.tokenization.to_tokens(row) diff --git a/naive-nlu/tree_nlu/parsing.py b/naive-nlu/tree_nlu/layers/parsing.py similarity index 95% rename from naive-nlu/tree_nlu/parsing.py rename to naive-nlu/tree_nlu/layers/parsing.py index f22a4ce..7073a3a 100644 --- a/naive-nlu/tree_nlu/parsing.py +++ b/naive-nlu/tree_nlu/layers/parsing.py @@ -1,18 +1,14 @@ #!/usr/bin/env python -from . import knowledge_evaluation -from . import tokenization - -from . import depth_meter -from .session.org_mode import global_session as session +from ..session.org_mode import global_session as session import re import copy from functools import reduce from typing import List, Dict -from .modifiable_property import ModifiableProperty -from . import parameters -from .atoms import Atom, a, is_atom +from ..modifiable_property import ModifiableProperty +from .. import parameters +from ..atoms import Atom, a, is_atom def make_template(knowledge_base, tokens, parsed): matcher = list(tokens) @@ -83,8 +79,8 @@ def integrate_language(knowledge_base, example): text = example["text"].lower() parsed = example["parsed"] + tokens = example['tokens'] resolved_parsed = copy.deepcopy(parsed) - tokens = list(tokenization.pick_one_tokenization(tokenization.to_tokens(knowledge_base, text), knowledge_base)) while True: session().annotate("P: {}".format(resolved_parsed)) @@ -95,14 +91,14 @@ def integrate_language(knowledge_base, example): for position, atom in lower_levels: with session().log("Atom {}".format(atom)): + result = None similars = get_similar_tree(knowledge_base, atom, tokens) for similar in similars: result = build_remix_matrix(knowledge_base, tokens, atom, similar) if result is not None: break - if result is None: - raise Exception("No match found") + return remix, (start_bounds, end_bounds) = result after_remix = apply_remix(tokens[len(start_bounds):-len(end_bounds)], remix) @@ -147,7 +143,7 @@ def integrate_language(knowledge_base, example): session().annotate("M: {}".format(matcher)) session().annotate("R: {}".format(result)) session().annotate("---") - return tokens, matcher, result + yield tokens, matcher, result def apply_remix(tokens, remix): @@ -319,7 +315,7 @@ def get_similar_tree(knowledge_base, atom, tokens): sorted_possibilities = sorted(sorted_possibilities, key=lambda p: p[3] * 100 + p[4], reverse=True) if len(sorted_possibilities) < 1: - return None + return [] for i, possibility in enumerate(sorted_possibilities): similar_matcher, similar_result, similar_result_resolved, _atom_score, _token_score = possibility @@ -369,20 +365,6 @@ def get_matching(sample, other): return matching -def reprocess_language_knowledge(knowledge_base, examples): - examples = knowledge_base.examples + examples - - pattern_examples = [] - for i, sample in enumerate(examples): - other = examples[:i] + examples[i + 1:] - match = get_matching(sample, other) - if len(match) > 0: - sample = (match, sample[1],) - pattern_examples.append(sample) - - return pattern_examples - - def reverse_remix(tree_section, remix): result_section = [] offset = 0 diff --git a/naive-nlu/tree_nlu/layers/parsing_layer.py b/naive-nlu/tree_nlu/layers/parsing_layer.py new file mode 100644 index 0000000..13b865d --- /dev/null +++ b/naive-nlu/tree_nlu/layers/parsing_layer.py @@ -0,0 +1,11 @@ +from . import parsing + +class ParsingLayer: + def __init__(self): + pass + + def integrate(self, knowledge_base, example): + yield from parsing.integrate_language(knowledge_base, example) + + def train(self, knowledge_base, example): + assert False \ No newline at end of file diff --git a/naive-nlu/tree_nlu/tokenization.py b/naive-nlu/tree_nlu/layers/tokenization.py similarity index 98% rename from naive-nlu/tree_nlu/tokenization.py rename to naive-nlu/tree_nlu/layers/tokenization.py index 7322cb5..ec3f0a8 100644 --- a/naive-nlu/tree_nlu/tokenization.py +++ b/naive-nlu/tree_nlu/layers/tokenization.py @@ -1,5 +1,5 @@ -from .session.org_mode import global_session as session -from .atoms import Atom, a, is_atom +from ..session.org_mode import global_session as session +from ..atoms import Atom, a, is_atom def lookahead_for_tokens_or_strucutral_elements(knowledge_base, remaining): for se in knowledge_base.structural_elements: diff --git a/naive-nlu/tree_nlu/layers/tokenization_layer.py b/naive-nlu/tree_nlu/layers/tokenization_layer.py new file mode 100644 index 0000000..1271818 --- /dev/null +++ b/naive-nlu/tree_nlu/layers/tokenization_layer.py @@ -0,0 +1,84 @@ +from ..session.org_mode import global_session as session +from ..atoms import Atom +from . import tokenization +import random +import copy + +def randomized_weighted_list(elements): + # Randomized + randomized = list(elements) + random.shuffle(randomized) + + # And return only once + already_returned = set() + for e in randomized: + if e in already_returned: + continue + + yield e + already_returned.add(e) + +class TokenizationLayer: + def __init__(self, knowledge_base): + self.structural_elements = set() + self.token_chains = {} + self.tokens = set() + self.knowledge_base = knowledge_base + self.knowledge = knowledge_base.knowledge + + def integrate(self, knowledge_base, data): + assert knowledge_base is self.knowledge_base + + print(data) + assert 'text' in data + with session().log("Tokenize: {}".format(data['text'])): + for tokens in tokenization.to_tokens(self, data['text']): + data_with_row = copy.copy(data) + data_with_row['tokens'] = tokens + print(data_with_row) + yield data_with_row + + + def tokenize(self, row, return_one=True): + row = row.lower() + with session().log("Tokenize: {}".format(row)): + options = list(tokenization.to_tokens(self, row)) + session().log("Results:\n{}".format('\n'.join(map(str, options)))) + + if return_one: + chosen = tokenization.pick_one_tokenization(options, self) + session().log("Chosen: “{}”".format(chosen)) + self.train({'text': row, 'tokens': chosen}) + return chosen + return options + + ## Tokenization + def add_token_pair(self, precedent, consequent): + self.add_token(precedent) + self.add_token(consequent) + + if precedent not in self.token_chains: + self.token_chains[precedent] = [] + self.token_chains[precedent].append(consequent) + + def add_token(self, token): + self.tokens.add(token) + if (not isinstance(token, Atom)) and (token not in self.structural_elements): + session().annotate('Found new structural element “{}”'.format(token)) + self.structural_elements.add(token) + + def expected_token_after_precedent(self, precedent=None): + if precedent not in self.token_chains: # If there's no known precedent, just return all tokens + return randomized_weighted_list(self.tokens) + + return randomized_weighted_list(self.token_chains[precedent]) + + def train(self, example): + with session().log('Training tokenizer'): + session().annotate("Example: {}".format(example)) + tokens = tokenization.integrate_tokenization(self, example) + + # Integrate knowledge of concept + for token in tokens: + if not token in self.knowledge: + self.knowledge[token] = {} \ No newline at end of file diff --git a/naive-nlu/tree_nlu/tests/gac_100.py b/naive-nlu/tree_nlu/tests/gac_100.py index f4656fb..71469ac 100644 --- a/naive-nlu/tree_nlu/tests/gac_100.py +++ b/naive-nlu/tree_nlu/tests/gac_100.py @@ -99,14 +99,14 @@ examples = [ lambda knowledge: _assert('electricity' in knowledge.knowledge['computers']['performs-over']['use']) ),], }), - ('full_example', - { - "text": "The dominant language in france is french?", - "affirmation": "The dominant language in france is french", - "parsed": ("question", - ("property-has-value", "france", "dominant-language", "french")), - "answer": True, - }), + # ('full_example', + # { + # "text": "The dominant language in france is french?", + # "affirmation": "The dominant language in france is french", + # "parsed": ("question", + # ("property-has-value", "france", "dominant-language", "french")), + # "answer": True, + # }), # { # "text": "was abraham lincoln once president of the united states?", # "affirmation": "was abraham lincoln once president of the united states?", diff --git a/naive-nlu/tree_nlu/tests/tokenization.py b/naive-nlu/tree_nlu/tests/tokenization.py index 6b61fc4..9e32588 100644 --- a/naive-nlu/tree_nlu/tests/tokenization.py +++ b/naive-nlu/tree_nlu/tests/tokenization.py @@ -63,11 +63,11 @@ def main(): show_progbar(i, total, example['text']) if case_type == 'example': with session().log(example['text']): - knowledge.train_tokenizer(example) + knowledge.layers.tokenization.train(example) elif case_type == 'test': with session().log(example['text']): - tokens = list(knowledge.tokenize(example['text'])) + tokens = list(knowledge.layers.tokenization.tokenize(example['text'])) session().log('Expected “{}”, found “{}”' .format(example['tokens'], tokens)) diff --git a/naive-nlu/tree_nlu/utils/tokenization.py b/naive-nlu/tree_nlu/utils/tokenization.py index 4664923..f13c798 100644 --- a/naive-nlu/tree_nlu/utils/tokenization.py +++ b/naive-nlu/tree_nlu/utils/tokenization.py @@ -26,4 +26,4 @@ BASIC_TOKENIZATION_EXAMPLES = ( def train_basic_tokenization(knowledge_base): with session().log('Training basic tokenization'): for example in BASIC_TOKENIZATION_EXAMPLES: - knowledge_base.train_tokenizer(example) + knowledge_base.layers.tokenization.train(example)