diff --git a/naive-nlu/tree_nlu/atoms.py b/naive-nlu/tree_nlu/atoms.py
index a0028e5..d1de20a 100644
--- a/naive-nlu/tree_nlu/atoms.py
+++ b/naive-nlu/tree_nlu/atoms.py
@@ -8,6 +8,15 @@ from collections import namedtuple
 
 Atom = namedtuple('Atom', field_names='name')
 
+def is_atom(element, name=None):
+    '''Check if an element is an atom with a specific name.'''
+    if not isinstance(element, Atom):
+        return False
+
+    if name is None:
+        return True
+
+    return element.name == name
 
 def a(name):
     '''Build an atom with a given name.'''
diff --git a/naive-nlu/tree_nlu/knowledge_base.py b/naive-nlu/tree_nlu/knowledge_base.py
index 3e09ec6..f8cfa99 100644
--- a/naive-nlu/tree_nlu/knowledge_base.py
+++ b/naive-nlu/tree_nlu/knowledge_base.py
@@ -7,25 +7,69 @@ from .atoms import Atom
 from . import parsing
 from . import knowledge_evaluation
 from .modifiable_property import is_modifiable_property
-
+import random
 
 def diff_knowledge(before, after):
     import jsondiff
     return jsondiff.diff(before, after)
 
 
+def randomized_weighted_list(elements):
+    # Randomized
+    randomized = list(elements)
+    random.shuffle(randomized)
+
+    # And return only once
+    already_returned = set()
+    for e in randomized:
+        if e in already_returned:
+            continue
+
+        yield e
+        already_returned.add(e)
+
+
+
 class KnowledgeBase(object):
     def __init__(self, knowledge={}, examples=[], trained=[]):
         self.knowledge = copy.copy(knowledge)
         self.originals = []
         self.examples = copy.copy(examples)
         self.trained = copy.copy(trained)
-        self.tokenization = set()
         self.structural_elements = set()
+        self.token_chains = {}
+        self.tokens = set()
+
+    def add_token_pair(self, precedent, consequent):
+        self.add_token(precedent)
+        self.add_token(consequent)
+
+        if precedent not in self.token_chains:
+            self.token_chains[precedent] = []
+        self.token_chains[precedent].append(consequent)
+
+    def add_token(self, token):
+        self.tokens.add(token)
+        if (not isinstance(token, Atom)) and (token not in self.structural_elements):
+            session().annotate('Found new structural element “{}”'.format(token))
+            self.structural_elements.add(token)
+
+    def expected_token_after_precedent(self, precedent=None):
+        if precedent not in self.token_chains:  # If there's no known precedent, just return all tokens
+            return randomized_weighted_list(self.tokens)
+
+        return randomized_weighted_list(self.token_chains[precedent])
 
     def train_tokenizer(self, example):
-        with session().log('Train'):
-            parsing.integrate_tokenization(self, example)
+        with session().log('Training tokenizer'):
+            session().annotate("Example: {}".format(example))
+            tokens = parsing.integrate_tokenization(self, example)
+
+            # Integrate knowledge of concept
+            for token in tokens:
+                if token not in self.knowledge:
+                    self.knowledge[token] = {}
+
 
     def train(self, examples):
         knowledge_before = copy.deepcopy(self.knowledge)
@@ -80,14 +124,6 @@ class KnowledgeBase(object):
                 return chosen
         return options
 
-    def add_tokenization(self, tokenization):
-        with session().log('Added tokenization: “{}”'.format(tokenization)):
-            self.tokenization.add(tokenization)
-            for e in tokenization:
-                if (not isinstance(e, Atom)) and (e not in self.structural_elements):
-                    session().annotate('Found new structural element “{}”'.format(e))
-                    self.structural_elements.add(e)
-
     def process(self, row):
         knowledge_before = copy.deepcopy(self.knowledge)
         with session().log("Process: {}".format(row)):
diff --git a/naive-nlu/tree_nlu/parsing.py b/naive-nlu/tree_nlu/parsing.py
index 5683943..8f7613d 100644
--- a/naive-nlu/tree_nlu/parsing.py
+++ b/naive-nlu/tree_nlu/parsing.py
@@ -11,7 +11,7 @@ from functools import reduce
 from typing import List, Dict
 from .modifiable_property import ModifiableProperty
 from . import parameters
-from .atoms import Atom, a
+from .atoms import Atom, a, is_atom
 
 def lookahead_for_tokens_or_strucutral_elements(knowledge_base, remaining):
     for se in knowledge_base.structural_elements:
@@ -36,79 +36,84 @@ def lookahead_for_tokens_or_strucutral_elements(knowledge_base, remaining):
 
 
 
-def to_tokens(knowledge_base, text, acc=None):
-    # TODO This is an extra-naïve implementation
-    found = 0
+def to_tokens(knowledge_base, text, precedent=None):
+    if len(text) == 0:
+        session().annotate("No text remaining")
+        yield ['']
+        return
 
-    for tokenization in knowledge_base.tokenization:
-        with session().log("Tokenization {}".format(tokenization)):
-            remaining = text
-            possibility = []
+    with session().log("Tokenizing {}".format(text)):
+        for option in knowledge_base.expected_token_after_precedent(precedent):
+            with session().log("Next: “{}”".format(option)):
+                with session().log("Matching “{}” on “{}”".format(option, text)):
+                    for token_match in tokenization_match(option, text, knowledge_base):
+                        if token_match is None:
+                            session().annotate("No match")
 
-            # Apply tokenization to all elmenets
-            for i, token in enumerate(tokenization):
-                with session().log("T “{}” over “{}”".format(token, remaining)):
-                    if token == Atom('token'):
-                        for thing in knowledge_base.knowledge.keys():
-                            session().annotate("Testing with “{}”".format(thing))
-                            if remaining.startswith(thing):
-                                # TODO We should also branch here, probably :\
-                                remaining = remaining[len(thing):]
-                                possibility.append(thing)
-                        else:
-                            if i + 1 >= len(tokenization):  # Last element, lookahead for tokens/structural elements
-                                with session().log("Token not found, looking ahead for splits on “{}”".format(remaining)):
-                                    # If we start with remaining[0:] it's not a real lookahead
-                                    # ... and it can get us trapped on infinite recursion
-                                    splits = lookahead_for_tokens_or_strucutral_elements(knowledge_base, remaining[1:])
+                        match, remaining = token_match
+                        if len(remaining) == len(text):
+                            raise Exception('No text consumed in match')
 
-                                    if splits is None:
-                                        session().log("No splits found, keeping remaining as token “{}”".format(remaining))
+                        session().annotate('Match: “{}”'.format(match))
+                        with session().log('Remaining “{}”'.format(remaining)):
+                            for sublevel in to_tokens(knowledge_base, remaining, match):
+                                candidate = list(filter(lambda x: x != '', [match] + sublevel))
+                                session().annotate('Yielding candidate “{}”'.format(candidate))
+                                yield candidate
 
-                                        possibility.append(remaining)
-                                        remaining = ""
-                                    else:
-                                        # Consider we only have one possibility
-                                        assert len(splits) == 1
-
-                                        before_split, pivot, after_split = splits[0]
-                                        before_split = remaining[0] + before_split
-
-                                        session().log("1 split found, cutting on token “{}”, keeping “{}”".format(found, before_split))
-
-                                        possibility.append(before_split)
-                                        remaining = pivot + after_split
-
-                            else:  # Not las element, use the next one as cutter
-                                # Try with (HYPERSIMPLISTIC!) backtracking
-                                # Cut using the next token we should use more!!!
-                                next_token = tokenization[i + 1]
-                                session().annotate("Trying to cut for next token on “{}”".format(next_token))
-
-                                cutoff = remaining.find(next_token)
-                                if cutoff < 0:
-                                    break
-
-                                possibility.append(remaining[:cutoff])
-                                remaining = remaining[cutoff:]
-                    else:
-                        if remaining.find(token) < 0:  # Not inmediately after!
-                            break
-                        remaining = remaining[len(token):]
-                        session().annotate("OK, remaining: “{}” with {} items".format(remaining, len(tokenization) - (i + 1)))
+def tokenization_match(element, text, knowledge_base):
+    # Constant/structural string matching
+    if isinstance(element, str):
+        if text.find(element) == 0:
+            # This match comes from a structuring element
+            # It doesn't appear on the tokenization
+            # So we should return it as an empty string
+            yield ('', text[len(element):])
+            return
         else:
-            # Tokenization applicable
-            found += 1
-            if remaining == '':
-                session().log("Concluded possibility “{}”".format(possibility))
-                yield possibility
-            else:
-                with session().log("Continuing with “{}”".format(remaining)):
-                    for consecuent in to_tokens(knowledge_base, remaining, possibility):
-                        yield list(filter(lambda x: x != '', possibility + consecuent))
-    if found == 0:
-        raise Exception('No tokenization found')
+            # No match found
+            return
+
+    elif is_atom(element, 'token'):
+        yield from match_single_token(text, knowledge_base)
+        return
+    raise NotImplementedError()
+
+
+def match_single_token(text, knowledge_base):
+    found_token = False
+    for token in knowledge_base.knowledge.keys():
+        if text.find(token) == 0:
+            yield token, text[len(token):]
+            found_token = True
+
+    if found_token:
+        return
+
+    session().annotate('No token found at the start of “{}”'.format(text))
+    session().annotate('using structural elements to infer it')
+    # TODO: review this when multiple structural elements are available
+    for se in knowledge_base.structural_elements:
+        session().annotate('Looking for se “{}” in “{}”'.format(se, text))
+        position = text.find(se, 0)
+        found = position > 0  # 0 is not considered a valid position for this kind of split
+        if found:
+            session().annotate('Found “{}”, inferring “{}”'.format(se, text[:position]))
+            yield text[:position], text[position:]
+
+    session().annotate('No structural element or token found, inferring only token remaining')
+    yield text, ''
+
+    # Using other tokens for cutoff
+    for token in knowledge_base.knowledge.keys():
+        session().annotate('Looking for token “{}” in “{}”'.format(token, text))
+        position = text.find(token)
+        found = position >= 0
+        if found:
+            session().annotate('Found “{}”, in position “{}”'.format(token, position))
+            yield text[:position], text[position:]
+
 
 def integrate_tokenization(knowledge_base, example):
     text = example['text']
@@ -131,7 +136,7 @@ def integrate_token_to_text_matching(knowledge_base, text, tokens):
            if token in text:
                before, after = text.split(token, maxsplit=1)
                texts = (texts[:i] + [before]
-                        + [token_id]
+                        + [a('token')]
                         + [after] + texts[i + 1:])
                break
        else:
@@ -139,18 +144,16 @@
     # Remove leftovers from splits
     texts = list(filter(lambda x: x != '', texts))
+    session().log("Tokenized as {} over {}".format(texts, tokens))
 
-    for token_id, _token in enumerate(tokens):
-        # Find all elements between current token and next token
-        i = texts.index(token_id)
-        elements = [a('token')]
+    for i, element in enumerate(texts[:-1]):
+        learn_token_pair(element, texts[i + 1], knowledge_base)
 
-        i += 1
-        while i < len(texts) and not isinstance(texts[i], int):
-            elements.append(texts[i])
-            i += 1
+    return tokens
+
+def learn_token_pair(precedent, consequent, knowledge_base):
+    knowledge_base.add_token_pair(precedent, consequent)
 
-        knowledge_base.add_tokenization(tuple(elements))
 
 def pick_one_tokenization(options, knowledge_base):
     '''
@@ -158,26 +161,34 @@ def pick_one_tokenization(options, knowledge_base):
     Just pick the one with more results.
     '''
+    options = list(options)
 
     with session().log("Picking among: {} options\n{}".format(len(options), '\n'.join(map(str, options)))):
         return pick_by_score(options,
                              [
-                                 # First by number of splits
-                                 lambda tokenization: len(tokenization),
-
-                                 # Among them, by number of splits without structuring elements
+                                 # By number of splits without structuring elements
                                  lambda tokenization: sum(map(
-                                     lambda split: -sum(map(
+                                     lambda split: sum(map(
                                          lambda se: se in split,
                                          knowledge_base.structural_elements
-                                     )), tokenization))
+                                     )), tokenization)),
+
+                                 # By number of unknown tokens
+                                 lambda tokenization: len(list(filter(lambda token:
+                                                                      (token not in knowledge_base.knowledge.keys()) and
+                                                                      (token not in knowledge_base.structural_elements),
+                                                                      tokenization))),
+
+                                 # By number of splits
+                                 lambda tokenization: -len(tokenization),
                              ])
 
 
 def pick_by_score(options, heuristics):
     for heuristic in heuristics:
         assert(len(options) > 0)
         options = list(map(lambda opt: (heuristic(opt), opt), options))
-        sorted_options = sorted(options, key=lambda x: x[0], reverse=True)
+        sorted_options = sorted(options, key=lambda x: x[0], reverse=False)
         heuristic_cutoff = sorted_options[0][0]
+        session().annotate(sorted_options)
         pass_heuristic = [opt for (score, opt) in sorted_options if score <= heuristic_cutoff]
         options = pass_heuristic
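
The patch above replaces whole-pattern tokenizations with learned precedent -> consequent token pairs: training records which token follows which, and tokenization walks those chains depth-first before the candidates are scored. As a rough, self-contained illustration of that idea only (this is not code from the repository: TinyTokenChain, observe, expected_after and split are hypothetical stand-ins for add_token_pair, expected_token_after_precedent and to_tokens, with the logging session, structural-element inference and the pick_one_tokenization scoring left out), a minimal sketch could look like this:

import random


class TinyTokenChain:
    '''Standalone mirror of the token-pair bookkeeping added to KnowledgeBase.'''

    def __init__(self):
        self.tokens = set()   # every token ever observed
        self.chains = {}      # precedent -> list of observed consequents

    def observe(self, tokens):
        # Record each consecutive (precedent, consequent) pair, in the spirit
        # of integrate_token_to_text_matching() feeding add_token_pair().
        for precedent, consequent in zip(tokens, tokens[1:]):
            self.tokens.update((precedent, consequent))
            self.chains.setdefault(precedent, []).append(consequent)

    def expected_after(self, precedent=None):
        # With no known precedent, fall back to every known token,
        # mirroring expected_token_after_precedent().
        candidates = list(self.chains.get(precedent, self.tokens))
        random.shuffle(candidates)
        # Shuffle first, then keep only the first appearance of each value:
        # consequents observed more often tend to surface earlier, which is
        # the effect randomized_weighted_list() aims for.
        return list(dict.fromkeys(candidates))

    def split(self, text, precedent=None):
        # Depth-first search over the expected next tokens, like to_tokens().
        if not text:
            yield []
            return
        for option in self.expected_after(precedent):
            if option and text.startswith(option):
                for rest in self.split(text[len(option):], option):
                    yield [option] + rest


if __name__ == '__main__':
    chain = TinyTokenChain()
    chain.observe(['the', ' ', 'dog', ' ', 'barks'])
    chain.observe(['the', ' ', 'cat', ' ', 'meows'])

    # Candidates can only chain token pairs seen during training.
    for candidate in chain.split('the dog meows'):
        print(candidate)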