diff --git a/naive-nlu/tree_nlu/knowledge_base.py b/naive-nlu/tree_nlu/knowledge_base.py index 830a6f3..b34efe7 100644 --- a/naive-nlu/tree_nlu/knowledge_base.py +++ b/naive-nlu/tree_nlu/knowledge_base.py @@ -3,6 +3,7 @@ import logging from .session.org_mode import global_session as session +from .atoms import Atom from . import parsing from . import knowledge_evaluation from .modifiable_property import is_modifiable_property @@ -20,6 +21,7 @@ class KnowledgeBase(object): self.examples = copy.copy(examples) self.trained = copy.copy(trained) self.tokenization = set() + self.structural_elements = set() def train_tokenizer(self, example): with session().log('Train'): @@ -74,6 +76,14 @@ class KnowledgeBase(object): return parsing.pick_one_tokenization(options) return options + def add_tokenization(self, tokenization): + with session().log('Added tokenization: “{}”'.format(tokenization)): + self.tokenization.add(tokenization) + for e in tokenization: + if (not isinstance(e, Atom)) and (e not in self.structural_elements): + session().annotate('Found new structural element “{}”'.format(e)) + self.structural_elements.add(e) + def process(self, row): knowledge_before = copy.deepcopy(self.knowledge) with session().log("Process: {}".format(row)): diff --git a/naive-nlu/tree_nlu/parsing.py b/naive-nlu/tree_nlu/parsing.py index 6cae405..198bda2 100644 --- a/naive-nlu/tree_nlu/parsing.py +++ b/naive-nlu/tree_nlu/parsing.py @@ -99,7 +99,7 @@ def integrate_token_to_text_matching(knowledge_base, text, tokens): elements.append(texts[i]) i += 1 - knowledge_base.tokenization.add(tuple(elements)) + knowledge_base.add_tokenization(tuple(elements)) def pick_one_tokenization(options): '''