Exploration of layers for tokenization and parsing.

commit a444766c7c
parent c18c9b8cb1
.gitignore (vendored, 1 line changed)
@@ -1,5 +1,6 @@
 *#*
 *~
+.vscode
 *.ba?k
 *.pyc
 __pycache__
@@ -4,8 +4,7 @@ import logging
 from .session.org_mode import global_session as session

 from .atoms import Atom
-from . import parsing
-from . import tokenization
+from . import layered_model
 from . import knowledge_evaluation
 from .modifiable_property import is_modifiable_property
 import random
@@ -15,21 +14,6 @@ def diff_knowledge(before, after):
     return jsondiff.diff(before, after)


-def randomized_weighted_list(elements):
-    # Randomized
-    randomized = list(elements)
-    random.shuffle(randomized)
-
-    # And return only once
-    already_returned = set()
-    for e in randomized:
-        if e in already_returned:
-            continue
-
-        yield e
-        already_returned.add(e)
-
-

 class KnowledgeBase(object):
     def __init__(self, knowledge={}, examples=[], trained=[]):
@@ -37,41 +21,9 @@ class KnowledgeBase(object):
         self.originals = []
         self.examples = copy.copy(examples)
         self.trained = copy.copy(trained)
-        self.structural_elements = set()
-        self.token_chains = {}
-        self.tokens = set()
-
-    def add_token_pair(self, precedent, consequent):
-        self.add_token(precedent)
-        self.add_token(consequent)
-
-        if precedent not in self.token_chains:
-            self.token_chains[precedent] = []
-        self.token_chains[precedent].append(consequent)
-
-    def add_token(self, token):
-        self.tokens.add(token)
-        if (not isinstance(token, Atom)) and (token not in self.structural_elements):
-            session().annotate('Found new structural element “{}”'.format(token))
-            self.structural_elements.add(token)
-
-    def expected_token_after_precedent(self, precedent=None):
-        if precedent not in self.token_chains:  # If there's no known precedent, just return all tokens
-            return randomized_weighted_list(self.tokens)
-
-        return randomized_weighted_list(self.token_chains[precedent])
-
-    def train_tokenizer(self, example):
-        with session().log('Training tokenizer'):
-            session().annotate("Example: {}".format(example))
-            tokens = tokenization.integrate_tokenization(self, example)
-
-            # Integrate knowledge of concept
-            for token in tokens:
-                if not token in self.knowledge:
-                    self.knowledge[token] = {}
-
+        self.layers = layered_model.BaseModel(self)

+    ## Parsing
     def train(self, examples):
         knowledge_before = copy.deepcopy(self.knowledge)
         with session().log('Train'):
@@ -86,11 +38,12 @@ class KnowledgeBase(object):
                     self.act_upon(result)

                 with session().log("language integration"):
-                    tokens, decomposition, inferred_tree = parsing.integrate_language(self, example)
+                    for tokens, decomposition, inferred_tree in self.layers.integrate(self, example):
                     session().annotate("Tokens: {}".format(tokens))
                     session().annotate("Inferred tree: {}".format(inferred_tree))

                 with session().log("full information integration"):
+                    tokens = self.layers.tokenization.tokenize(example['text'], return_one=True)
                     result = knowledge_evaluation.integrate_information(self.knowledge, {
                         "elements": tokens,
                         "decomposition": decomposition,
@@ -105,7 +58,7 @@ class KnowledgeBase(object):

                 # Reduce values
                 with session().log("reprocessing"):
-                    self.trained = parsing.reprocess_language_knowledge(self, self.examples)
+                    self.layers.reprocess(self.examples)

                 knowledge_after = copy.deepcopy(self.knowledge)
                 knowledge_diff_getter = lambda: diff_knowledge(knowledge_before,
@@ -113,19 +66,6 @@ class KnowledgeBase(object):

         return knowledge_diff_getter

-    def tokenize(self, row, return_one=True):
-        row = row.lower()
-        with session().log("Tokenize: {}".format(row)):
-            options = list(tokenization.to_tokens(self, row))
-            session().log("Results:\n{}".format('\n'.join(map(str, options))))
-
-            if return_one:
-                chosen = tokenization.pick_one_tokenization(options, self)
-                session().log("Chosen: “{}”".format(chosen))
-                self.train_tokenizer({'text': row, 'tokens': chosen})
-                return chosen
-            return options
-
     def process(self, row):
         knowledge_before = copy.deepcopy(self.knowledge)
         with session().log("Process: {}".format(row)):
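For orientation, an illustrative sketch, not part of the commit, of what the generator-based integration gives train(): several (tokens, decomposition, inferred_tree) candidates per example instead of a single return value. FakeLayers is a hypothetical stand-in for layered_model.BaseModel.

# Illustrative sketch only; FakeLayers is a hypothetical stand-in for
# layered_model.BaseModel, yielding one candidate per tokenization.
class FakeLayers:
    def integrate(self, knowledge_base, example):
        for tokens in (["the", "sky", "is", "blue"], ["the sky", "is", "blue"]):
            yield tokens, ["decomposition"], ("inferred", "tree")

layers = FakeLayers()
for tokens, decomposition, inferred_tree in layers.integrate(None, {"text": "the sky is blue"}):
    print(tokens, inferred_tree)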
naive-nlu/tree_nlu/layered_model.py (new file, 47 lines)
@@ -0,0 +1,47 @@
+from .layers import tokenization_layer
+from .layers import parsing_layer
+
+
+def make_yield_pipe(layers, knowledge_base, example):
+    if len(layers) < 1:
+        yield example
+        return
+
+    input_generator = make_yield_pipe(layers[:-1], knowledge_base, example)
+    for input in input_generator:
+        print("-->", input)
+        for d in list(layers[-1].integrate(knowledge_base, input)):
+            yield d
+
+
+class BaseModel:
+    def __init__(self, knowledge_base):
+        self.tokenization = tokenization_layer.TokenizationLayer(knowledge_base)
+        self.parsing = parsing_layer.ParsingLayer()
+
+        self.layers = [
+            self.tokenization,
+            self.parsing,
+        ]
+
+    def reprocess(self, examples):
+        for example in examples:
+            self._reprocess_single(example)
+
+    def _reprocess_single(self, example):
+        return
+        pattern_examples = []
+        for i, sample in enumerate(examples):
+            other = examples[:i] + examples[i + 1:]
+            match = get_matching(sample, other)
+            if len(match) > 0:
+                sample = (match, sample[1],)
+            pattern_examples.append(sample)
+
+        return pattern_examples
+
+    def integrate(self, knowledge_base, example):
+        yield from make_yield_pipe(self.layers, knowledge_base, example)
+
+    def tokenize(self, row, return_one=True):
+        return self.tokenization.to_tokens(row)
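make_yield_pipe above chains the layers recursively: the candidates produced by the earlier layers feed the last layer, and every layer may fan one input out into several candidates. A minimal, self-contained sketch of that pattern with dummy layers (not the commit's classes):

# Dummy illustration of the yield-pipe pattern: each layer's integrate()
# may yield several candidates, and candidates multiply across layers.
def make_yield_pipe(layers, knowledge_base, example):
    if len(layers) < 1:
        yield example
        return
    for intermediate in make_yield_pipe(layers[:-1], knowledge_base, example):
        yield from layers[-1].integrate(knowledge_base, intermediate)

class Doubler:
    def integrate(self, knowledge_base, value):
        yield value
        yield value * 2

print(list(make_yield_pipe([Doubler(), Doubler()], None, 1)))
# [1, 2, 2, 4]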
@@ -1,18 +1,14 @@
 #!/usr/bin/env python

-from . import knowledge_evaluation
-from . import tokenization
-
-from . import depth_meter
-from .session.org_mode import global_session as session
+from ..session.org_mode import global_session as session
 import re
 import copy

 from functools import reduce
 from typing import List, Dict
-from .modifiable_property import ModifiableProperty
-from . import parameters
-from .atoms import Atom, a, is_atom
+from ..modifiable_property import ModifiableProperty
+from .. import parameters
+from ..atoms import Atom, a, is_atom

 def make_template(knowledge_base, tokens, parsed):
     matcher = list(tokens)
@@ -83,8 +79,8 @@ def integrate_language(knowledge_base, example):
     text = example["text"].lower()
     parsed = example["parsed"]

+    tokens = example['tokens']
     resolved_parsed = copy.deepcopy(parsed)
-    tokens = list(tokenization.pick_one_tokenization(tokenization.to_tokens(knowledge_base, text), knowledge_base))

     while True:
         session().annotate("P: {}".format(resolved_parsed))
@@ -95,14 +91,14 @@ def integrate_language(knowledge_base, example):

         for position, atom in lower_levels:
             with session().log("Atom {}".format(atom)):
+                result = None
                 similars = get_similar_tree(knowledge_base, atom, tokens)
                 for similar in similars:
                     result = build_remix_matrix(knowledge_base, tokens, atom, similar)
                     if result is not None:
                         break

-                if result is None:
-                    raise Exception("No match found")
+                return
                 remix, (start_bounds, end_bounds) = result

                 after_remix = apply_remix(tokens[len(start_bounds):-len(end_bounds)], remix)
@@ -147,7 +143,7 @@ def integrate_language(knowledge_base, example):
     session().annotate("M: {}".format(matcher))
     session().annotate("R: {}".format(result))
     session().annotate("---")
-    return tokens, matcher, result
+    yield tokens, matcher, result


 def apply_remix(tokens, remix):
@@ -319,7 +315,7 @@ def get_similar_tree(knowledge_base, atom, tokens):

     sorted_possibilities = sorted(sorted_possibilities, key=lambda p: p[3] * 100 + p[4], reverse=True)
     if len(sorted_possibilities) < 1:
-        return None
+        return []

     for i, possibility in enumerate(sorted_possibilities):
         similar_matcher, similar_result, similar_result_resolved, _atom_score, _token_score = possibility
@@ -369,20 +365,6 @@ def get_matching(sample, other):
     return matching


-def reprocess_language_knowledge(knowledge_base, examples):
-    examples = knowledge_base.examples + examples
-
-    pattern_examples = []
-    for i, sample in enumerate(examples):
-        other = examples[:i] + examples[i + 1:]
-        match = get_matching(sample, other)
-        if len(match) > 0:
-            sample = (match, sample[1],)
-        pattern_examples.append(sample)
-
-    return pattern_examples
-
-
 def reverse_remix(tree_section, remix):
     result_section = []
     offset = 0
naive-nlu/tree_nlu/layers/parsing_layer.py (new file, 11 lines)
@@ -0,0 +1,11 @@
+from . import parsing
+
+class ParsingLayer:
+    def __init__(self):
+        pass
+
+    def integrate(self, knowledge_base, example):
+        yield from parsing.integrate_language(knowledge_base, example)
+
+    def train(self, knowledge_base, example):
+        assert False
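ParsingLayer and TokenizationLayer share an implicit contract: each exposes integrate(knowledge_base, example) as a generator, which is all make_yield_pipe relies on. Written out explicitly as a hedged sketch (the commit itself leaves the protocol implicit):

# Hypothetical, explicit version of the layer protocol implied by the commit.
from typing import Any, Iterator, Protocol

class Layer(Protocol):
    def integrate(self, knowledge_base: Any, example: Any) -> Iterator[Any]:
        """Yield zero or more enriched variants of the incoming example."""
        ...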
@@ -1,5 +1,5 @@
-from .session.org_mode import global_session as session
-from .atoms import Atom, a, is_atom
+from ..session.org_mode import global_session as session
+from ..atoms import Atom, a, is_atom

 def lookahead_for_tokens_or_strucutral_elements(knowledge_base, remaining):
     for se in knowledge_base.structural_elements:
naive-nlu/tree_nlu/layers/tokenization_layer.py (new file, 84 lines)
@@ -0,0 +1,84 @@
+from ..session.org_mode import global_session as session
+from ..atoms import Atom
+from . import tokenization
+import random
+import copy
+
+def randomized_weighted_list(elements):
+    # Randomized
+    randomized = list(elements)
+    random.shuffle(randomized)
+
+    # And return only once
+    already_returned = set()
+    for e in randomized:
+        if e in already_returned:
+            continue
+
+        yield e
+        already_returned.add(e)
+
+class TokenizationLayer:
+    def __init__(self, knowledge_base):
+        self.structural_elements = set()
+        self.token_chains = {}
+        self.tokens = set()
+        self.knowledge_base = knowledge_base
+        self.knowledge = knowledge_base.knowledge
+
+    def integrate(self, knowledge_base, data):
+        assert knowledge_base is self.knowledge_base
+
+        print(data)
+        assert 'text' in data
+        with session().log("Tokenize: {}".format(data['text'])):
+            for tokens in tokenization.to_tokens(self, data['text']):
+                data_with_row = copy.copy(data)
+                data_with_row['tokens'] = tokens
+                print(data_with_row)
+                yield data_with_row
+
+
+    def tokenize(self, row, return_one=True):
+        row = row.lower()
+        with session().log("Tokenize: {}".format(row)):
+            options = list(tokenization.to_tokens(self, row))
+            session().log("Results:\n{}".format('\n'.join(map(str, options))))
+
+            if return_one:
+                chosen = tokenization.pick_one_tokenization(options, self)
+                session().log("Chosen: “{}”".format(chosen))
+                self.train({'text': row, 'tokens': chosen})
+                return chosen
+            return options
+
+    ## Tokenization
+    def add_token_pair(self, precedent, consequent):
+        self.add_token(precedent)
+        self.add_token(consequent)
+
+        if precedent not in self.token_chains:
+            self.token_chains[precedent] = []
+        self.token_chains[precedent].append(consequent)
+
+    def add_token(self, token):
+        self.tokens.add(token)
+        if (not isinstance(token, Atom)) and (token not in self.structural_elements):
+            session().annotate('Found new structural element “{}”'.format(token))
+            self.structural_elements.add(token)
+
+    def expected_token_after_precedent(self, precedent=None):
+        if precedent not in self.token_chains:  # If there's no known precedent, just return all tokens
+            return randomized_weighted_list(self.tokens)
+
+        return randomized_weighted_list(self.token_chains[precedent])
+
+    def train(self, example):
+        with session().log('Training tokenizer'):
+            session().annotate("Example: {}".format(example))
+            tokens = tokenization.integrate_tokenization(self, example)
+
+            # Integrate knowledge of concept
+            for token in tokens:
+                if not token in self.knowledge:
+                    self.knowledge[token] = {}
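TokenizationLayer.integrate fans one example out into one copy per candidate tokenization, so downstream layers receive a complete example dict each time. A standalone sketch of just that fan-out step, with the real tokenizer call replaced by a hypothetical candidate list:

# Standalone sketch of the fan-out in TokenizationLayer.integrate; the call to
# tokenization.to_tokens() is replaced here by a hard-coded candidate list.
import copy

def fan_out(data, candidate_tokenizations):
    for tokens in candidate_tokenizations:
        data_with_tokens = copy.copy(data)
        data_with_tokens['tokens'] = tokens
        yield data_with_tokens

example = {'text': 'the sky is blue'}
for enriched in fan_out(example, [['the', 'sky', 'is', 'blue'], ['the sky', 'is', 'blue']]):
    print(enriched)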
@@ -99,14 +99,14 @@ examples = [
              lambda knowledge: _assert('electricity' in knowledge.knowledge['computers']['performs-over']['use'])
          ),],
     }),
-    ('full_example',
-     {
-         "text": "The dominant language in france is french?",
-         "affirmation": "The dominant language in france is french",
-         "parsed": ("question",
-                    ("property-has-value", "france", "dominant-language", "french")),
-         "answer": True,
-     }),
+    # ('full_example',
+    #  {
+    #      "text": "The dominant language in france is french?",
+    #      "affirmation": "The dominant language in france is french",
+    #      "parsed": ("question",
+    #                 ("property-has-value", "france", "dominant-language", "french")),
+    #      "answer": True,
+    #  }),
     # {
     #     "text": "was abraham lincoln once president of the united states?",
     #     "affirmation": "was abraham lincoln once president of the united states?",
@@ -63,11 +63,11 @@ def main():
         show_progbar(i, total, example['text'])
         if case_type == 'example':
             with session().log(example['text']):
-                knowledge.train_tokenizer(example)
+                knowledge.layers.tokenization.train(example)

         elif case_type == 'test':
             with session().log(example['text']):
-                tokens = list(knowledge.tokenize(example['text']))
+                tokens = list(knowledge.layers.tokenization.tokenize(example['text']))

                 session().log('Expected “{}”, found “{}”'
                               .format(example['tokens'], tokens))
@@ -26,4 +26,4 @@ BASIC_TOKENIZATION_EXAMPLES = (
 def train_basic_tokenization(knowledge_base):
     with session().log('Training basic tokenization'):
         for example in BASIC_TOKENIZATION_EXAMPLES:
-            knowledge_base.train_tokenizer(example)
+            knowledge_base.layers.tokenization.train(example)