# lang-model/naive-nlu/tree_nlu/layers/tokenization.py

from ..session.org_mode import global_session as session
from ..atoms import Atom, a, is_atom

def lookahead_for_tokens_or_strucutral_elements(knowledge_base, remaining):
    """Locate the first structural element or known token inside `remaining`.

    Candidates are tried in this order: every entry of
    ``knowledge_base.structural_elements``, then every key of
    ``knowledge_base.knowledge``.  The search stops at the first candidate
    that occurs anywhere in `remaining`; each attempt is annotated on the
    session log.

    Returns a one-element list holding ``(before, found, after)``: the text
    preceding the hit, the candidate that was found and the text following
    it.  Returns ``None`` when no candidate occurs in `remaining`.
    """
    for structural in knowledge_base.structural_elements:
        index = remaining.find(structural)
        is_present = index != -1
        session().annotate('Looking for structure with “{}”, found? {}'.format(structural, is_present))
        if not is_present:
            continue

        head = remaining[:index]
        tail = remaining[index + len(structural):]
        return [(head, structural, tail)]

    for known in knowledge_base.knowledge.keys():
        index = remaining.find(known)
        is_present = index != -1
        session().annotate('Looking for token “{}”, found? {}'.format(known, is_present))
        if not is_present:
            continue

        head = remaining[:index]
        tail = remaining[index + len(known):]
        return [(head, known, tail)]

    return None


def to_tokens(knowledge_base, text, precedent=None):
    """Lazily enumerate candidate tokenizations of `text`.

    For every element that
    ``knowledge_base.expected_token_after_precedent(precedent)`` proposes,
    the element is matched against the start of `text` and the rest of the
    text is tokenized recursively, using the matched piece as the new
    precedent.

    Yields lists of token strings.  Empty strings (produced by structural
    elements and by the end-of-text base case) are filtered out of every
    candidate built here; only a direct call with empty `text` yields
    ``['']``.

    Raises:
        Exception: if a match leaves the remaining text as long as `text`,
            since recursing on it could not make progress.
    """
    if len(text) == 0:
        session().annotate("No text remaining")
        yield ['']
        return

    with session().log("Tokenizing {}".format(text)):
        for option in knowledge_base.expected_token_after_precedent(precedent):
            with session().log("Next: “{}”".format(option)):
                with session().log("Matching “{}” on “{}”".format(option, text)):
                    for token_match in tokenization_match(option, text, knowledge_base):
                        if token_match is None:
                            session().annotate("No match")
                            # Nothing to unpack for a failed match; move on
                            # instead of crashing on the tuple unpacking below.
                            continue

                        match, remaining = token_match
                        if len(remaining) == len(text):
                            raise Exception('No text consumed in match')

                        session().annotate('Match: “{}”'.format(match))
                        with session().log('Remaining “{}”'.format(remaining)):
                            for sublevel in to_tokens(knowledge_base, remaining, match):
                                # Drop the empty placeholders left by structural
                                # matches and by the end-of-text base case.
                                candidate = [piece for piece in [match] + sublevel if piece != '']
                                session().annotate('Yielding candidate “{}”'.format(candidate))
                                yield candidate


def tokenization_match(element, text, knowledge_base):
    """Yield ``(match, remaining)`` pairs for `element` at the start of `text`.

    * A plain string is treated as a constant/structural element: when `text`
      begins with it, a single pair is yielded whose match is the empty
      string (structural elements do not show up in the tokenization) and
      whose remainder is `text` without that prefix.  Otherwise nothing is
      yielded.
    * A ``'token'`` atom delegates to ``match_single_token``.

    Raises:
        NotImplementedError: for any other kind of element (raised when the
            generator is first advanced).
    """
    if isinstance(element, str):
        if text.startswith(element):
            yield ('', text[len(element):])
        return

    if not is_atom(element, 'token'):
        raise NotImplementedError()

    yield from match_single_token(text, knowledge_base)


def match_single_token(text, knowledge_base):
    """Yield ``(token, remaining)`` guesses for the token that starts `text`.

    Every key of ``knowledge_base.knowledge`` that `text` begins with is
    yielded first.  If at least one such key exists, nothing else is tried.

    Otherwise the token is inferred, in this order:

    1. for each structural element found at a position greater than 0, the
       text before it (the remainder keeps the structural element);
    2. the whole `text` as a single token with an empty remainder;
    3. for each known token found anywhere in `text`, the text before it
       (the remainder keeps that token).
    """
    prefix_hits = 0
    for known in knowledge_base.knowledge.keys():
        if text.find(known) != 0:
            continue
        yield known, text[len(known):]
        prefix_hits += 1

    if prefix_hits:
        return

    session().annotate('No token found at the start of ”{}”'.format(text))
    session().annotate('using structural elements to infer it')
    # TODO: review this when multiple structural elements are available
    for structural in knowledge_base.structural_elements:
        session().annotate('Looking for se “{}” in “{}”'.format(structural, text))
        cut = text.find(structural, 0)
        # A hit at position 0 would infer an empty token, so it is not a
        # valid split point here.
        if cut <= 0:
            continue
        session().annotate('Found ”{}”, inferring “{}”'.format(structural, text[:cut]))
        yield text[:cut], text[cut:]

    session().annotate('No structural element or token found, inferring only token remaining')
    yield text, ''

    # Fall back to cutting right before any other known token
    for known in knowledge_base.knowledge.keys():
        session().annotate('Looking for token “{}” in “{}”'.format(known, text))
        cut = text.find(known)
        if cut < 0:
            continue
        session().annotate('Found ”{}”, in position ”{}”'.format(known, cut))
        yield text[:cut], text[cut:]


def integrate_tokenization(knowledge_base, example):
    """Learn from one annotated tokenization example.

    `example` is indexed with the keys ``'text'`` and ``'tokens'``; both are
    forwarded to ``integrate_token_to_text_matching`` together with
    `knowledge_base`.

    Returns whatever ``integrate_token_to_text_matching`` returns.

    Raises:
        KeyError: if `example` is a dict lacking ``'text'`` or ``'tokens'``.
    """
    return integrate_token_to_text_matching(
        knowledge_base, example['text'], example['tokens'])


def integrate_token_to_text_matching(knowledge_base, text, tokens):
    """Align `tokens` with `text` and learn which elements follow which.

    Each token is looked up in the still-unmatched pieces of `text`; the
    first piece containing it is split around its first occurrence and the
    token is replaced by an ``a('token')`` placeholder.  After all tokens are
    placed, empty pieces are dropped and every pair of adjacent elements
    (literal text pieces and placeholders) is passed to ``learn_token_pair``.

    Args:
        knowledge_base: forwarded to ``learn_token_pair``.
        text: the raw text (expected to be a ``str``).
        tokens: iterable of token strings expected to occur in `text`.

    Returns:
        `tokens`, unchanged.

    Raises:
        Exception: if a token cannot be found in any unmatched text piece.
    """
    fragments = [text]

    # Replace each token occurrence with a placeholder
    for token in tokens:
        for index, fragment in enumerate(fragments):
            # Only raw text pieces can contain a token.  Placeholders that
            # were already inserted are not strings and must be skipped, not
            # searched or split.
            if not isinstance(fragment, str):
                continue

            if token in fragment:
                before, after = fragment.split(token, maxsplit=1)
                fragments = (fragments[:index]
                             + [before, a('token'), after]
                             + fragments[index + 1:])
                break
        else:
            raise Exception('Token not found')

    # Remove leftovers from splits
    fragments = [fragment for fragment in fragments if fragment != '']
    session().log("Tokenized as {} over {}".format(fragments, tokens))

    # Learn every (element, following element) adjacency
    for precedent, consequent in zip(fragments, fragments[1:]):
        learn_token_pair(precedent, consequent, knowledge_base)

    return tokens

def learn_token_pair(precedent, consequent, knowledge_base):
    """Record on `knowledge_base` that `consequent` follows `precedent`.

    Thin wrapper around ``knowledge_base.add_token_pair``; returns ``None``.
    """
    register_pair = knowledge_base.add_token_pair
    register_pair(precedent, consequent)


def pick_one_tokenization(options, knowledge_base):
    """Heuristically pick the most probable tokenization among `options`.

    `options` is materialized into a list and handed to ``pick_by_score``
    with three scoring functions, applied in order (``pick_by_score`` keeps
    the lowest-scoring candidates at each step):

    1. how many times structural elements appear inside the splits;
    2. how many tokens are neither known tokens nor structural elements;
    3. the negated number of splits, so more splits are preferred.
    """
    def structural_hits(tokenization):
        # Occurrences of structuring elements inside the splits
        return sum(
            sum(se in split for se in knowledge_base.structural_elements)
            for split in tokenization
        )

    def unknown_tokens(tokenization):
        # Tokens that are neither known tokens nor structural elements
        return len([
            token for token in tokenization
            if not (token in knowledge_base.knowledge.keys()
                    or token in knowledge_base.structural_elements)
        ])

    def negated_split_count(tokenization):
        # Negated so that a higher number of splits yields a lower score
        return -len(tokenization)

    options = list(options)
    with session().log("Picking among: {} options".format(len(options))):
        session().log("Options: \n{}".format('\n'.join(map(str, options))))
        return pick_by_score(
            options,
            [structural_hits, unknown_tokens, negated_split_count],
        )

def pick_by_score(options, heuristics):
    """Narrow `options` down with successive heuristics and return the winner.

    Each heuristic maps an option to a score.  After scoring, only the
    options sharing the lowest score survive to the next heuristic; ties
    keep their relative order.  The first survivor after the last heuristic
    is returned.

    An ``assert`` checks that `options` is non-empty before each heuristic
    is applied.
    """
    for score_of in heuristics:
        assert len(options) > 0
        scored = [(score_of(candidate), candidate) for candidate in options]
        ranked = sorted(scored, key=lambda pair: pair[0])

        lowest_score = ranked[0][0]
        session().annotate(ranked)
        options = [candidate for (score, candidate) in ranked if score <= lowest_score]

    session().log("{} finalists: \n{}".format(len(options), '\n'.join(map(str, options))))
    return options[0]
Exploration of layers for tokenization and parsing. 2018-04-23 22:48:10 +02:00			`from ..session.org_mode import global_session as session`
			`from ..atoms import Atom, a, is_atom`
Separate tokenization module. 2018-04-15 22:15:28 +02:00
			`def lookahead_for_tokens_or_strucutral_elements(knowledge_base, remaining):`
			`for se in knowledge_base.structural_elements:`
			`found_position = remaining.find(se)`
			`found = found_position >= 0`
			`session().annotate('Looking for structure with “{}”, found? {}'.format(se, found))`
			`if found:`
			`return [`
			`(remaining[:found_position], se, remaining[found_position + len(se):])`
			`]`

			`for token in knowledge_base.knowledge.keys():`
			`found_position = remaining.find(token)`
			`found = found_position >= 0`
			`session().annotate('Looking for token “{}”, found? {}'.format(token, found))`
			`if found:`
			`return [`
			`(remaining[:found_position], token, remaining[found_position + len(token):])`
			`]`

			`return None`



			`def to_tokens(knowledge_base, text, precedent=None):`
			`if len(text) == 0:`
			`session().annotate("No text remaining")`
			`yield ['']`
			`return`

			`with session().log("Tokenizing {}".format(text)):`
			`for option in knowledge_base.expected_token_after_precedent(precedent):`
			`with session().log("Next: “{}”".format(option)):`
			`with session().log("Matching “{}” on “{}”".format(option, text)):`
			`for token_match in tokenization_match(option, text, knowledge_base):`
			`if token_match is None:`
			`session().annotate("No match")`

			`match, remaining = token_match`
			`if len(remaining) == len(text):`
			`raise Exception('No text consumed in match')`

			`session().annotate('Match: “{}”'.format(match))`
			`with session().log('Remaining “{}”'.format(remaining)):`
			`for sublevel in to_tokens(knowledge_base, remaining, match):`
			`candidate = list(filter(lambda x: x != '', [match] + sublevel))`
			`session().annotate('Yielding candidate “{}”'.format(candidate))`
			`yield candidate`


			`def tokenization_match(element, text, knowledge_base):`
			`# Constant/structural string matching`
			`if isinstance(element, str):`
			`if text.find(element) == 0:`
			`# This match comes from a structuring element`
			`# It doesn't appear on the tokenization`
			`# So we should return it as an empty string`
			`yield ('', text[len(element):])`
			`return`
			`else:`
			`# No match found`
			`return`

			`elif is_atom(element, 'token'):`
			`yield from match_single_token(text, knowledge_base)`
			`return`
			`raise NotImplementedError()`


			`def match_single_token(text, knowledge_base):`
			`found_token = False`
			`for token in knowledge_base.knowledge.keys():`
			`if text.find(token) == 0:`
			`yield token, text[len(token):]`
			`found_token = True`

			`if found_token:`
			`return`

			`session().annotate('No token found at the start of ”{}”'.format(text))`
			`session().annotate('using structural elements to infer it')`
			`# TODO: review this when multiple structural elements are available`
			`for se in knowledge_base.structural_elements:`
			`session().annotate('Looking for se “{}” in “{}”'.format(se, text))`
			`position = text.find(se, 0)`
			`found = position > 0 # 0 is not considered a valid position for this kind of split`
			`if found:`
			`session().annotate('Found ”{}”, inferring “{}”'.format(se, text[:position]))`
			`yield text[:position], text[position:]`

			`session().annotate('No structural element or token found, inferring only token remaining')`
			`yield text, ''`

			`# Using other tokens for cutoff`
			`for token in knowledge_base.knowledge.keys():`
			`session().annotate('Looking for token “{}” in “{}”'.format(token, text))`
			`position = text.find(token)`
			`found = position >= 0`
			`if found:`
			`session().annotate('Found ”{}”, in position ”{}”'.format(token, position))`
			`yield text[:position], text[position:]`


			`def integrate_tokenization(knowledge_base, example):`
			`text = example['text']`
			`tokens = example['tokens']`
			`meaning = example.get('meaning')`

			`return integrate_token_to_text_matching(knowledge_base, text, tokens)`


			`def integrate_token_to_text_matching(knowledge_base, text, tokens):`
			`texts = [text]`

			`# Convert to tokens`
			`for token_id, token in enumerate(tokens):`
			`# Look for token in texts`
			`for i, text in enumerate(texts):`
			`if isinstance(text, int):`
			`continue`

			`if token in text:`
			`before, after = text.split(token, maxsplit=1)`
			`texts = (texts[:i] + [before]`
			`+ [a('token')]`
			`+ [after] + texts[i + 1:])`
			`break`
			`else:`
			`raise Exception('Token not found')`

			`# Remove leftovers from splits`
			`texts = list(filter(lambda x: x != '', texts))`
			`session().log("Tokenized as {} over {}".format(texts, tokens))`

			`for i, element in enumerate(texts[:-1]):`
			`learn_token_pair(element, texts[i + 1], knowledge_base)`

			`return tokens`

			`def learn_token_pair(precedent, consequent, knowledge_base):`
			`knowledge_base.add_token_pair(precedent, consequent)`


			`def pick_one_tokenization(options, knowledge_base):`
			`'''`
			`Heuristic function to pick the most probable tokenization.`

			`Just pick the one with more results.`
			`'''`
			`options = list(options)`
			`with session().log("Picking among: {} options".format(len(options))):`
			`session().log("Options: \n{}".format('\n'.join(map(str, options))))`
			`return pick_by_score(options,`
			`[`
			`# By number of splits without structuring elements`
			`lambda tokenization: sum(map(`
			`lambda split: sum(map(`
			`lambda se: se in split, knowledge_base.structural_elements`
			`)), tokenization)),`

			`# By number of unknown tokens`
			`lambda tokenization: len(list(filter(lambda token:`
			`(token not in knowledge_base.knowledge.keys()) and`
			`(token not in knowledge_base.structural_elements),`
			`tokenization))),`

			`# By number of splits`
			`lambda tokenization: -len(tokenization),`
			`])`

			`def pick_by_score(options, heuristics):`
			`for heuristic in heuristics:`
			`assert(len(options) > 0)`
			`options = list(map(lambda opt: (heuristic(opt), opt), options))`
			`sorted_options = sorted(options, key=lambda x: x[0], reverse=False)`

			`heuristic_cutoff = sorted_options[0][0]`
			`session().annotate(sorted_options)`
			`pass_heuristic = [opt for (score, opt) in sorted_options if score <= heuristic_cutoff]`
			`options = pass_heuristic`

			`session().log("{} finalists: \n{}".format(len(options), '\n'.join(map(str, options))))`
			`return options[0]`