from ..session.org_mode import global_session as session
from ..atoms import Atom, a, is_atom


def lookahead_for_tokens_or_structural_elements(knowledge_base, remaining):
    for se in knowledge_base.structural_elements:
        found_position = remaining.find(se)
        found = found_position >= 0
        session().annotate('Looking for structure with “{}”, found? {}'.format(se, found))
        if found:
            return [
                (remaining[:found_position], se, remaining[found_position + len(se):])
            ]

    for token in knowledge_base.knowledge.keys():
        found_position = remaining.find(token)
        found = found_position >= 0
        session().annotate('Looking for token “{}”, found? {}'.format(token, found))
        if found:
            return [
                (remaining[:found_position], token, remaining[found_position + len(token):])
            ]

    return None


def to_tokens(knowledge_base, text, precedent=None):
    if len(text) == 0:
        session().annotate("No text remaining")
        yield ['']
        return

    with session().log("Tokenizing {}".format(text)):
        for option in knowledge_base.expected_token_after_precedent(precedent):
            with session().log("Next: “{}”".format(option)):
                with session().log("Matching “{}” on “{}”".format(option, text)):
                    for token_match in tokenization_match(option, text, knowledge_base):
                        if token_match is None:
                            # No match for this option; try the next one
                            session().annotate("No match")
                            continue

                        match, remaining = token_match
                        if len(remaining) == len(text):
                            raise Exception('No text consumed in match')

                        session().annotate('Match: “{}”'.format(match))
                        with session().log('Remaining “{}”'.format(remaining)):
                            for sublevel in to_tokens(knowledge_base, remaining, match):
                                candidate = list(filter(lambda x: x != '', [match] + sublevel))
                                session().annotate('Yielding candidate “{}”'.format(candidate))
                                yield candidate


def tokenization_match(element, text, knowledge_base):
    # Constant/structural string matching
    if isinstance(element, str):
        if text.find(element) == 0:
            # This match comes from a structuring element,
            # so it doesn't appear in the tokenization
            # and is returned as an empty string.
            yield ('', text[len(element):])
            return
        else:
            # No match found
            return

    elif is_atom(element, 'token'):
        yield from match_single_token(text, knowledge_base)
        return

    raise NotImplementedError()


def match_single_token(text, knowledge_base):
    found_token = False
    for token in knowledge_base.knowledge.keys():
        if text.find(token) == 0:
            yield token, text[len(token):]
            found_token = True

    if found_token:
        return

    session().annotate('No token found at the start of “{}”'.format(text))
    session().annotate('Using structural elements to infer it')
    # TODO: review this when multiple structural elements are available
    for se in knowledge_base.structural_elements:
        session().annotate('Looking for se “{}” in “{}”'.format(se, text))
        position = text.find(se, 0)
        found = position > 0  # 0 is not considered a valid position for this kind of split
        if found:
            session().annotate('Found “{}”, inferring “{}”'.format(se, text[:position]))
            yield text[:position], text[position:]

    session().annotate('No structural element or token found, inferring only token remaining')
    yield text, ''

    # Using other tokens for cutoff
    for token in knowledge_base.knowledge.keys():
        session().annotate('Looking for token “{}” in “{}”'.format(token, text))
        position = text.find(token)
        found = position >= 0
        if found:
            session().annotate('Found “{}”, in position “{}”'.format(token, position))
            yield text[:position], text[position:]


def integrate_tokenization(knowledge_base, example):
    text = example['text']
    tokens = example['tokens']
    meaning = example.get('meaning')

    return integrate_token_to_text_matching(knowledge_base, text, tokens)
def integrate_token_to_text_matching(knowledge_base, text, tokens):
    texts = [text]

    # Convert to tokens
    for token_id, token in enumerate(tokens):
        # Look for the token in the remaining text fragments
        for i, text in enumerate(texts):
            if isinstance(text, Atom):
                # Skip fragments that were already replaced by a token atom
                continue

            if token in text:
                before, after = text.split(token, maxsplit=1)
                texts = (texts[:i]
                         + [before]
                         + [a('token')]
                         + [after]
                         + texts[i + 1:])
                break
        else:
            raise Exception('Token not found')

    # Remove leftovers from splits
    texts = list(filter(lambda x: x != '', texts))
    session().log("Tokenized as {} over {}".format(texts, tokens))

    for i, element in enumerate(texts[:-1]):
        learn_token_pair(element, texts[i + 1], knowledge_base)

    return tokens


def learn_token_pair(precedent, consequent, knowledge_base):
    knowledge_base.add_token_pair(precedent, consequent)


def pick_one_tokenization(options, knowledge_base):
    '''
    Heuristic function to pick the most probable tokenization.

    Prefers candidates whose splits contain fewer structural elements,
    then fewer unknown tokens, and finally more splits overall.
    '''
    options = list(options)
    with session().log("Picking among: {} options".format(len(options))):
        session().log("Options: \n{}".format('\n'.join(map(str, options))))
        return pick_by_score(options,
                             [
                                 # Fewer splits containing structural elements
                                 lambda tokenization: sum(map(
                                     lambda split: sum(map(
                                         lambda se: se in split,
                                         knowledge_base.structural_elements
                                     )),
                                     tokenization)),

                                 # Fewer unknown tokens
                                 lambda tokenization: len(list(filter(
                                     lambda token: ((token not in knowledge_base.knowledge.keys())
                                                    and (token not in knowledge_base.structural_elements)),
                                     tokenization))),

                                 # More splits (negated, as the lowest score wins)
                                 lambda tokenization: -len(tokenization),
                             ])


def pick_by_score(options, heuristics):
    for heuristic in heuristics:
        assert len(options) > 0

        options = list(map(lambda opt: (heuristic(opt), opt), options))
        sorted_options = sorted(options, key=lambda x: x[0], reverse=False)

        # Keep every option that matches the best (lowest) score
        heuristic_cutoff = sorted_options[0][0]
        session().annotate(sorted_options)
        pass_heuristic = [opt for (score, opt) in sorted_options if score <= heuristic_cutoff]
        options = pass_heuristic

        session().log("{} finalists: \n{}".format(
            len(options),
            '\n'.join(map(str, options))))

    return options[0]
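

# Illustrative usage sketch: how to_tokens() and pick_one_tokenization() fit
# together. The stub knowledge base below is hypothetical; it only provides
# the attributes and methods this module relies on (`knowledge`,
# `structural_elements`, `expected_token_after_precedent()` and
# `add_token_pair()`). Running it assumes the package's org-mode session
# logger is available, i.e. the file is executed as a module inside the package.
if __name__ == '__main__':
    class _StubKnowledgeBase:
        def __init__(self):
            self.knowledge = {'hello': {}, 'world': {}}
            self.structural_elements = [' ']
            self.token_pairs = set()

        def expected_token_after_precedent(self, precedent=None):
            # Offer the generic token atom plus every structural element
            return [a('token')] + list(self.structural_elements)

        def add_token_pair(self, precedent, consequent):
            self.token_pairs.add((precedent, consequent))

    kb = _StubKnowledgeBase()
    options = to_tokens(kb, 'hello world')
    # Expected to pick ['hello', 'world'] among the candidate tokenizations
    print(pick_one_tokenization(options, kb))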