From e73ce5d480ec2a963409cd0e4e935ab86d3825bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergio=20Mart=C3=ADnez=20Portela?= Date: Mon, 2 Nov 2020 20:39:16 +0100 Subject: [PATCH] Format with black, use tokens for markup segmentation. - Don't use trees in first instance as interleaving might be lossy. --- org_dom/org_dom.py | 526 ++++++++++++++++++---------------- org_dom/utils.py | 4 +- tests/02-markup.org | 2 + tests/test_dom.py | 36 ++- tests/utils/dom_assertions.py | 62 ++-- 5 files changed, 356 insertions(+), 274 deletions(-) diff --git a/org_dom/org_dom.py b/org_dom/org_dom.py index ff5fdac..98fa5ff 100644 --- a/org_dom/org_dom.py +++ b/org_dom/org_dom.py @@ -1,11 +1,12 @@ +import collections import logging import re -import collections +from enum import Enum from typing import List, Tuple BASE_ENVIRONMENT = { - 'org-footnote-section': 'Footnotes', - 'org-options-keywords': ( + "org-footnote-section": "Footnotes", + "org-options-keywords": ( "ARCHIVE:", "AUTHOR:", "BIND:", @@ -30,52 +31,103 @@ BASE_ENVIRONMENT = { "SEQ_TODO:", "SETUPFILE:", "STARTUP:", - "TAGS:" - "TITLE:", + "TAGS:" "TITLE:", "TODO:", "TYP_TODO:", "SELECT_TAGS:", - "EXCLUDE_TAGS:" + "EXCLUDE_TAGS:", ), } -HEADLINE_RE = re.compile(r'^(?P\*+) (?P\s*)(?P.*)$') -KEYWORDS_RE = re.compile(r'^(?P\s*)#\+(?P[^:\[]+)(\[(?P[^\]]*)\])?:(?P\s*)(?P.*)$') -PROPERTY_DRAWER_RE = re.compile(r'^(?P\s*):PROPERTIES:(?P\s*)$') -DRAWER_END_RE = re.compile(r'^(?P\s*):END:(?P\s*)$') -NODE_PROPERTIES_RE = re.compile(r'^(?P\s*):(?P[^+:]+)(?P\+)?:(?P\s*)(?P.*)$') -RAW_LINE_RE = re.compile(r'^\s*([^\s#:*]|$)') -BASE_TIME_STAMP_RE = r'(?P\d{4})-(?P\d{2})-(?P\d{2}) (?P[^ ]+)( (?P\d{1,2}):(?P\d{1,2})(--(?P\d{1,2}):(?P\d{1,2}))?)?' +HEADLINE_RE = re.compile(r"^(?P\*+) (?P\s*)(?P.*)$") +KEYWORDS_RE = re.compile( + r"^(?P\s*)#\+(?P[^:\[]+)(\[(?P[^\]]*)\])?:(?P\s*)(?P.*)$" +) +PROPERTY_DRAWER_RE = re.compile( + r"^(?P\s*):PROPERTIES:(?P\s*)$" +) +DRAWER_END_RE = re.compile(r"^(?P\s*):END:(?P\s*)$") +NODE_PROPERTIES_RE = re.compile( + r"^(?P\s*):(?P[^+:]+)(?P\+)?:(?P\s*)(?P.*)$" +) +RAW_LINE_RE = re.compile(r"^\s*([^\s#:*]|$)") +BASE_TIME_STAMP_RE = r"(?P\d{4})-(?P\d{2})-(?P\d{2}) (?P[^ ]+)( (?P\d{1,2}):(?P\d{1,2})(--(?P\d{1,2}):(?P\d{1,2}))?)?" -ACTIVE_TIME_STAMP_RE = re.compile(r'<{}>'.format(BASE_TIME_STAMP_RE)) -INACTIVE_TIME_STAMP_RE = re.compile(r'\[{}\]'.format(BASE_TIME_STAMP_RE)) +ACTIVE_TIME_STAMP_RE = re.compile(r"<{}>".format(BASE_TIME_STAMP_RE)) +INACTIVE_TIME_STAMP_RE = re.compile(r"\[{}\]".format(BASE_TIME_STAMP_RE)) # BASE_TIME_RANGE_RE = (r'(?P\d{4})-(?P\d{2})-(?P\d{2}) (?P[^ ]+)((?P\d{1,2}):(?P\d{1,2}))?', # r'(?P\d{4})-(?P\d{2})-(?P\d{2}) (?P[^ ]+)((?P\d{1,2}):(?P\d{1,2}))?') -Headline = collections.namedtuple('Headline', ('start_line', 'depth', - 'orig', - 'properties', 'keywords', - 'priority_start', 'priority', - 'title_start', 'title', - 'tags_start', 'tags', - 'contents', - 'children', - 'structural', -)) +Headline = collections.namedtuple( + "Headline", + ( + "start_line", + "depth", + "orig", + "properties", + "keywords", + "priority_start", + "priority", + "title_start", + "title", + "tags_start", + "tags", + "contents", + "children", + "structural", + ), +) -RawLine = collections.namedtuple('RawLine', ('linenum', 'line')) -Keyword = collections.namedtuple('Keyword', ('linenum', 'match', 'key', 'value', 'options')) -Property = collections.namedtuple('Property', ('linenum', 'match', 'key', 'value', 'options')) +RawLine = collections.namedtuple("RawLine", ("linenum", "line")) +Keyword = collections.namedtuple( + "Keyword", ("linenum", "match", "key", "value", "options") +) +Property = collections.namedtuple( + "Property", ("linenum", "match", "key", "value", "options") +) # @TODO How are [YYYY-MM-DD HH:mm--HH:mm] and ([... HH:mm]--[... HH:mm]) differentiated ? # @TODO Consider recurrence annotations -TimeRange = collections.namedtuple('TimeRange', ('start_time', 'end_time')) -Timestamp = collections.namedtuple('Timestamp', ('active', 'year', 'month', 'day', 'dow', 'hour', 'minute')) +TimeRange = collections.namedtuple("TimeRange", ("start_time", "end_time")) +Timestamp = collections.namedtuple( + "Timestamp", ("active", "year", "month", "day", "dow", "hour", "minute") +) + + +class MarkerType(Enum): + NO_MODE = 0b0 + BOLD_MODE = 0b1 + CODE_MODE = 0b10 + ITALIC_MODE = 0b100 + STRIKE_MODE = 0b1000 + UNDERLINED_MODE = 0b10000 + VERBATIM_MODE = 0b100000 + +MARKERS = { + "*": MarkerType.BOLD_MODE, + "~": MarkerType.CODE_MODE, + "/": MarkerType.ITALIC_MODE, + "+": MarkerType.STRIKE_MODE, + "_": MarkerType.UNDERLINED_MODE, + "=": MarkerType.VERBATIM_MODE, +} + +ModeToMarker = {} + +for tok, mode in MARKERS.items(): + ModeToMarker[mode] = tok + +MarkerToken = collections.namedtuple("MarkerToken", ("closing", "tok_type")) + +BEGIN_PROPERTIES = "OPEN_PROPERTIES" +END_PROPERTIES = "CLOSE_PROPERTIES" + +def token_from_type(tok_type): + print(ModeToMarker, tok_type) + return ModeToMarker[tok_type] -BEGIN_PROPERTIES = 'OPEN_PROPERTIES' -END_PROPERTIES = 'CLOSE_PROPERTIES' def parse_org_time(value): if m := ACTIVE_TIME_STAMP_RE.match(value): @@ -85,29 +137,57 @@ def parse_org_time(value): else: return None - if m.group('end_hour'): - return TimeRange(Timestamp(active, int(m.group('year')), int(m.group('month')), int(m.group('day')), m.group('dow'), int(m.group('start_hour')), int(m.group('start_minute'))), - Timestamp(active, int(m.group('year')), int(m.group('month')), int(m.group('day')), m.group('dow'), int(m.group('end_hour')), int(m.group('end_minute')))) - return Timestamp(active, int(m.group('year')), int(m.group('month')), int(m.group('day')), m.group('dow'), int(m.group('start_hour')), int(m.group('start_minute'))) + if m.group("end_hour"): + return TimeRange( + Timestamp( + active, + int(m.group("year")), + int(m.group("month")), + int(m.group("day")), + m.group("dow"), + int(m.group("start_hour")), + int(m.group("start_minute")), + ), + Timestamp( + active, + int(m.group("year")), + int(m.group("month")), + int(m.group("day")), + m.group("dow"), + int(m.group("end_hour")), + int(m.group("end_minute")), + ), + ) + return Timestamp( + active, + int(m.group("year")), + int(m.group("month")), + int(m.group("day")), + m.group("dow"), + int(m.group("start_hour")), + int(m.group("start_minute")), + ) + def timestamp_to_string(ts): - date = '{year}-{month:02d}-{day:02d}'.format( - year=ts.year, - month=ts.month, - day=ts.day + date = "{year}-{month:02d}-{day:02d}".format( + year=ts.year, month=ts.month, day=ts.day ) if ts.dow: - date = date + ' ' + ts.dow + date = date + " " + ts.dow if ts.hour is not None: - base = '{date} {hour:02}:{minute:02d}'.format(date=date, hour=ts.hour, minute=ts.minute) + base = "{date} {hour:02}:{minute:02d}".format( + date=date, hour=ts.hour, minute=ts.minute + ) else: base = date if ts.active: - return '<{}>'.format(base) + return "<{}>".format(base) else: - return '[{}]'.format(base) + return "[{}]".format(base) + def get_raw(doc): if isinstance(doc, str): @@ -115,6 +195,7 @@ def get_raw(doc): else: return doc.get_raw() + class Line: def __init__(self, linenum, contents): self.linenum = linenum @@ -127,7 +208,8 @@ class Line: rawchunks.append(chunk) else: rawchunks.append(chunk.get_raw()) - return ''.join(rawchunks) + '\n' + return "".join(rawchunks) + "\n" + class Text: def __init__(self, contents, line): @@ -135,104 +217,122 @@ class Text: self.linenum = line def get_raw(self): - raw = ''.join(self.contents) - return raw + contents = [] + for chunk in self.contents: + if isinstance(chunk, str): + contents.append(chunk) + else: + assert isinstance(chunk, MarkerToken) + contents.append(token_from_type(chunk.tok_type)) + return ''.join(contents) + class Bold: - Marker = '*' + Marker = "*" def __init__(self, contents, line): self.contents = contents def get_raw(self): - raw = ''.join(map(get_raw, self.contents)) + raw = "".join(map(get_raw, self.contents)) return f"{self.Marker}{raw}{self.Marker}" + class Code: - Marker = '~' + Marker = "~" def __init__(self, contents, line): self.contents = contents def get_raw(self): - raw = ''.join(map(get_raw, self.contents)) + raw = "".join(map(get_raw, self.contents)) return f"{self.Marker}{raw}{self.Marker}" + class Italic: - Marker = '/' + Marker = "/" def __init__(self, contents, line): self.contents = contents def get_raw(self): - raw = ''.join(map(get_raw, self.contents)) + raw = "".join(map(get_raw, self.contents)) return f"{self.Marker}{raw}{self.Marker}" + class Strike: - Marker = '+' + Marker = "+" def __init__(self, contents, line): self.contents = contents def get_raw(self): - raw = ''.join(map(get_raw, self.contents)) + raw = "".join(map(get_raw, self.contents)) return f"{self.Marker}{raw}{self.Marker}" + class Underlined: - Marker = '_' + Marker = "_" def __init__(self, contents, line): self.contents = contents def get_raw(self): - raw = ''.join(map(get_raw, self.contents)) + raw = "".join(map(get_raw, self.contents)) return f"{self.Marker}{raw}{self.Marker}" + class Verbatim: - Marker = '=' + Marker = "=" def __init__(self, contents, line): self.contents = contents def get_raw(self): - raw = ''.join(map(get_raw, self.contents)) + raw = "".join(map(get_raw, self.contents)) return f"{self.Marker}{raw}{self.Marker}" def is_pre(char: str) -> bool: if isinstance(char, str): - return char in '\n\r\t -({\'"' + return char in "\n\r\t -({'\"" else: return True + def is_marker(char: str) -> bool: if isinstance(char, str): - return char in '*=/+_~' + return char in "*=/+_~" else: return False + def is_border(char: str) -> bool: if isinstance(char, str): - return char not in '\n\r\t ' + return char not in "\n\r\t " else: return False + def is_body(char: str) -> bool: if isinstance(char, str): return True else: return False + def is_post(char: str) -> bool: if isinstance(char, str): - return char in '-.,;:!?\')}["' + return char in "-.,;:!?')}[\"" else: return False + TOKEN_TYPE_TEXT = 0 TOKEN_TYPE_OPEN_MARKER = 1 TOKEN_TYPE_CLOSE_MARKER = 2 + def tokenize_contents(contents: str): tokens = [] last_char = None @@ -244,17 +344,17 @@ def tokenize_contents(contents: str): has_changed = False if ( - (i not in closes) - and is_marker(char) - and is_pre(last_char) - and ((i + 1 < len(contents)) - and is_border(contents[i + 1]))): + (i not in closes) + and is_marker(char) + and is_pre(last_char) + and ((i + 1 < len(contents)) and is_border(contents[i + 1])) + ): is_valid_mark = False # Check that is closed later text_in_line = True for j in range(i, len(contents) - 1): - if contents[j] == '\n': + if contents[j] == "\n": if not text_in_line: break text_in_line = False @@ -267,13 +367,13 @@ def tokenize_contents(contents: str): if is_valid_mark: if len(text) > 0: - tokens.append((TOKEN_TYPE_TEXT, ''.join(text))) + tokens.append((TOKEN_TYPE_TEXT, "".join(text))) text = [] tokens.append((TOKEN_TYPE_OPEN_MARKER, char)) has_changed = True elif i in closes: if len(text) > 0: - tokens.append((TOKEN_TYPE_TEXT, ''.join(text))) + tokens.append((TOKEN_TYPE_TEXT, "".join(text))) text = [] tokens.append((TOKEN_TYPE_CLOSE_MARKER, char)) has_changed = True @@ -283,156 +383,57 @@ def tokenize_contents(contents: str): last_char = char if len(text) > 0: - tokens.append((TOKEN_TYPE_TEXT, ''.join(text))) + tokens.append((TOKEN_TYPE_TEXT, "".join(text))) return tokens -def parse_contents(raw_contents:List[RawLine]): - NO_MODE = 0b0 - BOLD_MODE = 0b1 - CODE_MODE = 0b10 - ITALIC_MODE = 0b100 - STRIKE_MODE = 0b1000 - UNDERLINED_MODE = 0b10000 - VERBATIM_MODE = 0b100000 - - MARKERS = { - '*': BOLD_MODE, - '~': CODE_MODE, - '/': ITALIC_MODE, - '+': STRIKE_MODE, - '_': UNDERLINED_MODE, - '=': VERBATIM_MODE, - } - MODES = ( - (BOLD_MODE, Bold), - (CODE_MODE, Code), - (ITALIC_MODE, Italic), - (STRIKE_MODE, Strike), - (UNDERLINED_MODE, Underlined), - (VERBATIM_MODE, Verbatim), - ) - _MODES = { - BOLD_MODE: Bold, - CODE_MODE: Code, - ITALIC_MODE: Italic, - STRIKE_MODE: Strike, - UNDERLINED_MODE: Underlined, - VERBATIM_MODE: Verbatim, - } - - mode = NO_MODE - escaped = False - - chunk = [] - inline = [] - chunks = [] - - linenum = start_linenum = raw_contents[0].linenum +def parse_contents(raw_contents: List[RawLine]): contents_buff = [] for line in raw_contents: contents_buff.append(line.line) - contents = '\n'.join(contents_buff) + contents = "\n".join(contents_buff) tokens = tokenize_contents(contents) + current_line = raw_contents[0].linenum - # Use tokens to tag chunks of text with it's container type - for (tok_type, tok_val) in tokens: - if tok_type == TOKEN_TYPE_TEXT: - chunks.append((mode, tok_val)) - elif tok_type == TOKEN_TYPE_OPEN_MARKER: - mode = mode | MARKERS[tok_val] - elif tok_type == TOKEN_TYPE_OPEN_MARKER: - mode = mode ^ MARKERS[tok_val] - - # Convert those chunks to a tree - def tree_for_tag(tag, in_mode): - tree = [] - for (mask, mode) in MODES: - if (mask & tag) and not (mask & in_mode): - tree.append(mode) - print(tree) - if len(tree) == 0: - return Text - - - if len(raw_contents) > 0: - current_line = raw_contents[0].linenum - - # tree = [] - # pos = [] - # print('\n'.join(map(str, chunks))) - # for (tag, chunk) in chunks: - # if pos == []: - # tree.append(tree_for_tag(tag, NO_MODE)(chunk, line=current_line)) - # pos.append(tree[-1]) - # else: - # raise NotImplementedError() - - # current_line += chunk.count('\n') - - - tree = [] - mode_tree = [] contents = [] # Use tokens to tag chunks of text with it's container type for (tok_type, tok_val) in tokens: if tok_type == TOKEN_TYPE_TEXT: - if len(mode_tree) == 0: - tree.append(Text(tok_val, current_line)) - else: - contents[-1].append(tok_val) - - current_line += chunk.count('\n') - + contents.append(tok_val) elif tok_type == TOKEN_TYPE_OPEN_MARKER: - mode_tree.append(_MODES[MARKERS[tok_val]]) - contents.append([]) - + contents.append(MarkerToken(False, MARKERS[tok_val])) elif tok_type == TOKEN_TYPE_CLOSE_MARKER: - mode = _MODES[MARKERS[tok_val]] - matching_mode = mode_tree.pop() - assert mode == matching_mode - value = mode(contents.pop(), current_line) - current_line += chunk.count('\n') + contents.append(MarkerToken(True, MARKERS[tok_val])) - if len(mode_tree) == 0: # Closed branch of tree - tree.append(value) - else: - print("{} <- {}".format(mode_tree[-1], mode)) - contents[-1].append(value) + return [Text(contents, current_line)] - current_line += chunk.count('\n') - - if len(tree) > 3: - print("L", len(tree)) - print("F:", tree) - return tree def parse_headline(hl) -> Headline: - stars = hl['orig'].group('stars') + stars = hl["orig"].group("stars") depth = len(stars) # TODO: Parse line for priority, cookies and tags - line = hl['orig'].group('line') + line = hl["orig"].group("line") title = line.strip() - contents = parse_contents(hl['contents']) + contents = parse_contents(hl["contents"]) - return Headline(start_line=hl['linenum'], - depth=depth, - orig=hl['orig'], - title=title, - contents=contents, - children=[parse_headline(child) for child in hl['children']], - keywords=hl['keywords'], - properties=hl['properties'], - structural=hl['structural'], - title_start=None, - priority=None, - priority_start=None, - tags_start=None, - tags=None, + return Headline( + start_line=hl["linenum"], + depth=depth, + orig=hl["orig"], + title=title, + contents=contents, + children=[parse_headline(child) for child in hl["children"]], + keywords=hl["keywords"], + properties=hl["properties"], + structural=hl["structural"], + title_start=None, + priority=None, + priority_start=None, + tags_start=None, + tags=None, ) @@ -454,35 +455,41 @@ class OrgDom: # Writing def dump_kw(self, kw): - options = kw.match.group('options') + options = kw.match.group("options") if not options: - options = '' + options = "" - return (kw.linenum, - '{indentation}#+{key}{options}:{spacing}{value}'.format( - indentation=kw.match.group('indentation'), - key=kw.key, - options=kw.options, - spacing=kw.match.group('spacing'), - value=kw.value, - )) + return ( + kw.linenum, + "{indentation}#+{key}{options}:{spacing}{value}".format( + indentation=kw.match.group("indentation"), + key=kw.key, + options=kw.options, + spacing=kw.match.group("spacing"), + value=kw.value, + ), + ) def dump_property(self, prop: Property): - plus = prop.match.group('plus') - if plus is None: plus = '' + plus = prop.match.group("plus") + if plus is None: + plus = "" if isinstance(prop.value, Timestamp): value = timestamp_to_string(prop.value) else: value = prop.value - return (prop.linenum, '{indentation}:{key}{plus}:{spacing}{value}'.format( - indentation=prop.match.group('indentation'), - key=prop.key, - plus=plus, - spacing=prop.match.group('spacing'), - value=value, - )) + return ( + prop.linenum, + "{indentation}:{key}{plus}:{spacing}{value}".format( + indentation=prop.match.group("indentation"), + key=prop.key, + plus=plus, + spacing=prop.match.group("spacing"), + value=value, + ), + ) def dump_contents(self, raw): if isinstance(raw, RawLine): @@ -494,7 +501,9 @@ class OrgDom: return (structural[0], structural[1]) def dump_headline(self, headline): - yield '*' * headline.depth + ' ' + headline.orig.group('spacing') + headline.title + yield "*" * headline.depth + " " + headline.orig.group( + "spacing" + ) + headline.title lines = [] KW_T = 0 @@ -523,21 +532,31 @@ class OrgDom: if ltype == PROPERTIES_T and last_type not in (STRUCTURAL_T, PROPERTIES_T): # No structural opening - structured_lines.append(' ' * content.index(':') + ':PROPERTIES:\n') - logging.warning("Added structural: ".format(line[1][0], structured_lines[-1].strip())) - elif ltype not in (STRUCTURAL_T, PROPERTIES_T) and last_type == PROPERTIES_T: + structured_lines.append(" " * content.index(":") + ":PROPERTIES:\n") + logging.warning( + "Added structural: ".format( + line[1][0], structured_lines[-1].strip() + ) + ) + elif ( + ltype not in (STRUCTURAL_T, PROPERTIES_T) and last_type == PROPERTIES_T + ): # No structural closing last_line = lines[i - 1][1][1] - structured_lines.append(' ' * last_line.index(':') + ':END:\n') - logging.warning("Added structural:{}: {}".format(line[1][0], structured_lines[-1].strip())) + structured_lines.append(" " * last_line.index(":") + ":END:\n") + logging.warning( + "Added structural:{}: {}".format( + line[1][0], structured_lines[-1].strip() + ) + ) elif ltype != CONTENT_T: - content = content + '\n' + content = content + "\n" last_type = ltype structured_lines.append(content) - yield ''.join(structured_lines) + yield "".join(structured_lines) for child in headline.children: yield from self.dump_headline(child) @@ -555,8 +574,8 @@ class OrgDom: for headline in self.headlines: yield from self.dump_headline(headline) -class OrgDomReader: +class OrgDomReader: def __init__(self): self.headlines: List[Headline] = [] self.keywords: List[Property] = [] @@ -569,18 +588,18 @@ class OrgDomReader: ## Construction def add_headline(self, linenum: int, match: re.Match) -> int: # Position reader on the proper headline - stars = match.group('stars') + stars = match.group("stars") depth = len(stars) headline = { - 'linenum': linenum, - 'orig': match, - 'title': match.group('line'), - 'contents': [], - 'children': [], - 'keywords': [], - 'properties': [], - 'structural': [], + "linenum": linenum, + "orig": match, + "title": match.group("line"), + "contents": [], + "children": [], + "keywords": [], + "properties": [], + "structural": [], } while (depth - 2) > len(self.headline_hierarchy): @@ -592,41 +611,46 @@ class OrgDomReader: if depth == 1: self.headlines.append(headline) else: - self.headline_hierarchy[-1]['children'].append(headline) + self.headline_hierarchy[-1]["children"].append(headline) self.headline_hierarchy.append(headline) - def add_keyword_line(self, linenum: int, match: re.Match) -> int: - options = match.group('options') - kw = Keyword(linenum, match, match.group('key'), match.group('value'), options if options is not None else '') + options = match.group("options") + kw = Keyword( + linenum, + match, + match.group("key"), + match.group("value"), + options if options is not None else "", + ) if len(self.headline_hierarchy) == 0: self.keywords.append(kw) else: - self.headline_hierarchy[-1]['keywords'].append(kw) + self.headline_hierarchy[-1]["keywords"].append(kw) def add_raw_line(self, linenum: int, line: str) -> int: raw = RawLine(linenum, line) if len(self.headline_hierarchy) == 0: self.contents.append(raw) else: - self.headline_hierarchy[-1]['contents'].append(raw) + self.headline_hierarchy[-1]["contents"].append(raw) def add_property_drawer_line(self, linenum: int, line: str, match: re.Match) -> int: - self.current_drawer = self.headline_hierarchy[-1]['properties'] - self.headline_hierarchy[-1]['structural'].append((linenum, line)) + self.current_drawer = self.headline_hierarchy[-1]["properties"] + self.headline_hierarchy[-1]["structural"].append((linenum, line)) def add_drawer_end_line(self, linenum: int, line: str, match: re.Match) -> int: self.current_drawer = None - self.headline_hierarchy[-1]['structural'].append((linenum, line)) + self.headline_hierarchy[-1]["structural"].append((linenum, line)) def add_node_properties_line(self, linenum: int, match: re.Match) -> int: - key = match.group('key') - value = match.group('value').strip() + key = match.group("key") + value = match.group("value").strip() - if (value.count('>--<') == 1) or (value.count(']--[') == 1): + if (value.count(">--<") == 1) or (value.count("]--[") == 1): # Time ranges with two different dates # @TODO properly consider "=> DURATION" section - chunks = value.split('=').split('--') + chunks = value.split("=").split("--") as_time_range = parse_org_time(chunks[0], chunks[1]) if (as_time_range[0] is not None) and (as_time_range[1] is not None): value = TimeRange(as_time_range[0], as_time_range[1]) @@ -636,7 +660,7 @@ class OrgDomReader: self.current_drawer.append(Property(linenum, match, key, value, None)) def read(self, s, environment): - lines = s.split('\n') + lines = s.split("\n") reader = enumerate(lines) for linenum, line in reader: @@ -653,7 +677,7 @@ class OrgDomReader: elif m := NODE_PROPERTIES_RE.match(line): self.add_node_properties_line(linenum, m) else: - raise NotImplementedError('{}: ‘{}’'.format(linenum, line)) + raise NotImplementedError("{}: ‘{}’".format(linenum, line)) def loads(s, environment=BASE_ENVIRONMENT, extra_cautious=False): @@ -662,7 +686,9 @@ def loads(s, environment=BASE_ENVIRONMENT, extra_cautious=False): dom = doc.finalize() if extra_cautious: # Check that all options can be properly re-serialized if dumps(dom) != s: - raise NotImplementedError("Error re-serializing, file uses something not implemented") + raise NotImplementedError( + "Error re-serializing, file uses something not implemented" + ) return dom @@ -672,6 +698,6 @@ def load(f, environment=BASE_ENVIRONMENT, extra_cautious=False): def dumps(doc): dump = list(doc.dump()) - result = '\n'.join(dump) + result = "\n".join(dump) print(result) return result diff --git a/org_dom/utils.py b/org_dom/utils.py index ce77add..34747a0 100644 --- a/org_dom/utils.py +++ b/org_dom/utils.py @@ -1,4 +1,6 @@ -from .org_dom import Headline, Line, RawLine, Text, Bold, Code, Italic, Strike, Underlined, Verbatim +from .org_dom import (Bold, Code, Headline, Italic, Line, RawLine, Strike, + Text, Underlined, Verbatim) + def get_hl_raw_contents(doc: Headline) -> str: lines = [] diff --git a/tests/02-markup.org b/tests/02-markup.org index 41c2bb2..1de34da 100644 --- a/tests/02-markup.org +++ b/tests/02-markup.org @@ -22,6 +22,8 @@ This is a nested *bold =verbatim /italic +strike _underlined ~code .~ ._ .+ ./ .= .* + This is a interleaved *bold =verbatim /italic +strike _underlined ~code .* .= ./ .+ ._ .~ + This is a _ non-underlined phrase because an incorrectly placed content _. This is a _ non-underlined phrase because an incorrectly placed content beginning_. diff --git a/tests/test_dom.py b/tests/test_dom.py index 7e855ba..9321502 100644 --- a/tests/test_dom.py +++ b/tests/test_dom.py @@ -5,7 +5,7 @@ from datetime import datetime as DT from org_dom import dumps, load, loads from utils.dom_assertions import (BOLD, CODE, HL, ITALIC, SPAN, STRIKE, - UNDERLINED, VERBATIM, WEB_LINK, Dom,) + UNDERLINED, VERBATIM, WEB_LINK, Dom, Tokens) DIR = os.path.dirname(os.path.abspath(__file__)) @@ -47,7 +47,7 @@ class TestSerde(unittest.TestCase): self.assertEqual(dumps(doc), orig) def test_markup_file_02(self): - self.maxDiff = 1024 + self.maxDiff = 10000 with open(os.path.join(DIR, '02-markup.org')) as f: doc = load(f) @@ -82,11 +82,33 @@ class TestSerde(unittest.TestCase): SPAN(" This is a nested ", BOLD(["bold ", VERBATIM(["verbatim ", ITALIC(["italic ", STRIKE(["strike ", UNDERLINED(["underlined ", CODE("code ."), " ."]), " ."]), " ."]), " ."]), " ."])), SPAN("\n"), - # SPAN(""), - # # TODO: THIS IS INTERLEAVED, not nested - # In ORG: This is a interleaved *bold =verbatim /italic +strike _underlined ~code .* .= ./ .+ ._ .~ - # SPAN(" This is a nested ", BOLD(["bold ", VERBATIM(["verbatim ", ITALIC(["italic ", STRIKE(["strike ", UNDERLINED(["underlined ", CODE("code ."), " ."]), " ."]), " ."]), " ."]), " ."])), - # SPAN(""), + SPAN("\n"), + # THIS IS INTERLEAVED, not nested + SPAN([" This is a interleaved ", + Tokens.BOLD_START, + "bold ", + Tokens.VERBATIM_START, + "verbatim ", + Tokens.ITALIC_START, + "italic ", + Tokens.STRIKE_START, + "strike ", + Tokens.UNDERLINED_START, + "underlined ", + Tokens.CODE_START, + "code .", + Tokens.BOLD_END, + " .", + Tokens.VERBATIM_END, + " .", + Tokens.ITALIC_END, + " .", + Tokens.STRIKE_END, + " .", + Tokens.UNDERLINED_END, + " .", + Tokens.CODE_END, + "\n"]), SPAN("\n"), SPAN(" This is a _ non-underlined phrase because an incorrectly placed content _.\n"), diff --git a/tests/utils/dom_assertions.py b/tests/utils/dom_assertions.py index b8aafa8..0a69372 100644 --- a/tests/utils/dom_assertions.py +++ b/tests/utils/dom_assertions.py @@ -2,7 +2,8 @@ import collections import unittest from datetime import datetime -from org_dom import Line, Text, Bold, Code, Italic, Strike, Underlined, Verbatim, get_raw_contents +from org_dom import (Bold, Code, Italic, Line, Strike, Text, Underlined, + Verbatim, get_raw_contents) def timestamp_to_datetime(ts): @@ -13,7 +14,7 @@ def get_raw(doc): if isinstance(doc, str): return doc elif isinstance(doc, list): - return ''.join([get_raw(e) for e in doc]) + return "".join([get_raw(e) for e in doc]) else: return doc.get_raw() @@ -44,8 +45,7 @@ class Dom: test_case.assertEqual(len(doc.getTopHeadlines()), 0, "Top") else: doc_headlines = doc.getTopHeadlines() - test_case.assertEqual(len(doc_headlines), len(self.children), - "Top") + test_case.assertEqual(len(doc_headlines), len(self.children), "Top") for i, children in enumerate(self.children): children.assert_matches(test_case, doc_headlines[i]) @@ -72,7 +72,8 @@ class HL: test_case.assertEqual(doc_props[i].key, prop[0]) if isinstance(prop[1], datetime): test_case.assertEqual( - timestamp_to_datetime(doc_props[i].value), prop[1]) + timestamp_to_datetime(doc_props[i].value), prop[1] + ) test_case.assertEqual(get_raw_contents(doc), self.get_raw()) @@ -81,14 +82,13 @@ class HL: test_case.assertEqual(len(doc.children), 0) else: doc_headlines = doc.children - test_case.assertEqual(len(doc_headlines), len(self.children), - self.title) + test_case.assertEqual(len(doc_headlines), len(self.children), self.title) for i, children in enumerate(self.children): children.assert_matches(test_case, doc_headlines[i]) def get_raw(self): - return ''.join(map(get_raw, self.content)) + return "".join(map(get_raw, self.content)) class SPAN: @@ -100,10 +100,16 @@ class SPAN: for section in self.contents: if isinstance(section, str): chunks.append(section) + elif isinstance(section, list): + for subsection in section: + if isinstance(subsection, str): + chunks.append(subsection) + else: + chunks.append(subsection.get_raw()) else: chunks.append(section.get_raw()) - return ''.join(chunks) + return "".join(chunks) def assert_matches(self, test_case, doc): if not isinstance(doc, Line): @@ -121,7 +127,7 @@ class BOLD: self.text = text def get_raw(self): - return '*{}*'.format(get_raw(self.text)) + return "*{}*".format(get_raw(self.text)) def assertEqual(self, test_case, other): test_case.assertTrue(isinstance(other, Bold)) @@ -133,29 +139,31 @@ class CODE: self.text = text def get_raw(self): - return '~{}~'.format(get_raw(self.text)) + return "~{}~".format(get_raw(self.text)) def assertEqual(self, test_case, other): test_case.assertTrue(isinstance(other, Code)) test_case.assertEqual(self.text, other.contents) + class ITALIC: def __init__(self, text): self.text = text def get_raw(self): - return '/{}/'.format(get_raw(self.text)) + return "/{}/".format(get_raw(self.text)) def assertEqual(self, test_case, other): test_case.assertTrue(isinstance(other, Italic)) test_case.assertEqual(self.text, other.contents) + class STRIKE: def __init__(self, text): self.text = text def get_raw(self): - return '+{}+'.format(get_raw(self.text)) + return "+{}+".format(get_raw(self.text)) def assertEqual(self, test_case, other): test_case.assertTrue(isinstance(other, Strike)) @@ -167,32 +175,54 @@ class UNDERLINED: self.text = text def get_raw(self): - return '_{}_'.format(get_raw(self.text)) + return "_{}_".format(get_raw(self.text)) def assertEqual(self, test_case, other): test_case.assertTrue(isinstance(other, Underlined)) test_case.assertEqual(self.text, other.contents) + class VERBATIM: def __init__(self, text): self.text = text def get_raw(self): - return '={}='.format(get_raw(self.text)) + return "={}=".format(get_raw(self.text)) def assertEqual(self, test_case, other): test_case.assertTrue(isinstance(other, Verbatim)) test_case.assertEqual(self.text, other.contents) + class WEB_LINK: def __init__(self, text, link): self.text = text self.link = link def get_raw(self): - return '[[{}][{}]]'.format(self.link, self.text) + return "[[{}][{}]]".format(self.link, self.text) def assertEqual(self, test_case, other): test_case.assertTrue(isinstance(other, WebLink)) test_case.assertEqual(self.text, other.contents) test_case.assertEqual(self.link, other.link) + + +class Tokens: + BOLD_END = "*" + BOLD_START = "*" + + VERBATIM_START = "=" + VERBATIM_END = "=" + + ITALIC_START = "/" + ITALIC_END = "/" + + STRIKE_START = "+" + STRIKE_END = "+" + + UNDERLINED_START = "_" + UNDERLINED_END = "_" + + CODE_START = "~" + CODE_END = "~"