Format with black, use tokens for markup segmentation.

- Don't use trees in the first instance, as interleaved markup might make a tree representation lossy.
This commit is contained in:
Sergio Martínez Portela 2020-11-02 20:39:16 +01:00
parent f6de69fd90
commit e73ce5d480
5 changed files with 356 additions and 274 deletions

View File

@ -1,11 +1,12 @@
import collections
import logging import logging
import re import re
import collections from enum import Enum
from typing import List, Tuple from typing import List, Tuple
BASE_ENVIRONMENT = { BASE_ENVIRONMENT = {
'org-footnote-section': 'Footnotes', "org-footnote-section": "Footnotes",
'org-options-keywords': ( "org-options-keywords": (
"ARCHIVE:", "ARCHIVE:",
"AUTHOR:", "AUTHOR:",
"BIND:", "BIND:",
@ -30,52 +31,103 @@ BASE_ENVIRONMENT = {
"SEQ_TODO:", "SEQ_TODO:",
"SETUPFILE:", "SETUPFILE:",
"STARTUP:", "STARTUP:",
"TAGS:" "TAGS:" "TITLE:",
"TITLE:",
"TODO:", "TODO:",
"TYP_TODO:", "TYP_TODO:",
"SELECT_TAGS:", "SELECT_TAGS:",
"EXCLUDE_TAGS:" "EXCLUDE_TAGS:",
), ),
} }
HEADLINE_RE = re.compile(r'^(?P<stars>\*+) (?P<spacing>\s*)(?P<line>.*)$') HEADLINE_RE = re.compile(r"^(?P<stars>\*+) (?P<spacing>\s*)(?P<line>.*)$")
KEYWORDS_RE = re.compile(r'^(?P<indentation>\s*)#\+(?P<key>[^:\[]+)(\[(?P<options>[^\]]*)\])?:(?P<spacing>\s*)(?P<value>.*)$') KEYWORDS_RE = re.compile(
PROPERTY_DRAWER_RE = re.compile(r'^(?P<indentation>\s*):PROPERTIES:(?P<end_indentation>\s*)$') r"^(?P<indentation>\s*)#\+(?P<key>[^:\[]+)(\[(?P<options>[^\]]*)\])?:(?P<spacing>\s*)(?P<value>.*)$"
DRAWER_END_RE = re.compile(r'^(?P<indentation>\s*):END:(?P<end_indentation>\s*)$') )
NODE_PROPERTIES_RE = re.compile(r'^(?P<indentation>\s*):(?P<key>[^+:]+)(?P<plus>\+)?:(?P<spacing>\s*)(?P<value>.*)$') PROPERTY_DRAWER_RE = re.compile(
RAW_LINE_RE = re.compile(r'^\s*([^\s#:*]|$)') r"^(?P<indentation>\s*):PROPERTIES:(?P<end_indentation>\s*)$"
BASE_TIME_STAMP_RE = r'(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2}) (?P<dow>[^ ]+)( (?P<start_hour>\d{1,2}):(?P<start_minute>\d{1,2})(--(?P<end_hour>\d{1,2}):(?P<end_minute>\d{1,2}))?)?' )
DRAWER_END_RE = re.compile(r"^(?P<indentation>\s*):END:(?P<end_indentation>\s*)$")
NODE_PROPERTIES_RE = re.compile(
r"^(?P<indentation>\s*):(?P<key>[^+:]+)(?P<plus>\+)?:(?P<spacing>\s*)(?P<value>.*)$"
)
RAW_LINE_RE = re.compile(r"^\s*([^\s#:*]|$)")
BASE_TIME_STAMP_RE = r"(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2}) (?P<dow>[^ ]+)( (?P<start_hour>\d{1,2}):(?P<start_minute>\d{1,2})(--(?P<end_hour>\d{1,2}):(?P<end_minute>\d{1,2}))?)?"
ACTIVE_TIME_STAMP_RE = re.compile(r'<{}>'.format(BASE_TIME_STAMP_RE)) ACTIVE_TIME_STAMP_RE = re.compile(r"<{}>".format(BASE_TIME_STAMP_RE))
INACTIVE_TIME_STAMP_RE = re.compile(r'\[{}\]'.format(BASE_TIME_STAMP_RE)) INACTIVE_TIME_STAMP_RE = re.compile(r"\[{}\]".format(BASE_TIME_STAMP_RE))
# BASE_TIME_RANGE_RE = (r'(?P<start_year>\d{4})-(?P<start_month>\d{2})-(?P<start_day>\d{2}) (?P<start_dow>[^ ]+)((?P<start_hour>\d{1,2}):(?P<start_minute>\d{1,2}))?', # BASE_TIME_RANGE_RE = (r'(?P<start_year>\d{4})-(?P<start_month>\d{2})-(?P<start_day>\d{2}) (?P<start_dow>[^ ]+)((?P<start_hour>\d{1,2}):(?P<start_minute>\d{1,2}))?',
# r'(?P<end_year>\d{4})-(?P<end_month>\d{2})-(?P<end_day>\d{2}) (?P<end_dow>[^ ]+)((?P<end_hour>\d{1,2}):(?P<end_minute>\d{1,2}))?') # r'(?P<end_year>\d{4})-(?P<end_month>\d{2})-(?P<end_day>\d{2}) (?P<end_dow>[^ ]+)((?P<end_hour>\d{1,2}):(?P<end_minute>\d{1,2}))?')
Headline = collections.namedtuple('Headline', ('start_line', 'depth', Headline = collections.namedtuple(
'orig', "Headline",
'properties', 'keywords', (
'priority_start', 'priority', "start_line",
'title_start', 'title', "depth",
'tags_start', 'tags', "orig",
'contents', "properties",
'children', "keywords",
'structural', "priority_start",
)) "priority",
"title_start",
"title",
"tags_start",
"tags",
"contents",
"children",
"structural",
),
)
RawLine = collections.namedtuple('RawLine', ('linenum', 'line')) RawLine = collections.namedtuple("RawLine", ("linenum", "line"))
Keyword = collections.namedtuple('Keyword', ('linenum', 'match', 'key', 'value', 'options')) Keyword = collections.namedtuple(
Property = collections.namedtuple('Property', ('linenum', 'match', 'key', 'value', 'options')) "Keyword", ("linenum", "match", "key", "value", "options")
)
Property = collections.namedtuple(
"Property", ("linenum", "match", "key", "value", "options")
)
# @TODO How are [YYYY-MM-DD HH:mm--HH:mm] and ([... HH:mm]--[... HH:mm]) differentiated ? # @TODO How are [YYYY-MM-DD HH:mm--HH:mm] and ([... HH:mm]--[... HH:mm]) differentiated ?
# @TODO Consider recurrence annotations # @TODO Consider recurrence annotations
TimeRange = collections.namedtuple('TimeRange', ('start_time', 'end_time')) TimeRange = collections.namedtuple("TimeRange", ("start_time", "end_time"))
Timestamp = collections.namedtuple('Timestamp', ('active', 'year', 'month', 'day', 'dow', 'hour', 'minute')) Timestamp = collections.namedtuple(
"Timestamp", ("active", "year", "month", "day", "dow", "hour", "minute")
)
class MarkerType(Enum):
NO_MODE = 0b0
BOLD_MODE = 0b1
CODE_MODE = 0b10
ITALIC_MODE = 0b100
STRIKE_MODE = 0b1000
UNDERLINED_MODE = 0b10000
VERBATIM_MODE = 0b100000
MARKERS = {
"*": MarkerType.BOLD_MODE,
"~": MarkerType.CODE_MODE,
"/": MarkerType.ITALIC_MODE,
"+": MarkerType.STRIKE_MODE,
"_": MarkerType.UNDERLINED_MODE,
"=": MarkerType.VERBATIM_MODE,
}
ModeToMarker = {}
for tok, mode in MARKERS.items():
ModeToMarker[mode] = tok
MarkerToken = collections.namedtuple("MarkerToken", ("closing", "tok_type"))
BEGIN_PROPERTIES = "OPEN_PROPERTIES"
END_PROPERTIES = "CLOSE_PROPERTIES"
def token_from_type(tok_type):
print(ModeToMarker, tok_type)
return ModeToMarker[tok_type]
BEGIN_PROPERTIES = 'OPEN_PROPERTIES'
END_PROPERTIES = 'CLOSE_PROPERTIES'
def parse_org_time(value): def parse_org_time(value):
if m := ACTIVE_TIME_STAMP_RE.match(value): if m := ACTIVE_TIME_STAMP_RE.match(value):
@ -85,29 +137,57 @@ def parse_org_time(value):
else: else:
return None return None
if m.group('end_hour'): if m.group("end_hour"):
return TimeRange(Timestamp(active, int(m.group('year')), int(m.group('month')), int(m.group('day')), m.group('dow'), int(m.group('start_hour')), int(m.group('start_minute'))), return TimeRange(
Timestamp(active, int(m.group('year')), int(m.group('month')), int(m.group('day')), m.group('dow'), int(m.group('end_hour')), int(m.group('end_minute')))) Timestamp(
return Timestamp(active, int(m.group('year')), int(m.group('month')), int(m.group('day')), m.group('dow'), int(m.group('start_hour')), int(m.group('start_minute'))) active,
int(m.group("year")),
int(m.group("month")),
int(m.group("day")),
m.group("dow"),
int(m.group("start_hour")),
int(m.group("start_minute")),
),
Timestamp(
active,
int(m.group("year")),
int(m.group("month")),
int(m.group("day")),
m.group("dow"),
int(m.group("end_hour")),
int(m.group("end_minute")),
),
)
return Timestamp(
active,
int(m.group("year")),
int(m.group("month")),
int(m.group("day")),
m.group("dow"),
int(m.group("start_hour")),
int(m.group("start_minute")),
)
def timestamp_to_string(ts): def timestamp_to_string(ts):
date = '{year}-{month:02d}-{day:02d}'.format( date = "{year}-{month:02d}-{day:02d}".format(
year=ts.year, year=ts.year, month=ts.month, day=ts.day
month=ts.month,
day=ts.day
) )
if ts.dow: if ts.dow:
date = date + ' ' + ts.dow date = date + " " + ts.dow
if ts.hour is not None: if ts.hour is not None:
base = '{date} {hour:02}:{minute:02d}'.format(date=date, hour=ts.hour, minute=ts.minute) base = "{date} {hour:02}:{minute:02d}".format(
date=date, hour=ts.hour, minute=ts.minute
)
else: else:
base = date base = date
if ts.active: if ts.active:
return '<{}>'.format(base) return "<{}>".format(base)
else: else:
return '[{}]'.format(base) return "[{}]".format(base)
def get_raw(doc): def get_raw(doc):
if isinstance(doc, str): if isinstance(doc, str):
@ -115,6 +195,7 @@ def get_raw(doc):
else: else:
return doc.get_raw() return doc.get_raw()
class Line: class Line:
def __init__(self, linenum, contents): def __init__(self, linenum, contents):
self.linenum = linenum self.linenum = linenum
@ -127,7 +208,8 @@ class Line:
rawchunks.append(chunk) rawchunks.append(chunk)
else: else:
rawchunks.append(chunk.get_raw()) rawchunks.append(chunk.get_raw())
return ''.join(rawchunks) + '\n' return "".join(rawchunks) + "\n"
class Text: class Text:
def __init__(self, contents, line): def __init__(self, contents, line):
@ -135,104 +217,122 @@ class Text:
self.linenum = line self.linenum = line
def get_raw(self): def get_raw(self):
raw = ''.join(self.contents) contents = []
return raw for chunk in self.contents:
if isinstance(chunk, str):
contents.append(chunk)
else:
assert isinstance(chunk, MarkerToken)
contents.append(token_from_type(chunk.tok_type))
return ''.join(contents)
class Bold: class Bold:
Marker = '*' Marker = "*"
def __init__(self, contents, line): def __init__(self, contents, line):
self.contents = contents self.contents = contents
def get_raw(self): def get_raw(self):
raw = ''.join(map(get_raw, self.contents)) raw = "".join(map(get_raw, self.contents))
return f"{self.Marker}{raw}{self.Marker}" return f"{self.Marker}{raw}{self.Marker}"
class Code: class Code:
Marker = '~' Marker = "~"
def __init__(self, contents, line): def __init__(self, contents, line):
self.contents = contents self.contents = contents
def get_raw(self): def get_raw(self):
raw = ''.join(map(get_raw, self.contents)) raw = "".join(map(get_raw, self.contents))
return f"{self.Marker}{raw}{self.Marker}" return f"{self.Marker}{raw}{self.Marker}"
class Italic: class Italic:
Marker = '/' Marker = "/"
def __init__(self, contents, line): def __init__(self, contents, line):
self.contents = contents self.contents = contents
def get_raw(self): def get_raw(self):
raw = ''.join(map(get_raw, self.contents)) raw = "".join(map(get_raw, self.contents))
return f"{self.Marker}{raw}{self.Marker}" return f"{self.Marker}{raw}{self.Marker}"
class Strike: class Strike:
Marker = '+' Marker = "+"
def __init__(self, contents, line): def __init__(self, contents, line):
self.contents = contents self.contents = contents
def get_raw(self): def get_raw(self):
raw = ''.join(map(get_raw, self.contents)) raw = "".join(map(get_raw, self.contents))
return f"{self.Marker}{raw}{self.Marker}" return f"{self.Marker}{raw}{self.Marker}"
class Underlined: class Underlined:
Marker = '_' Marker = "_"
def __init__(self, contents, line): def __init__(self, contents, line):
self.contents = contents self.contents = contents
def get_raw(self): def get_raw(self):
raw = ''.join(map(get_raw, self.contents)) raw = "".join(map(get_raw, self.contents))
return f"{self.Marker}{raw}{self.Marker}" return f"{self.Marker}{raw}{self.Marker}"
class Verbatim: class Verbatim:
Marker = '=' Marker = "="
def __init__(self, contents, line): def __init__(self, contents, line):
self.contents = contents self.contents = contents
def get_raw(self): def get_raw(self):
raw = ''.join(map(get_raw, self.contents)) raw = "".join(map(get_raw, self.contents))
return f"{self.Marker}{raw}{self.Marker}" return f"{self.Marker}{raw}{self.Marker}"
def is_pre(char: str) -> bool: def is_pre(char: str) -> bool:
if isinstance(char, str): if isinstance(char, str):
return char in '\n\r\t -({\'"' return char in "\n\r\t -({'\""
else: else:
return True return True
def is_marker(char: str) -> bool: def is_marker(char: str) -> bool:
if isinstance(char, str): if isinstance(char, str):
return char in '*=/+_~' return char in "*=/+_~"
else: else:
return False return False
def is_border(char: str) -> bool: def is_border(char: str) -> bool:
if isinstance(char, str): if isinstance(char, str):
return char not in '\n\r\t ' return char not in "\n\r\t "
else: else:
return False return False
def is_body(char: str) -> bool: def is_body(char: str) -> bool:
if isinstance(char, str): if isinstance(char, str):
return True return True
else: else:
return False return False
def is_post(char: str) -> bool: def is_post(char: str) -> bool:
if isinstance(char, str): if isinstance(char, str):
return char in '-.,;:!?\')}["' return char in "-.,;:!?')}[\""
else: else:
return False return False
TOKEN_TYPE_TEXT = 0 TOKEN_TYPE_TEXT = 0
TOKEN_TYPE_OPEN_MARKER = 1 TOKEN_TYPE_OPEN_MARKER = 1
TOKEN_TYPE_CLOSE_MARKER = 2 TOKEN_TYPE_CLOSE_MARKER = 2
def tokenize_contents(contents: str): def tokenize_contents(contents: str):
tokens = [] tokens = []
last_char = None last_char = None
@ -244,17 +344,17 @@ def tokenize_contents(contents: str):
has_changed = False has_changed = False
if ( if (
(i not in closes) (i not in closes)
and is_marker(char) and is_marker(char)
and is_pre(last_char) and is_pre(last_char)
and ((i + 1 < len(contents)) and ((i + 1 < len(contents)) and is_border(contents[i + 1]))
and is_border(contents[i + 1]))): ):
is_valid_mark = False is_valid_mark = False
# Check that is closed later # Check that is closed later
text_in_line = True text_in_line = True
for j in range(i, len(contents) - 1): for j in range(i, len(contents) - 1):
if contents[j] == '\n': if contents[j] == "\n":
if not text_in_line: if not text_in_line:
break break
text_in_line = False text_in_line = False
@ -267,13 +367,13 @@ def tokenize_contents(contents: str):
if is_valid_mark: if is_valid_mark:
if len(text) > 0: if len(text) > 0:
tokens.append((TOKEN_TYPE_TEXT, ''.join(text))) tokens.append((TOKEN_TYPE_TEXT, "".join(text)))
text = [] text = []
tokens.append((TOKEN_TYPE_OPEN_MARKER, char)) tokens.append((TOKEN_TYPE_OPEN_MARKER, char))
has_changed = True has_changed = True
elif i in closes: elif i in closes:
if len(text) > 0: if len(text) > 0:
tokens.append((TOKEN_TYPE_TEXT, ''.join(text))) tokens.append((TOKEN_TYPE_TEXT, "".join(text)))
text = [] text = []
tokens.append((TOKEN_TYPE_CLOSE_MARKER, char)) tokens.append((TOKEN_TYPE_CLOSE_MARKER, char))
has_changed = True has_changed = True
@ -283,156 +383,57 @@ def tokenize_contents(contents: str):
last_char = char last_char = char
if len(text) > 0: if len(text) > 0:
tokens.append((TOKEN_TYPE_TEXT, ''.join(text))) tokens.append((TOKEN_TYPE_TEXT, "".join(text)))
return tokens return tokens
def parse_contents(raw_contents:List[RawLine]): def parse_contents(raw_contents: List[RawLine]):
NO_MODE = 0b0
BOLD_MODE = 0b1
CODE_MODE = 0b10
ITALIC_MODE = 0b100
STRIKE_MODE = 0b1000
UNDERLINED_MODE = 0b10000
VERBATIM_MODE = 0b100000
MARKERS = {
'*': BOLD_MODE,
'~': CODE_MODE,
'/': ITALIC_MODE,
'+': STRIKE_MODE,
'_': UNDERLINED_MODE,
'=': VERBATIM_MODE,
}
MODES = (
(BOLD_MODE, Bold),
(CODE_MODE, Code),
(ITALIC_MODE, Italic),
(STRIKE_MODE, Strike),
(UNDERLINED_MODE, Underlined),
(VERBATIM_MODE, Verbatim),
)
_MODES = {
BOLD_MODE: Bold,
CODE_MODE: Code,
ITALIC_MODE: Italic,
STRIKE_MODE: Strike,
UNDERLINED_MODE: Underlined,
VERBATIM_MODE: Verbatim,
}
mode = NO_MODE
escaped = False
chunk = []
inline = []
chunks = []
linenum = start_linenum = raw_contents[0].linenum
contents_buff = [] contents_buff = []
for line in raw_contents: for line in raw_contents:
contents_buff.append(line.line) contents_buff.append(line.line)
contents = '\n'.join(contents_buff) contents = "\n".join(contents_buff)
tokens = tokenize_contents(contents) tokens = tokenize_contents(contents)
current_line = raw_contents[0].linenum
# Use tokens to tag chunks of text with it's container type
for (tok_type, tok_val) in tokens:
if tok_type == TOKEN_TYPE_TEXT:
chunks.append((mode, tok_val))
elif tok_type == TOKEN_TYPE_OPEN_MARKER:
mode = mode | MARKERS[tok_val]
elif tok_type == TOKEN_TYPE_OPEN_MARKER:
mode = mode ^ MARKERS[tok_val]
# Convert those chunks to a tree
def tree_for_tag(tag, in_mode):
tree = []
for (mask, mode) in MODES:
if (mask & tag) and not (mask & in_mode):
tree.append(mode)
print(tree)
if len(tree) == 0:
return Text
if len(raw_contents) > 0:
current_line = raw_contents[0].linenum
# tree = []
# pos = []
# print('\n'.join(map(str, chunks)))
# for (tag, chunk) in chunks:
# if pos == []:
# tree.append(tree_for_tag(tag, NO_MODE)(chunk, line=current_line))
# pos.append(tree[-1])
# else:
# raise NotImplementedError()
# current_line += chunk.count('\n')
tree = []
mode_tree = []
contents = [] contents = []
# Use tokens to tag chunks of text with it's container type # Use tokens to tag chunks of text with it's container type
for (tok_type, tok_val) in tokens: for (tok_type, tok_val) in tokens:
if tok_type == TOKEN_TYPE_TEXT: if tok_type == TOKEN_TYPE_TEXT:
if len(mode_tree) == 0: contents.append(tok_val)
tree.append(Text(tok_val, current_line))
else:
contents[-1].append(tok_val)
current_line += chunk.count('\n')
elif tok_type == TOKEN_TYPE_OPEN_MARKER: elif tok_type == TOKEN_TYPE_OPEN_MARKER:
mode_tree.append(_MODES[MARKERS[tok_val]]) contents.append(MarkerToken(False, MARKERS[tok_val]))
contents.append([])
elif tok_type == TOKEN_TYPE_CLOSE_MARKER: elif tok_type == TOKEN_TYPE_CLOSE_MARKER:
mode = _MODES[MARKERS[tok_val]] contents.append(MarkerToken(True, MARKERS[tok_val]))
matching_mode = mode_tree.pop()
assert mode == matching_mode
value = mode(contents.pop(), current_line)
current_line += chunk.count('\n')
if len(mode_tree) == 0: # Closed branch of tree return [Text(contents, current_line)]
tree.append(value)
else:
print("{} <- {}".format(mode_tree[-1], mode))
contents[-1].append(value)
current_line += chunk.count('\n')
if len(tree) > 3:
print("L", len(tree))
print("F:", tree)
return tree
def parse_headline(hl) -> Headline: def parse_headline(hl) -> Headline:
stars = hl['orig'].group('stars') stars = hl["orig"].group("stars")
depth = len(stars) depth = len(stars)
# TODO: Parse line for priority, cookies and tags # TODO: Parse line for priority, cookies and tags
line = hl['orig'].group('line') line = hl["orig"].group("line")
title = line.strip() title = line.strip()
contents = parse_contents(hl['contents']) contents = parse_contents(hl["contents"])
return Headline(start_line=hl['linenum'], return Headline(
depth=depth, start_line=hl["linenum"],
orig=hl['orig'], depth=depth,
title=title, orig=hl["orig"],
contents=contents, title=title,
children=[parse_headline(child) for child in hl['children']], contents=contents,
keywords=hl['keywords'], children=[parse_headline(child) for child in hl["children"]],
properties=hl['properties'], keywords=hl["keywords"],
structural=hl['structural'], properties=hl["properties"],
title_start=None, structural=hl["structural"],
priority=None, title_start=None,
priority_start=None, priority=None,
tags_start=None, priority_start=None,
tags=None, tags_start=None,
tags=None,
) )
@ -454,35 +455,41 @@ class OrgDom:
# Writing # Writing
def dump_kw(self, kw): def dump_kw(self, kw):
options = kw.match.group('options') options = kw.match.group("options")
if not options: if not options:
options = '' options = ""
return (kw.linenum, return (
'{indentation}#+{key}{options}:{spacing}{value}'.format( kw.linenum,
indentation=kw.match.group('indentation'), "{indentation}#+{key}{options}:{spacing}{value}".format(
key=kw.key, indentation=kw.match.group("indentation"),
options=kw.options, key=kw.key,
spacing=kw.match.group('spacing'), options=kw.options,
value=kw.value, spacing=kw.match.group("spacing"),
)) value=kw.value,
),
)
def dump_property(self, prop: Property): def dump_property(self, prop: Property):
plus = prop.match.group('plus') plus = prop.match.group("plus")
if plus is None: plus = '' if plus is None:
plus = ""
if isinstance(prop.value, Timestamp): if isinstance(prop.value, Timestamp):
value = timestamp_to_string(prop.value) value = timestamp_to_string(prop.value)
else: else:
value = prop.value value = prop.value
return (prop.linenum, '{indentation}:{key}{plus}:{spacing}{value}'.format( return (
indentation=prop.match.group('indentation'), prop.linenum,
key=prop.key, "{indentation}:{key}{plus}:{spacing}{value}".format(
plus=plus, indentation=prop.match.group("indentation"),
spacing=prop.match.group('spacing'), key=prop.key,
value=value, plus=plus,
)) spacing=prop.match.group("spacing"),
value=value,
),
)
def dump_contents(self, raw): def dump_contents(self, raw):
if isinstance(raw, RawLine): if isinstance(raw, RawLine):
@ -494,7 +501,9 @@ class OrgDom:
return (structural[0], structural[1]) return (structural[0], structural[1])
def dump_headline(self, headline): def dump_headline(self, headline):
yield '*' * headline.depth + ' ' + headline.orig.group('spacing') + headline.title yield "*" * headline.depth + " " + headline.orig.group(
"spacing"
) + headline.title
lines = [] lines = []
KW_T = 0 KW_T = 0
@ -523,21 +532,31 @@ class OrgDom:
if ltype == PROPERTIES_T and last_type not in (STRUCTURAL_T, PROPERTIES_T): if ltype == PROPERTIES_T and last_type not in (STRUCTURAL_T, PROPERTIES_T):
# No structural opening # No structural opening
structured_lines.append(' ' * content.index(':') + ':PROPERTIES:\n') structured_lines.append(" " * content.index(":") + ":PROPERTIES:\n")
logging.warning("Added structural: ".format(line[1][0], structured_lines[-1].strip())) logging.warning(
elif ltype not in (STRUCTURAL_T, PROPERTIES_T) and last_type == PROPERTIES_T: "Added structural: ".format(
line[1][0], structured_lines[-1].strip()
)
)
elif (
ltype not in (STRUCTURAL_T, PROPERTIES_T) and last_type == PROPERTIES_T
):
# No structural closing # No structural closing
last_line = lines[i - 1][1][1] last_line = lines[i - 1][1][1]
structured_lines.append(' ' * last_line.index(':') + ':END:\n') structured_lines.append(" " * last_line.index(":") + ":END:\n")
logging.warning("Added structural:{}: {}".format(line[1][0], structured_lines[-1].strip())) logging.warning(
"Added structural:{}: {}".format(
line[1][0], structured_lines[-1].strip()
)
)
elif ltype != CONTENT_T: elif ltype != CONTENT_T:
content = content + '\n' content = content + "\n"
last_type = ltype last_type = ltype
structured_lines.append(content) structured_lines.append(content)
yield ''.join(structured_lines) yield "".join(structured_lines)
for child in headline.children: for child in headline.children:
yield from self.dump_headline(child) yield from self.dump_headline(child)
@ -555,8 +574,8 @@ class OrgDom:
for headline in self.headlines: for headline in self.headlines:
yield from self.dump_headline(headline) yield from self.dump_headline(headline)
class OrgDomReader:
class OrgDomReader:
def __init__(self): def __init__(self):
self.headlines: List[Headline] = [] self.headlines: List[Headline] = []
self.keywords: List[Property] = [] self.keywords: List[Property] = []
@ -569,18 +588,18 @@ class OrgDomReader:
## Construction ## Construction
def add_headline(self, linenum: int, match: re.Match) -> int: def add_headline(self, linenum: int, match: re.Match) -> int:
# Position reader on the proper headline # Position reader on the proper headline
stars = match.group('stars') stars = match.group("stars")
depth = len(stars) depth = len(stars)
headline = { headline = {
'linenum': linenum, "linenum": linenum,
'orig': match, "orig": match,
'title': match.group('line'), "title": match.group("line"),
'contents': [], "contents": [],
'children': [], "children": [],
'keywords': [], "keywords": [],
'properties': [], "properties": [],
'structural': [], "structural": [],
} }
while (depth - 2) > len(self.headline_hierarchy): while (depth - 2) > len(self.headline_hierarchy):
@ -592,41 +611,46 @@ class OrgDomReader:
if depth == 1: if depth == 1:
self.headlines.append(headline) self.headlines.append(headline)
else: else:
self.headline_hierarchy[-1]['children'].append(headline) self.headline_hierarchy[-1]["children"].append(headline)
self.headline_hierarchy.append(headline) self.headline_hierarchy.append(headline)
def add_keyword_line(self, linenum: int, match: re.Match) -> int: def add_keyword_line(self, linenum: int, match: re.Match) -> int:
options = match.group('options') options = match.group("options")
kw = Keyword(linenum, match, match.group('key'), match.group('value'), options if options is not None else '') kw = Keyword(
linenum,
match,
match.group("key"),
match.group("value"),
options if options is not None else "",
)
if len(self.headline_hierarchy) == 0: if len(self.headline_hierarchy) == 0:
self.keywords.append(kw) self.keywords.append(kw)
else: else:
self.headline_hierarchy[-1]['keywords'].append(kw) self.headline_hierarchy[-1]["keywords"].append(kw)
def add_raw_line(self, linenum: int, line: str) -> int: def add_raw_line(self, linenum: int, line: str) -> int:
raw = RawLine(linenum, line) raw = RawLine(linenum, line)
if len(self.headline_hierarchy) == 0: if len(self.headline_hierarchy) == 0:
self.contents.append(raw) self.contents.append(raw)
else: else:
self.headline_hierarchy[-1]['contents'].append(raw) self.headline_hierarchy[-1]["contents"].append(raw)
def add_property_drawer_line(self, linenum: int, line: str, match: re.Match) -> int: def add_property_drawer_line(self, linenum: int, line: str, match: re.Match) -> int:
self.current_drawer = self.headline_hierarchy[-1]['properties'] self.current_drawer = self.headline_hierarchy[-1]["properties"]
self.headline_hierarchy[-1]['structural'].append((linenum, line)) self.headline_hierarchy[-1]["structural"].append((linenum, line))
def add_drawer_end_line(self, linenum: int, line: str, match: re.Match) -> int: def add_drawer_end_line(self, linenum: int, line: str, match: re.Match) -> int:
self.current_drawer = None self.current_drawer = None
self.headline_hierarchy[-1]['structural'].append((linenum, line)) self.headline_hierarchy[-1]["structural"].append((linenum, line))
def add_node_properties_line(self, linenum: int, match: re.Match) -> int: def add_node_properties_line(self, linenum: int, match: re.Match) -> int:
key = match.group('key') key = match.group("key")
value = match.group('value').strip() value = match.group("value").strip()
if (value.count('>--<') == 1) or (value.count(']--[') == 1): if (value.count(">--<") == 1) or (value.count("]--[") == 1):
# Time ranges with two different dates # Time ranges with two different dates
# @TODO properly consider "=> DURATION" section # @TODO properly consider "=> DURATION" section
chunks = value.split('=').split('--') chunks = value.split("=").split("--")
as_time_range = parse_org_time(chunks[0], chunks[1]) as_time_range = parse_org_time(chunks[0], chunks[1])
if (as_time_range[0] is not None) and (as_time_range[1] is not None): if (as_time_range[0] is not None) and (as_time_range[1] is not None):
value = TimeRange(as_time_range[0], as_time_range[1]) value = TimeRange(as_time_range[0], as_time_range[1])
@ -636,7 +660,7 @@ class OrgDomReader:
self.current_drawer.append(Property(linenum, match, key, value, None)) self.current_drawer.append(Property(linenum, match, key, value, None))
def read(self, s, environment): def read(self, s, environment):
lines = s.split('\n') lines = s.split("\n")
reader = enumerate(lines) reader = enumerate(lines)
for linenum, line in reader: for linenum, line in reader:
@ -653,7 +677,7 @@ class OrgDomReader:
elif m := NODE_PROPERTIES_RE.match(line): elif m := NODE_PROPERTIES_RE.match(line):
self.add_node_properties_line(linenum, m) self.add_node_properties_line(linenum, m)
else: else:
raise NotImplementedError('{}: {}'.format(linenum, line)) raise NotImplementedError("{}: {}".format(linenum, line))
def loads(s, environment=BASE_ENVIRONMENT, extra_cautious=False): def loads(s, environment=BASE_ENVIRONMENT, extra_cautious=False):
@ -662,7 +686,9 @@ def loads(s, environment=BASE_ENVIRONMENT, extra_cautious=False):
dom = doc.finalize() dom = doc.finalize()
if extra_cautious: # Check that all options can be properly re-serialized if extra_cautious: # Check that all options can be properly re-serialized
if dumps(dom) != s: if dumps(dom) != s:
raise NotImplementedError("Error re-serializing, file uses something not implemented") raise NotImplementedError(
"Error re-serializing, file uses something not implemented"
)
return dom return dom
@ -672,6 +698,6 @@ def load(f, environment=BASE_ENVIRONMENT, extra_cautious=False):
def dumps(doc): def dumps(doc):
dump = list(doc.dump()) dump = list(doc.dump())
result = '\n'.join(dump) result = "\n".join(dump)
print(result) print(result)
return result return result

View File

@ -1,4 +1,6 @@
from .org_dom import Headline, Line, RawLine, Text, Bold, Code, Italic, Strike, Underlined, Verbatim from .org_dom import (Bold, Code, Headline, Italic, Line, RawLine, Strike,
Text, Underlined, Verbatim)
def get_hl_raw_contents(doc: Headline) -> str: def get_hl_raw_contents(doc: Headline) -> str:
lines = [] lines = []

View File

@ -22,6 +22,8 @@
This is a nested *bold =verbatim /italic +strike _underlined ~code .~ ._ .+ ./ .= .* This is a nested *bold =verbatim /italic +strike _underlined ~code .~ ._ .+ ./ .= .*
This is a interleaved *bold =verbatim /italic +strike _underlined ~code .* .= ./ .+ ._ .~
This is a _ non-underlined phrase because an incorrectly placed content _. This is a _ non-underlined phrase because an incorrectly placed content _.
This is a _ non-underlined phrase because an incorrectly placed content beginning_. This is a _ non-underlined phrase because an incorrectly placed content beginning_.

View File

@ -5,7 +5,7 @@ from datetime import datetime as DT
from org_dom import dumps, load, loads from org_dom import dumps, load, loads
from utils.dom_assertions import (BOLD, CODE, HL, ITALIC, SPAN, STRIKE, from utils.dom_assertions import (BOLD, CODE, HL, ITALIC, SPAN, STRIKE,
UNDERLINED, VERBATIM, WEB_LINK, Dom,) UNDERLINED, VERBATIM, WEB_LINK, Dom, Tokens)
DIR = os.path.dirname(os.path.abspath(__file__)) DIR = os.path.dirname(os.path.abspath(__file__))
@ -47,7 +47,7 @@ class TestSerde(unittest.TestCase):
self.assertEqual(dumps(doc), orig) self.assertEqual(dumps(doc), orig)
def test_markup_file_02(self): def test_markup_file_02(self):
self.maxDiff = 1024 self.maxDiff = 10000
with open(os.path.join(DIR, '02-markup.org')) as f: with open(os.path.join(DIR, '02-markup.org')) as f:
doc = load(f) doc = load(f)
@ -82,11 +82,33 @@ class TestSerde(unittest.TestCase):
SPAN(" This is a nested ", BOLD(["bold ", VERBATIM(["verbatim ", ITALIC(["italic ", STRIKE(["strike ", UNDERLINED(["underlined ", CODE("code ."), " ."]), " ."]), " ."]), " ."]), " ."])), SPAN(" This is a nested ", BOLD(["bold ", VERBATIM(["verbatim ", ITALIC(["italic ", STRIKE(["strike ", UNDERLINED(["underlined ", CODE("code ."), " ."]), " ."]), " ."]), " ."]), " ."])),
SPAN("\n"), SPAN("\n"),
# SPAN(""), SPAN("\n"),
# # TODO: THIS IS INTERLEAVED, not nested # THIS IS INTERLEAVED, not nested
# In ORG: This is a interleaved *bold =verbatim /italic +strike _underlined ~code .* .= ./ .+ ._ .~ SPAN([" This is a interleaved ",
# SPAN(" This is a nested ", BOLD(["bold ", VERBATIM(["verbatim ", ITALIC(["italic ", STRIKE(["strike ", UNDERLINED(["underlined ", CODE("code ."), " ."]), " ."]), " ."]), " ."]), " ."])), Tokens.BOLD_START,
# SPAN(""), "bold ",
Tokens.VERBATIM_START,
"verbatim ",
Tokens.ITALIC_START,
"italic ",
Tokens.STRIKE_START,
"strike ",
Tokens.UNDERLINED_START,
"underlined ",
Tokens.CODE_START,
"code .",
Tokens.BOLD_END,
" .",
Tokens.VERBATIM_END,
" .",
Tokens.ITALIC_END,
" .",
Tokens.STRIKE_END,
" .",
Tokens.UNDERLINED_END,
" .",
Tokens.CODE_END,
"\n"]),
SPAN("\n"), SPAN("\n"),
SPAN(" This is a _ non-underlined phrase because an incorrectly placed content _.\n"), SPAN(" This is a _ non-underlined phrase because an incorrectly placed content _.\n"),

View File

@ -2,7 +2,8 @@ import collections
import unittest import unittest
from datetime import datetime from datetime import datetime
from org_dom import Line, Text, Bold, Code, Italic, Strike, Underlined, Verbatim, get_raw_contents from org_dom import (Bold, Code, Italic, Line, Strike, Text, Underlined,
Verbatim, get_raw_contents)
def timestamp_to_datetime(ts): def timestamp_to_datetime(ts):
@ -13,7 +14,7 @@ def get_raw(doc):
if isinstance(doc, str): if isinstance(doc, str):
return doc return doc
elif isinstance(doc, list): elif isinstance(doc, list):
return ''.join([get_raw(e) for e in doc]) return "".join([get_raw(e) for e in doc])
else: else:
return doc.get_raw() return doc.get_raw()
@ -44,8 +45,7 @@ class Dom:
test_case.assertEqual(len(doc.getTopHeadlines()), 0, "Top") test_case.assertEqual(len(doc.getTopHeadlines()), 0, "Top")
else: else:
doc_headlines = doc.getTopHeadlines() doc_headlines = doc.getTopHeadlines()
test_case.assertEqual(len(doc_headlines), len(self.children), test_case.assertEqual(len(doc_headlines), len(self.children), "Top")
"Top")
for i, children in enumerate(self.children): for i, children in enumerate(self.children):
children.assert_matches(test_case, doc_headlines[i]) children.assert_matches(test_case, doc_headlines[i])
@ -72,7 +72,8 @@ class HL:
test_case.assertEqual(doc_props[i].key, prop[0]) test_case.assertEqual(doc_props[i].key, prop[0])
if isinstance(prop[1], datetime): if isinstance(prop[1], datetime):
test_case.assertEqual( test_case.assertEqual(
timestamp_to_datetime(doc_props[i].value), prop[1]) timestamp_to_datetime(doc_props[i].value), prop[1]
)
test_case.assertEqual(get_raw_contents(doc), self.get_raw()) test_case.assertEqual(get_raw_contents(doc), self.get_raw())
@ -81,14 +82,13 @@ class HL:
test_case.assertEqual(len(doc.children), 0) test_case.assertEqual(len(doc.children), 0)
else: else:
doc_headlines = doc.children doc_headlines = doc.children
test_case.assertEqual(len(doc_headlines), len(self.children), test_case.assertEqual(len(doc_headlines), len(self.children), self.title)
self.title)
for i, children in enumerate(self.children): for i, children in enumerate(self.children):
children.assert_matches(test_case, doc_headlines[i]) children.assert_matches(test_case, doc_headlines[i])
def get_raw(self): def get_raw(self):
return ''.join(map(get_raw, self.content)) return "".join(map(get_raw, self.content))
class SPAN: class SPAN:
@ -100,10 +100,16 @@ class SPAN:
for section in self.contents: for section in self.contents:
if isinstance(section, str): if isinstance(section, str):
chunks.append(section) chunks.append(section)
elif isinstance(section, list):
for subsection in section:
if isinstance(subsection, str):
chunks.append(subsection)
else:
chunks.append(subsection.get_raw())
else: else:
chunks.append(section.get_raw()) chunks.append(section.get_raw())
return ''.join(chunks) return "".join(chunks)
def assert_matches(self, test_case, doc): def assert_matches(self, test_case, doc):
if not isinstance(doc, Line): if not isinstance(doc, Line):
@ -121,7 +127,7 @@ class BOLD:
self.text = text self.text = text
def get_raw(self): def get_raw(self):
return '*{}*'.format(get_raw(self.text)) return "*{}*".format(get_raw(self.text))
def assertEqual(self, test_case, other): def assertEqual(self, test_case, other):
test_case.assertTrue(isinstance(other, Bold)) test_case.assertTrue(isinstance(other, Bold))
@ -133,29 +139,31 @@ class CODE:
self.text = text self.text = text
def get_raw(self): def get_raw(self):
return '~{}~'.format(get_raw(self.text)) return "~{}~".format(get_raw(self.text))
def assertEqual(self, test_case, other): def assertEqual(self, test_case, other):
test_case.assertTrue(isinstance(other, Code)) test_case.assertTrue(isinstance(other, Code))
test_case.assertEqual(self.text, other.contents) test_case.assertEqual(self.text, other.contents)
class ITALIC:
    """Expected italic markup (``/text/``) used when checking a parsed document.

    ``text`` is stored as given; elsewhere in this file it is either a plain
    string or a list mixing strings and nested expectation objects.
    """

    def __init__(self, text):
        self.text = text

    def get_raw(self):
        """Return the raw contents of ``text`` wrapped in ``/`` delimiters."""
        # ``get_raw`` below is the module-level helper, not this method.
        return "/{}/".format(get_raw(self.text))

    def assertEqual(self, test_case, other):
        """Check that ``other`` is an ``Italic`` whose contents equal ``text``.

        ``test_case`` is expected to be a ``unittest.TestCase``; failures are
        reported through its assertion methods.
        """
        # assertIsInstance reports the offending object on failure, unlike
        # assertTrue(isinstance(...)) which only says "False is not true".
        test_case.assertIsInstance(other, Italic)
        test_case.assertEqual(self.text, other.contents)
class STRIKE: class STRIKE:
def __init__(self, text): def __init__(self, text):
self.text = text self.text = text
def get_raw(self): def get_raw(self):
return '+{}+'.format(get_raw(self.text)) return "+{}+".format(get_raw(self.text))
def assertEqual(self, test_case, other): def assertEqual(self, test_case, other):
test_case.assertTrue(isinstance(other, Strike)) test_case.assertTrue(isinstance(other, Strike))
@ -167,32 +175,54 @@ class UNDERLINED:
self.text = text self.text = text
def get_raw(self): def get_raw(self):
return '_{}_'.format(get_raw(self.text)) return "_{}_".format(get_raw(self.text))
def assertEqual(self, test_case, other): def assertEqual(self, test_case, other):
test_case.assertTrue(isinstance(other, Underlined)) test_case.assertTrue(isinstance(other, Underlined))
test_case.assertEqual(self.text, other.contents) test_case.assertEqual(self.text, other.contents)
class VERBATIM:
    """Expected verbatim markup (``=text=``) used when checking a parsed document.

    ``text`` is stored as given; elsewhere in this file it is either a plain
    string or a list mixing strings and nested expectation objects.
    """

    def __init__(self, text):
        self.text = text

    def get_raw(self):
        """Return the raw contents of ``text`` wrapped in ``=`` delimiters."""
        # ``get_raw`` below is the module-level helper, not this method.
        return "={}=".format(get_raw(self.text))

    def assertEqual(self, test_case, other):
        """Check that ``other`` is a ``Verbatim`` whose contents equal ``text``.

        ``test_case`` is expected to be a ``unittest.TestCase``; failures are
        reported through its assertion methods.
        """
        # assertIsInstance reports the offending object on failure, unlike
        # assertTrue(isinstance(...)) which only says "False is not true".
        test_case.assertIsInstance(other, Verbatim)
        test_case.assertEqual(self.text, other.contents)
class WEB_LINK:
    """Expected web link (``[[link][text]]``) used when checking a parsed document.

    ``text`` is the link description and ``link`` its target; both are stored
    as given.
    """

    def __init__(self, text, link):
        self.text = text
        self.link = link

    def get_raw(self):
        """Return the link in bracket form: target first, then description."""
        return "[[{}][{}]]".format(self.link, self.text)

    def assertEqual(self, test_case, other):
        """Check that ``other`` is a ``WebLink`` with matching contents and link.

        ``test_case`` is expected to be a ``unittest.TestCase``; failures are
        reported through its assertion methods.
        """
        # NOTE(review): ``WebLink`` is not among the names imported from
        # ``org_dom`` in the import statement visible in this file -- confirm it
        # is imported (and exported by ``org_dom``), otherwise this raises
        # NameError.
        test_case.assertIsInstance(other, WebLink)
        test_case.assertEqual(self.text, other.contents)
        test_case.assertEqual(self.link, other.link)
class Tokens:
    """Markup delimiter strings for writing expected markup as a flat sequence.

    Each markup kind has a ``*_START`` and a ``*_END`` name. Both hold the same
    delimiter string, so the two names only record intent at the point of use.
    They are plain ``str`` values so they can sit directly in a list of text
    chunks.
    """

    BOLD_START = "*"
    BOLD_END = "*"
    VERBATIM_START = "="
    VERBATIM_END = "="
    ITALIC_START = "/"
    ITALIC_END = "/"
    STRIKE_START = "+"
    STRIKE_END = "+"
    UNDERLINED_START = "_"
    UNDERLINED_END = "_"
    CODE_START = "~"
    CODE_END = "~"