Add simple support for nested markup.

This commit is contained in:
Sergio Martínez Portela 2020-10-25 20:23:08 +01:00
parent 5b886e5e24
commit f6de69fd90
6 changed files with 356 additions and 121 deletions

View File

@ -109,6 +109,11 @@ def timestamp_to_string(ts):
else: else:
return '[{}]'.format(base) return '[{}]'.format(base)
def get_raw(doc):
    """Return the raw org-mode text for ``doc``.

    Strings are returned unchanged, lists and tuples are flattened by
    concatenating the raw text of each element, and any other object is
    asked for its text through its own ``get_raw()`` method.
    """
    if isinstance(doc, str):
        return doc
    if isinstance(doc, (list, tuple)):
        # Sequences may mix plain strings and nested markup nodes.
        return ''.join(get_raw(element) for element in doc)
    return doc.get_raw()
class Line: class Line:
def __init__(self, linenum, contents): def __init__(self, linenum, contents):
@ -125,73 +130,190 @@ class Line:
return ''.join(rawchunks) + '\n' return ''.join(rawchunks) + '\n'
class Text:
    """Run of text that carries no markup."""

    def __init__(self, contents, line):
        self.linenum = line
        self.contents = contents

    def get_raw(self):
        """Return the stored contents joined into a single string."""
        return ''.join(self.contents)
class Bold:
    """Bold markup node (``*text*``).

    ``contents`` is an iterable of child pieces; each piece is either a
    plain string or an object exposing ``get_raw()``.
    """

    Marker = '*'

    def __init__(self, contents, line):
        self.contents = contents
        # Keep the line number, as Text does, so code that reads `.linenum`
        # (e.g. OrgDom.dump_contents) also works for markup nodes.
        self.linenum = line

    def get_raw(self):
        """Return the children's raw text wrapped in the marker."""
        raw = ''.join(map(get_raw, self.contents))
        return f"{self.Marker}{raw}{self.Marker}"
class Code:
    """Code markup node (``~text~``).

    ``contents`` is an iterable of child pieces; each piece is either a
    plain string or an object exposing ``get_raw()``.
    """

    Marker = '~'

    def __init__(self, contents, line):
        self.contents = contents
        # Keep the line number, as Text does, so code that reads `.linenum`
        # (e.g. OrgDom.dump_contents) also works for markup nodes.
        self.linenum = line

    def get_raw(self):
        """Return the children's raw text wrapped in the marker."""
        raw = ''.join(map(get_raw, self.contents))
        return f"{self.Marker}{raw}{self.Marker}"
class Italic:
    """Italic markup node (``/text/``).

    ``contents`` is an iterable of child pieces; each piece is either a
    plain string or an object exposing ``get_raw()``.
    """

    Marker = '/'

    def __init__(self, contents, line):
        self.contents = contents
        # Keep the line number, as Text does, so code that reads `.linenum`
        # (e.g. OrgDom.dump_contents) also works for markup nodes.
        self.linenum = line

    def get_raw(self):
        """Return the children's raw text wrapped in the marker."""
        raw = ''.join(map(get_raw, self.contents))
        return f"{self.Marker}{raw}{self.Marker}"
class Strike:
    """Strike-through markup node (``+text+``).

    ``contents`` is an iterable of child pieces; each piece is either a
    plain string or an object exposing ``get_raw()``.
    """

    Marker = '+'

    def __init__(self, contents, line):
        self.contents = contents
        # Keep the line number, as Text does, so code that reads `.linenum`
        # (e.g. OrgDom.dump_contents) also works for markup nodes.
        self.linenum = line

    def get_raw(self):
        """Return the children's raw text wrapped in the marker."""
        raw = ''.join(map(get_raw, self.contents))
        return f"{self.Marker}{raw}{self.Marker}"
class Underlined:
    """Underlined markup node (``_text_``).

    ``contents`` is an iterable of child pieces; each piece is either a
    plain string or an object exposing ``get_raw()``.
    """

    Marker = '_'

    def __init__(self, contents, line):
        self.contents = contents
        # Keep the line number, as Text does, so code that reads `.linenum`
        # (e.g. OrgDom.dump_contents) also works for markup nodes.
        self.linenum = line

    def get_raw(self):
        """Return the children's raw text wrapped in the marker."""
        raw = ''.join(map(get_raw, self.contents))
        return f"{self.Marker}{raw}{self.Marker}"
class Verbatim:
    """Verbatim markup node (``=text=``).

    ``contents`` is an iterable of child pieces; each piece is either a
    plain string or an object exposing ``get_raw()``.
    """

    Marker = '='

    def __init__(self, contents, line):
        self.contents = contents
        # Keep the line number, as Text does, so code that reads `.linenum`
        # (e.g. OrgDom.dump_contents) also works for markup nodes.
        self.linenum = line

    def get_raw(self):
        """Return the children's raw text wrapped in the marker."""
        raw = ''.join(map(get_raw, self.contents))
        return f"{self.Marker}{raw}{self.Marker}"
def is_pre(char: str) -> bool:
    """Tell whether ``char`` may immediately precede an opening marker.

    Non-string values (such as the ``None`` used when there is no previous
    character) count as a valid predecessor.
    """
    if not isinstance(char, str):
        return True
    # `in` on a str is a substring test: without the length check the empty
    # string and multi-character runs such as ' -' would also match.
    return len(char) == 1 and char in '\n\r\t -({\'"'
def is_marker(char: str) -> bool:
    """Tell whether ``char`` is one of the markup delimiters ``* = / + _ ~``.

    Non-string values are never markers.
    """
    if not isinstance(char, str):
        return False
    # `in` on a str is a substring test: without the length check the empty
    # string and runs such as '*=' would also be reported as markers.
    return len(char) == 1 and char in '*=/+_~'
def is_border(char: str) -> bool:
    """Tell whether ``char`` can sit right next to a marker, inside the span.

    Any single character other than newline, carriage return, tab or space
    qualifies. Non-string values never do.
    """
    if not isinstance(char, str):
        return False
    # `not in` on a str is a substring test: without the length check a run
    # such as two spaces would be accepted because it is not a substring.
    return len(char) == 1 and char not in '\n\r\t '
def is_body(char: str) -> bool:
    """Tell whether ``char`` counts as body text: true for any string."""
    return isinstance(char, str)
def is_post(char: str) -> bool:
    """Tell whether ``char`` is in the set allowed right after a closing marker.

    Non-string values are rejected.
    """
    if not isinstance(char, str):
        return False
    # `in` on a str is a substring test: without the length check the empty
    # string and runs such as '.,' would also match.
    return len(char) == 1 and char in '-.,;:!?\')}["'
# Token kinds emitted by tokenize_contents(); every token is a
# (kind, value) tuple.
TOKEN_TYPE_TEXT = 0  # value: a run of plain text
TOKEN_TYPE_OPEN_MARKER = 1  # value: the marker character opening a span
TOKEN_TYPE_CLOSE_MARKER = 2  # value: the marker character closing a span
def tokenize_contents(contents: str):
    """Split ``contents`` into plain-text and markup-marker tokens.

    Returns a list of ``(token_type, value)`` tuples in input order, where
    ``token_type`` is ``TOKEN_TYPE_TEXT`` (value: a run of text),
    ``TOKEN_TYPE_OPEN_MARKER`` or ``TOKEN_TYPE_CLOSE_MARKER`` (value: the
    marker character).

    A marker character opens a span only when all of these hold:

    * the previous character satisfies ``is_pre`` (or there is none);
    * the next character satisfies ``is_border``;
    * a matching closing marker, directly preceded by an ``is_border``
      character, is found later without crossing an empty line.

    Characters that do not form a valid span are emitted as text.
    """
    tokens = []
    text = []         # characters of the text run being accumulated
    closes = set()    # indexes already reserved as closing markers
    last_char = None  # None marks the start of the input for is_pre()

    def flush_text():
        # Emit the pending text run, if any, before a marker token.
        if text:
            tokens.append((TOKEN_TYPE_TEXT, ''.join(text)))
            text.clear()

    def claim_close(start, marker):
        # Look for the closer of the opener at `start`; reserve it if found.
        text_in_line = True
        # Start after the opener: scanning from `start` itself let the
        # opener act as its own border, so '**' became an empty span.
        for j in range(start + 1, len(contents) - 1):
            if contents[j] == '\n':
                if not text_in_line:
                    return False  # empty line: the span cannot continue
                text_in_line = False
            elif (is_border(contents[j])
                  and contents[j + 1] == marker
                  # A closer reserved by an earlier opener must not be
                  # reused; that yields two opens for a single close.
                  and (j + 1) not in closes):
                closes.add(j + 1)
                return True
            else:
                text_in_line |= is_body(contents[j])
        return False

    for i, char in enumerate(contents):
        if i in closes:
            flush_text()
            tokens.append((TOKEN_TYPE_CLOSE_MARKER, char))
        elif (is_marker(char)
              and is_pre(last_char)
              and i + 1 < len(contents)
              and is_border(contents[i + 1])
              and claim_close(i, char)):
            flush_text()
            tokens.append((TOKEN_TYPE_OPEN_MARKER, char))
        else:
            text.append(char)
        last_char = char

    flush_text()
    return tokens
def parse_contents(raw_contents:List[RawLine]): def parse_contents(raw_contents:List[RawLine]):
NO_MODE = 0 NO_MODE = 0b0
BOLD_MODE = 1 BOLD_MODE = 0b1
CODE_MODE = 2 CODE_MODE = 0b10
ITALIC_MODE = 3 ITALIC_MODE = 0b100
STRIKE_MODE = 4 STRIKE_MODE = 0b1000
UNDERLINED_MODE = 5 UNDERLINED_MODE = 0b10000
VERBATIM_MODE = 6 VERBATIM_MODE = 0b100000
MODE_CLASS = { MARKERS = {
NO_MODE: Line, '*': BOLD_MODE,
'~': CODE_MODE,
'/': ITALIC_MODE,
'+': STRIKE_MODE,
'_': UNDERLINED_MODE,
'=': VERBATIM_MODE,
}
MODES = (
(BOLD_MODE, Bold),
(CODE_MODE, Code),
(ITALIC_MODE, Italic),
(STRIKE_MODE, Strike),
(UNDERLINED_MODE, Underlined),
(VERBATIM_MODE, Verbatim),
)
_MODES = {
BOLD_MODE: Bold, BOLD_MODE: Bold,
CODE_MODE: Code, CODE_MODE: Code,
ITALIC_MODE: Italic, ITALIC_MODE: Italic,
@ -213,68 +335,80 @@ def parse_contents(raw_contents:List[RawLine]):
contents_buff.append(line.line) contents_buff.append(line.line)
contents = '\n'.join(contents_buff) contents = '\n'.join(contents_buff)
tokens = tokenize_contents(contents)
for c in contents: # Use tokens to tag chunks of text with it's container type
if mode == NO_MODE: for (tok_type, tok_val) in tokens:
if escaped: if tok_type == TOKEN_TYPE_TEXT:
chunk.append(c) chunks.append((mode, tok_val))
escaped = False elif tok_type == TOKEN_TYPE_OPEN_MARKER:
mode = mode | MARKERS[tok_val]
elif tok_type == TOKEN_TYPE_OPEN_MARKER:
mode = mode ^ MARKERS[tok_val]
elif c == '\\': # Convert those chunks to a tree
escaped = True def tree_for_tag(tag, in_mode):
elif c == '*': tree = []
mode = BOLD_MODE for (mask, mode) in MODES:
elif c == '~': if (mask & tag) and not (mask & in_mode):
mode = CODE_MODE tree.append(mode)
elif c == '/': print(tree)
mode = ITALIC_MODE if len(tree) == 0:
elif c == '+': return Text
mode = STRIKE_MODE
elif c == '_':
mode = UNDERLINED_MODE if len(raw_contents) > 0:
elif c == '=': current_line = raw_contents[0].linenum
mode = VERBATIM_MODE
elif c == '\n': # tree = []
chunks.append(Line(linenum, inline + [Text(chunk)])) # pos = []
chunk = [] # print('\n'.join(map(str, chunks)))
inline = [] # for (tag, chunk) in chunks:
# if pos == []:
# tree.append(tree_for_tag(tag, NO_MODE)(chunk, line=current_line))
# pos.append(tree[-1])
# else:
# raise NotImplementedError()
# current_line += chunk.count('\n')
tree = []
mode_tree = []
contents = []
# Use tokens to tag chunks of text with its container type
for (tok_type, tok_val) in tokens:
if tok_type == TOKEN_TYPE_TEXT:
if len(mode_tree) == 0:
tree.append(Text(tok_val, current_line))
else: else:
chunk.append(c) contents[-1].append(tok_val)
if mode != NO_MODE: current_line += chunk.count('\n')
inline.append(Text([''.join(chunk)]))
chunk = []
else:
if escaped:
chunk.append(c)
escaped = False
was_mode = mode elif tok_type == TOKEN_TYPE_OPEN_MARKER:
if mode == BOLD_MODE and c == '*': mode_tree.append(_MODES[MARKERS[tok_val]])
mode = NO_MODE contents.append([])
elif mode == CODE_MODE and c == '~':
mode = NO_MODE elif tok_type == TOKEN_TYPE_CLOSE_MARKER:
elif mode == ITALIC_MODE and c == '/': mode = _MODES[MARKERS[tok_val]]
mode = NO_MODE matching_mode = mode_tree.pop()
elif mode == STRIKE_MODE and c == '+': assert mode == matching_mode
mode = NO_MODE value = mode(contents.pop(), current_line)
elif mode == UNDERLINED_MODE and c == '_': current_line += chunk.count('\n')
mode = NO_MODE
elif mode == VERBATIM_MODE and c == '=': if len(mode_tree) == 0: # Closed branch of tree
mode = NO_MODE tree.append(value)
elif c == '\n':
raise NotImplementedError("[{} | {}]".format(c, chunk))
else: else:
chunk.append(c) print("{} <- {}".format(mode_tree[-1], mode))
contents[-1].append(value)
if mode == NO_MODE: current_line += chunk.count('\n')
inline.append(MODE_CLASS[was_mode](''.join(chunk)))
chunk = []
assert(len(chunk) == 0) if len(tree) > 3:
assert(len(inline) == 0) print("L", len(tree))
print("F:", tree)
return chunks return tree
def parse_headline(hl) -> Headline: def parse_headline(hl) -> Headline:
stars = hl['orig'].group('stars') stars = hl['orig'].group('stars')
@ -353,8 +487,8 @@ class OrgDom:
def dump_contents(self, raw):
    """Return a ``(line number, text)`` pair for ``raw``.

    A ``RawLine`` contributes its ``line`` attribute; any other object
    contributes the result of its ``get_raw()`` method.
    """
    text = raw.line if isinstance(raw, RawLine) else raw.get_raw()
    return (raw.linenum, text)
def dump_structural(self, structural: Tuple): def dump_structural(self, structural: Tuple):
return (structural[0], structural[1]) return (structural[0], structural[1])

View File

@ -1,5 +1,4 @@
from .org_dom import Headline, Line, RawLine from .org_dom import Headline, Line, RawLine, Text, Bold, Code, Italic, Strike, Underlined, Verbatim
def get_hl_raw_contents(doc: Headline) -> str: def get_hl_raw_contents(doc: Headline) -> str:
lines = [] lines = []
@ -18,6 +17,8 @@ def get_rawline_contents(doc: RawLine) -> str:
def get_span_contents(doc: Line) -> str:
    """Return the raw text of a ``Line`` by delegating to its ``get_raw()``."""
    raw = doc.get_raw()
    return raw
def get_text_contents(doc: Text) -> str:
    """Return the raw text of a ``Text`` by delegating to its ``get_raw()``."""
    raw = doc.get_raw()
    return raw
def get_raw_contents(doc) -> str: def get_raw_contents(doc) -> str:
if isinstance(doc, Headline): if isinstance(doc, Headline):
@ -28,4 +29,7 @@ def get_raw_contents(doc) -> str:
return get_span_contents(doc) return get_span_contents(doc)
if isinstance(doc, list): if isinstance(doc, list):
return ''.join([get_raw_contents(chunk) for chunk in doc]) return ''.join([get_raw_contents(chunk) for chunk in doc])
if isinstance(doc, (Text, Bold, Code, Italic, Strike, Underlined, Verbatim)):
return doc.get_raw()
print('Unhandled type: ' + str(doc))
raise NotImplementedError('Unhandled type: ' + str(doc)) raise NotImplementedError('Unhandled type: ' + str(doc))

View File

@ -19,3 +19,18 @@
This is a _underlined phrase_. This is a _underlined phrase_.
This is a ~code phrase~. This is a ~code phrase~.
This is a nested *bold =verbatim /italic +strike _underlined ~code .~ ._ .+ ./ .= .*
This is a _ non-underlined phrase because an incorrectly placed content _.
This is a _ non-underlined phrase because an incorrectly placed content beginning_.
This is a _non-underlined phrase because an incorrectly placed content end _.
This is a _non-underlined phrase because the lack of an end.
This is a _non-underlined phrase because an empty line between beginning and
end._

13
tests/03-links.org Normal file
View File

@ -0,0 +1,13 @@
#+TITLE: 03-Links
#+DESCRIPTION: Simple org file to test links
#+TODO: TODO(t) PAUSED(p) | DONE(d)
* First level
:PROPERTIES:
:ID: 03-markup-first-level-id
:CREATED: [2020-01-01 Wed 01:01]
:END:
This is a [[https://codigoparallevar.com][web link]].
This is an /italized [[https://codigoparallevar.com][web link]]/.

View File

@ -5,7 +5,7 @@ from datetime import datetime as DT
from org_dom import dumps, load, loads from org_dom import dumps, load, loads
from utils.dom_assertions import (BOLD, CODE, HL, ITALIC, SPAN, STRIKE, from utils.dom_assertions import (BOLD, CODE, HL, ITALIC, SPAN, STRIKE,
UNDERLINED, VERBATIM, Dom) UNDERLINED, VERBATIM, WEB_LINK, Dom,)
DIR = os.path.dirname(os.path.abspath(__file__)) DIR = os.path.dirname(os.path.abspath(__file__))
@ -47,6 +47,7 @@ class TestSerde(unittest.TestCase):
self.assertEqual(dumps(doc), orig) self.assertEqual(dumps(doc), orig)
def test_markup_file_02(self): def test_markup_file_02(self):
self.maxDiff = 1024
with open(os.path.join(DIR, '02-markup.org')) as f: with open(os.path.join(DIR, '02-markup.org')) as f:
doc = load(f) doc = load(f)
@ -60,22 +61,73 @@ class TestSerde(unittest.TestCase):
], ],
content=[ content=[
SPAN(" This is a ", BOLD("bold phrase"), SPAN(" This is a ", BOLD("bold phrase"),
"."), ".\n"),
SPAN(""), SPAN("\n"),
SPAN(" This is a ", SPAN(" This is a ",
VERBATIM("verbatim phrase"), "."), VERBATIM("verbatim phrase"), ".\n"),
SPAN(""), SPAN("\n"),
SPAN(" This is a ", ITALIC("italic phrase"), SPAN(" This is a ", ITALIC("italic phrase"),
"."), ".\n"),
SPAN(""), SPAN("\n"),
SPAN(" This is a ", SPAN(" This is a ",
STRIKE("strike-through phrase"), "."), STRIKE("strike-through phrase"), ".\n"),
SPAN(""), SPAN("\n"),
SPAN(" This is a ", SPAN(" This is a ",
UNDERLINED("underlined phrase"), "."), UNDERLINED("underlined phrase"), ".\n"),
SPAN(""), SPAN("\n"),
SPAN(" This is a ", CODE("code phrase"), SPAN(" This is a ", CODE("code phrase"),
"."), ".\n"),
SPAN("\n"),
SPAN(" This is a nested ", BOLD(["bold ", VERBATIM(["verbatim ", ITALIC(["italic ", STRIKE(["strike ", UNDERLINED(["underlined ", CODE("code ."), " ."]), " ."]), " ."]), " ."]), " ."])),
SPAN("\n"),
# SPAN(""),
# # TODO: THIS IS INTERLEAVED, not nested
# In ORG: This is a interleaved *bold =verbatim /italic +strike _underlined ~code .* .= ./ .+ ._ .~
# SPAN(" This is a nested ", BOLD(["bold ", VERBATIM(["verbatim ", ITALIC(["italic ", STRIKE(["strike ", UNDERLINED(["underlined ", CODE("code ."), " ."]), " ."]), " ."]), " ."]), " ."])),
# SPAN(""),
SPAN("\n"),
SPAN(" This is a _ non-underlined phrase because an incorrectly placed content _.\n"),
SPAN("\n"),
SPAN(" This is a _ non-underlined phrase because an incorrectly placed content beginning_.\n"),
SPAN("\n"),
SPAN(""),
SPAN(" This is a _non-underlined phrase because an incorrectly placed content end _.\n"),
SPAN("\n"),
SPAN(""),
SPAN(" This is a _non-underlined phrase because the lack of an end.\n"),
SPAN("\n"),
SPAN("\n"),
SPAN(" This is a _non-underlined phrase because an empty line between beginning and\n"),
SPAN("\n"),
SPAN(""),
SPAN(" end._\n"),
]))) ])))
ex.assert_matches(self, doc) ex.assert_matches(self, doc)
# def test_links_file_03(self):
# with open(os.path.join(DIR, '03-links.org')) as f:
# doc = load(f)
# ex = Dom(props=[('TITLE', '03-Links'),
# ('DESCRIPTION', 'Simple org file to test links'),
# ('TODO', 'TODO(t) PAUSED(p) | DONE(d)')],
# children=(HL('First level',
# props=[
# ('ID', '03-markup-first-level-id'),
# ('CREATED', DT(2020, 1, 1, 1, 1)),
# ],
# content=[
# SPAN(" This is a ", WEB_LINK("web link", "https://codigoparallevar.com"),
# "."),
# ])))
# ex.assert_matches(self, doc)

View File

@ -9,6 +9,15 @@ def timestamp_to_datetime(ts):
return datetime(ts.year, ts.month, ts.day, ts.hour, ts.minute) return datetime(ts.year, ts.month, ts.day, ts.hour, ts.minute)
def get_raw(doc):
    """Return the raw org-mode text an expectation stands for.

    Strings are returned unchanged, lists and tuples are flattened by
    concatenating the raw text of each element, and any other object is
    asked for its text through its own ``get_raw()`` method.
    """
    if isinstance(doc, str):
        return doc
    # Tuples are accepted as well as lists: SPAN keeps its sections in the
    # tuple produced by its variadic constructor.
    if isinstance(doc, (list, tuple)):
        return ''.join(get_raw(item) for item in doc)
    return doc.get_raw()
class Dom: class Dom:
def __init__(self, *, props=None, children=None): def __init__(self, *, props=None, children=None):
self.props = props self.props = props
@ -65,15 +74,7 @@ class HL:
test_case.assertEqual( test_case.assertEqual(
timestamp_to_datetime(doc_props[i].value), prop[1]) timestamp_to_datetime(doc_props[i].value), prop[1])
if isinstance(self.content, str): test_case.assertEqual(get_raw_contents(doc), self.get_raw())
test_case.assertEqual(get_raw_contents(doc), self.content)
else:
if len(doc.contents) != len(self.content):
print("Contents:", doc.contents)
print("Expected:", self.content)
test_case.assertEqual(len(doc.contents), len(self.content))
for i, content in enumerate(self.content):
content.assert_matches(test_case, doc.contents[i])
# Check children # Check children
if self.children is None: if self.children is None:
@ -86,18 +87,21 @@ class HL:
for i, children in enumerate(self.children): for i, children in enumerate(self.children):
children.assert_matches(test_case, doc_headlines[i]) children.assert_matches(test_case, doc_headlines[i])
def get_raw(self):
    """Return the raw text of every item in ``self.content``, concatenated."""
    return ''.join(get_raw(item) for item in self.content)
class SPAN: class SPAN:
def __init__(self, *kwargs): def __init__(self, *kwargs):
self.contents = kwargs self.contents = kwargs
def get_raw(self):
    """Concatenate the raw text of every section in ``self.contents``.

    String sections are used as they are; any other section supplies its
    text through its ``get_raw()`` method.
    """
    return ''.join(
        part if isinstance(part, str) else part.get_raw()
        for part in self.contents
    )
@ -116,8 +120,8 @@ class BOLD:
def __init__(self, text): def __init__(self, text):
self.text = text self.text = text
def get_raw(self):
    """Return the raw text of ``self.text`` wrapped in ``*`` markers."""
    inner = get_raw(self.text)
    return f'*{inner}*'
def assertEqual(self, test_case, other): def assertEqual(self, test_case, other):
test_case.assertTrue(isinstance(other, Bold)) test_case.assertTrue(isinstance(other, Bold))
@ -128,8 +132,8 @@ class CODE:
def __init__(self, text): def __init__(self, text):
self.text = text self.text = text
def get_raw(self):
    """Return the raw text of ``self.text`` wrapped in ``~`` markers."""
    inner = get_raw(self.text)
    return f'~{inner}~'
def assertEqual(self, test_case, other): def assertEqual(self, test_case, other):
test_case.assertTrue(isinstance(other, Code)) test_case.assertTrue(isinstance(other, Code))
@ -139,8 +143,8 @@ class ITALIC:
def __init__(self, text): def __init__(self, text):
self.text = text self.text = text
def get_raw(self):
    """Return the raw text of ``self.text`` wrapped in ``/`` markers."""
    inner = get_raw(self.text)
    return f'/{inner}/'
def assertEqual(self, test_case, other): def assertEqual(self, test_case, other):
test_case.assertTrue(isinstance(other, Italic)) test_case.assertTrue(isinstance(other, Italic))
@ -150,8 +154,8 @@ class STRIKE:
def __init__(self, text): def __init__(self, text):
self.text = text self.text = text
def get_raw(self):
    """Return the raw text of ``self.text`` wrapped in ``+`` markers."""
    inner = get_raw(self.text)
    return f'+{inner}+'
def assertEqual(self, test_case, other): def assertEqual(self, test_case, other):
test_case.assertTrue(isinstance(other, Strike)) test_case.assertTrue(isinstance(other, Strike))
@ -162,8 +166,8 @@ class UNDERLINED:
def __init__(self, text): def __init__(self, text):
self.text = text self.text = text
def get_raw(self):
    """Return the raw text of ``self.text`` wrapped in ``_`` markers."""
    inner = get_raw(self.text)
    return f'_{inner}_'
def assertEqual(self, test_case, other): def assertEqual(self, test_case, other):
test_case.assertTrue(isinstance(other, Underlined)) test_case.assertTrue(isinstance(other, Underlined))
@ -173,9 +177,22 @@ class VERBATIM:
def __init__(self, text): def __init__(self, text):
self.text = text self.text = text
def get_raw(self):
    """Return the raw text of ``self.text`` wrapped in ``=`` markers."""
    inner = get_raw(self.text)
    return f'={inner}='
def assertEqual(self, test_case, other): def assertEqual(self, test_case, other):
test_case.assertTrue(isinstance(other, Verbatim)) test_case.assertTrue(isinstance(other, Verbatim))
test_case.assertEqual(self.text, other.contents) test_case.assertEqual(self.text, other.contents)
class WEB_LINK:
    """Expected web link: a description text plus the target it points to."""

    def __init__(self, text, link):
        self.link = link
        self.text = text

    def get_raw(self):
        """Return the link in org syntax: ``[[link][text]]``."""
        return f'[[{self.link}][{self.text}]]'

    def assertEqual(self, test_case, other):
        """Assert that ``other`` is a ``WebLink`` with the same text and link."""
        is_web_link = isinstance(other, WebLink)
        test_case.assertTrue(is_web_link)
        test_case.assertEqual(self.text, other.contents)
        test_case.assertEqual(self.link, other.link)