diff --git a/org_dom/org_dom.py b/org_dom/org_dom.py index 8ff5746..ff5fdac 100644 --- a/org_dom/org_dom.py +++ b/org_dom/org_dom.py @@ -109,6 +109,11 @@ def timestamp_to_string(ts): else: return '[{}]'.format(base) +def get_raw(doc): + if isinstance(doc, str): + return doc + else: + return doc.get_raw() class Line: def __init__(self, linenum, contents): @@ -125,73 +130,190 @@ class Line: return ''.join(rawchunks) + '\n' class Text: - def __init__(self, contents): + def __init__(self, contents, line): self.contents = contents + self.linenum = line def get_raw(self): raw = ''.join(self.contents) return raw class Bold: - def __init__(self, contents): + Marker = '*' + + def __init__(self, contents, line): self.contents = contents def get_raw(self): - raw = ''.join(self.contents) - return f"*{raw}*" + raw = ''.join(map(get_raw, self.contents)) + return f"{self.Marker}{raw}{self.Marker}" class Code: - def __init__(self, contents): + Marker = '~' + + def __init__(self, contents, line): self.contents = contents def get_raw(self): - raw = ''.join(self.contents) - return f"~{raw}~" + raw = ''.join(map(get_raw, self.contents)) + return f"{self.Marker}{raw}{self.Marker}" class Italic: - def __init__(self, contents): + Marker = '/' + + def __init__(self, contents, line): self.contents = contents def get_raw(self): - raw = ''.join(self.contents) - return f"/{raw}/" + raw = ''.join(map(get_raw, self.contents)) + return f"{self.Marker}{raw}{self.Marker}" class Strike: - def __init__(self, contents): + Marker = '+' + + def __init__(self, contents, line): self.contents = contents def get_raw(self): - raw = ''.join(self.contents) - return f"+{raw}+" + raw = ''.join(map(get_raw, self.contents)) + return f"{self.Marker}{raw}{self.Marker}" class Underlined: - def __init__(self, contents): + Marker = '_' + + def __init__(self, contents, line): self.contents = contents def get_raw(self): - raw = ''.join(self.contents) - return f"_{raw}_" + raw = ''.join(map(get_raw, self.contents)) + 
return f"{self.Marker}{raw}{self.Marker}" class Verbatim: - def __init__(self, contents): + Marker = '=' + + def __init__(self, contents, line): self.contents = contents def get_raw(self): - raw = ''.join(self.contents) - return f"={raw}=" + raw = ''.join(map(get_raw, self.contents)) + return f"{self.Marker}{raw}{self.Marker}" + + +def is_pre(char: str) -> bool: + if isinstance(char, str): + return char in '\n\r\t -({\'"' + else: + return True + +def is_marker(char: str) -> bool: + if isinstance(char, str): + return char in '*=/+_~' + else: + return False + +def is_border(char: str) -> bool: + if isinstance(char, str): + return char not in '\n\r\t ' + else: + return False + +def is_body(char: str) -> bool: + if isinstance(char, str): + return True + else: + return False + +def is_post(char: str) -> bool: + if isinstance(char, str): + return char in '-.,;:!?\')}["' + else: + return False + +TOKEN_TYPE_TEXT = 0 +TOKEN_TYPE_OPEN_MARKER = 1 +TOKEN_TYPE_CLOSE_MARKER = 2 + +def tokenize_contents(contents: str): + tokens = [] + last_char = None + + text = [] + closes = set() + + for i, char in enumerate(contents): + has_changed = False + + if ( + (i not in closes) + and is_marker(char) + and is_pre(last_char) + and ((i + 1 < len(contents)) + and is_border(contents[i + 1]))): + + is_valid_mark = False + # Check that is closed later + text_in_line = True + for j in range(i, len(contents) - 1): + if contents[j] == '\n': + if not text_in_line: + break + text_in_line = False + elif is_border(contents[j]) and contents[j + 1] == char: + is_valid_mark = True + closes.add(j + 1) + break + else: + text_in_line |= is_body(contents[j]) + + if is_valid_mark: + if len(text) > 0: + tokens.append((TOKEN_TYPE_TEXT, ''.join(text))) + text = [] + tokens.append((TOKEN_TYPE_OPEN_MARKER, char)) + has_changed = True + elif i in closes: + if len(text) > 0: + tokens.append((TOKEN_TYPE_TEXT, ''.join(text))) + text = [] + tokens.append((TOKEN_TYPE_CLOSE_MARKER, char)) + has_changed = True + + if 
not has_changed: + text.append(char) + last_char = char + + if len(text) > 0: + tokens.append((TOKEN_TYPE_TEXT, ''.join(text))) + + return tokens def parse_contents(raw_contents:List[RawLine]): - NO_MODE = 0 - BOLD_MODE = 1 - CODE_MODE = 2 - ITALIC_MODE = 3 - STRIKE_MODE = 4 - UNDERLINED_MODE = 5 - VERBATIM_MODE = 6 + NO_MODE = 0b0 + BOLD_MODE = 0b1 + CODE_MODE = 0b10 + ITALIC_MODE = 0b100 + STRIKE_MODE = 0b1000 + UNDERLINED_MODE = 0b10000 + VERBATIM_MODE = 0b100000 - MODE_CLASS = { - NO_MODE: Line, + MARKERS = { + '*': BOLD_MODE, + '~': CODE_MODE, + '/': ITALIC_MODE, + '+': STRIKE_MODE, + '_': UNDERLINED_MODE, + '=': VERBATIM_MODE, + } + MODES = ( + (BOLD_MODE, Bold), + (CODE_MODE, Code), + (ITALIC_MODE, Italic), + (STRIKE_MODE, Strike), + (UNDERLINED_MODE, Underlined), + (VERBATIM_MODE, Verbatim), + ) + _MODES = { BOLD_MODE: Bold, CODE_MODE: Code, ITALIC_MODE: Italic, @@ -213,68 +335,80 @@ def parse_contents(raw_contents:List[RawLine]): contents_buff.append(line.line) contents = '\n'.join(contents_buff) + tokens = tokenize_contents(contents) - for c in contents: - if mode == NO_MODE: - if escaped: - chunk.append(c) - escaped = False + # Use tokens to tag chunks of text with its container type + for (tok_type, tok_val) in tokens: + if tok_type == TOKEN_TYPE_TEXT: + chunks.append((mode, tok_val)) + elif tok_type == TOKEN_TYPE_OPEN_MARKER: + mode = mode | MARKERS[tok_val] + elif tok_type == TOKEN_TYPE_CLOSE_MARKER: + mode = mode ^ MARKERS[tok_val] # XOR clears the bit set by the matching open marker - elif c == '\\': - escaped = True - elif c == '*': - mode = BOLD_MODE - elif c == '~': - mode = CODE_MODE - elif c == '/': - mode = ITALIC_MODE - elif c == '+': - mode = STRIKE_MODE - elif c == '_': - mode = UNDERLINED_MODE - elif c == '=': - mode = VERBATIM_MODE - elif c == '\n': - chunks.append(Line(linenum, inline + [Text(chunk)])) - chunk = [] - inline = [] + # Convert those chunks to a tree + def tree_for_tag(tag, in_mode): + tree = [] + for (mask, mode) in MODES: + if (mask & tag) and not (mask & in_mode): + 
tree.append(mode) + print(tree) + if len(tree) == 0: + return Text + + + if len(raw_contents) > 0: + current_line = raw_contents[0].linenum + + # tree = [] + # pos = [] + # print('\n'.join(map(str, chunks))) + # for (tag, chunk) in chunks: + # if pos == []: + # tree.append(tree_for_tag(tag, NO_MODE)(chunk, line=current_line)) + # pos.append(tree[-1]) + # else: + # raise NotImplementedError() + + # current_line += chunk.count('\n') + + + tree = [] + mode_tree = [] + contents = [] + # Use tokens to tag chunks of text with its container type + for (tok_type, tok_val) in tokens: + if tok_type == TOKEN_TYPE_TEXT: + if len(mode_tree) == 0: + tree.append(Text(tok_val, current_line)) else: - chunk.append(c) + contents[-1].append(tok_val) - if mode != NO_MODE: - inline.append(Text([''.join(chunk)])) - chunk = [] - else: - if escaped: - chunk.append(c) - escaped = False + current_line += tok_val.count('\n') # advance by the newlines in this text token - was_mode = mode - if mode == BOLD_MODE and c == '*': - mode = NO_MODE - elif mode == CODE_MODE and c == '~': - mode = NO_MODE - elif mode == ITALIC_MODE and c == '/': - mode = NO_MODE - elif mode == STRIKE_MODE and c == '+': - mode = NO_MODE - elif mode == UNDERLINED_MODE and c == '_': - mode = NO_MODE - elif mode == VERBATIM_MODE and c == '=': - mode = NO_MODE - elif c == '\n': - raise NotImplementedError("[{} | {}]".format(c, chunk)) + elif tok_type == TOKEN_TYPE_OPEN_MARKER: + mode_tree.append(_MODES[MARKERS[tok_val]]) + contents.append([]) + + elif tok_type == TOKEN_TYPE_CLOSE_MARKER: + mode = _MODES[MARKERS[tok_val]] + matching_mode = mode_tree.pop() + assert mode == matching_mode + value = mode(contents.pop(), current_line) + current_line += tok_val.count('\n') + + if len(mode_tree) == 0: # Closed branch of tree + tree.append(value) else: - chunk.append(c) + print("{} <- {}".format(mode_tree[-1], mode)) + contents[-1].append(value) - if mode == NO_MODE: - inline.append(MODE_CLASS[was_mode](''.join(chunk))) - chunk = [] + current_line += tok_val.count('\n') - 
assert(len(chunk) == 0) - assert(len(inline) == 0) - - return chunks + if len(tree) > 3: + print("L", len(tree)) + print("F:", tree) + return tree def parse_headline(hl) -> Headline: stars = hl['orig'].group('stars') @@ -353,8 +487,8 @@ class OrgDom: def dump_contents(self, raw): if isinstance(raw, RawLine): return (raw.linenum, raw.line) - else: - return (raw.linenum, raw.get_raw()) + + return (raw.linenum, raw.get_raw()) def dump_structural(self, structural: Tuple): return (structural[0], structural[1]) diff --git a/org_dom/utils.py b/org_dom/utils.py index e7c3e8c..ce77add 100644 --- a/org_dom/utils.py +++ b/org_dom/utils.py @@ -1,5 +1,4 @@ -from .org_dom import Headline, Line, RawLine - +from .org_dom import Headline, Line, RawLine, Text, Bold, Code, Italic, Strike, Underlined, Verbatim def get_hl_raw_contents(doc: Headline) -> str: lines = [] @@ -18,6 +17,8 @@ def get_rawline_contents(doc: RawLine) -> str: def get_span_contents(doc: Line) -> str: return doc.get_raw() +def get_text_contents(doc: Text) -> str: + return doc.get_raw() def get_raw_contents(doc) -> str: if isinstance(doc, Headline): @@ -28,4 +29,7 @@ def get_raw_contents(doc) -> str: return get_span_contents(doc) if isinstance(doc, list): return ''.join([get_raw_contents(chunk) for chunk in doc]) + if isinstance(doc, (Text, Bold, Code, Italic, Strike, Underlined, Verbatim)): + return doc.get_raw() + print('Unhandled type: ' + str(doc)) raise NotImplementedError('Unhandled type: ' + str(doc)) diff --git a/tests/02-markup.org b/tests/02-markup.org index 18d58f5..41c2bb2 100644 --- a/tests/02-markup.org +++ b/tests/02-markup.org @@ -19,3 +19,18 @@ This is a _underlined phrase_. This is a ~code phrase~. + + This is a nested *bold =verbatim /italic +strike _underlined ~code .~ ._ .+ ./ .= .* + + This is a _ non-underlined phrase because an incorrectly placed content _. + + This is a _ non-underlined phrase because an incorrectly placed content beginning_. 
+ + This is a _non-underlined phrase because an incorrectly placed content end _. + + This is a _non-underlined phrase because the lack of an end. + + + This is a _non-underlined phrase because an empty line between beginning and + + end._ diff --git a/tests/03-links.org b/tests/03-links.org new file mode 100644 index 0000000..e31e9ad --- /dev/null +++ b/tests/03-links.org @@ -0,0 +1,13 @@ +#+TITLE: 03-Links +#+DESCRIPTION: Simple org file to test links +#+TODO: TODO(t) PAUSED(p) | DONE(d) + + +* First level + :PROPERTIES: + :ID: 03-markup-first-level-id + :CREATED: [2020-01-01 Wed 01:01] + :END: + This is a [[https://codigoparallevar.com][web link]]. + + This is an /italized [[https://codigoparallevar.com][web link]]/. diff --git a/tests/test_dom.py b/tests/test_dom.py index 5b1f8cb..7e855ba 100644 --- a/tests/test_dom.py +++ b/tests/test_dom.py @@ -5,7 +5,7 @@ from datetime import datetime as DT from org_dom import dumps, load, loads from utils.dom_assertions import (BOLD, CODE, HL, ITALIC, SPAN, STRIKE, - UNDERLINED, VERBATIM, Dom) + UNDERLINED, VERBATIM, WEB_LINK, Dom,) DIR = os.path.dirname(os.path.abspath(__file__)) @@ -47,6 +47,7 @@ class TestSerde(unittest.TestCase): self.assertEqual(dumps(doc), orig) def test_markup_file_02(self): + self.maxDiff = 1024 with open(os.path.join(DIR, '02-markup.org')) as f: doc = load(f) @@ -60,22 +61,73 @@ class TestSerde(unittest.TestCase): ], content=[ SPAN(" This is a ", BOLD("bold phrase"), - "."), - SPAN(""), + ".\n"), + SPAN("\n"), SPAN(" This is a ", - VERBATIM("verbatim phrase"), "."), - SPAN(""), + VERBATIM("verbatim phrase"), ".\n"), + SPAN("\n"), SPAN(" This is a ", ITALIC("italic phrase"), - "."), - SPAN(""), + ".\n"), + SPAN("\n"), SPAN(" This is a ", - STRIKE("strike-through phrase"), "."), - SPAN(""), + STRIKE("strike-through phrase"), ".\n"), + SPAN("\n"), SPAN(" This is a ", - UNDERLINED("underlined phrase"), "."), - SPAN(""), + UNDERLINED("underlined phrase"), ".\n"), + SPAN("\n"), SPAN(" This is a ", 
CODE("code phrase"), - "."), + ".\n"), + + SPAN("\n"), + SPAN(" This is a nested ", BOLD(["bold ", VERBATIM(["verbatim ", ITALIC(["italic ", STRIKE(["strike ", UNDERLINED(["underlined ", CODE("code ."), " ."]), " ."]), " ."]), " ."]), " ."])), + SPAN("\n"), + + # SPAN(""), + # # TODO: THIS IS INTERLEAVED, not nested + # In ORG: This is a interleaved *bold =verbatim /italic +strike _underlined ~code .* .= ./ .+ ._ .~ + # SPAN(" This is a nested ", BOLD(["bold ", VERBATIM(["verbatim ", ITALIC(["italic ", STRIKE(["strike ", UNDERLINED(["underlined ", CODE("code ."), " ."]), " ."]), " ."]), " ."]), " ."])), + # SPAN(""), + + SPAN("\n"), + SPAN(" This is a _ non-underlined phrase because an incorrectly placed content _.\n"), + SPAN("\n"), + + SPAN(" This is a _ non-underlined phrase because an incorrectly placed content beginning_.\n"), + SPAN("\n"), + + SPAN(""), + SPAN(" This is a _non-underlined phrase because an incorrectly placed content end _.\n"), + SPAN("\n"), + + SPAN(""), + SPAN(" This is a _non-underlined phrase because the lack of an end.\n"), + SPAN("\n"), + + SPAN("\n"), + SPAN(" This is a _non-underlined phrase because an empty line between beginning and\n"), + SPAN("\n"), + + SPAN(""), + SPAN(" end._\n"), ]))) ex.assert_matches(self, doc) + + # def test_links_file_03(self): + # with open(os.path.join(DIR, '03-links.org')) as f: + # doc = load(f) + + # ex = Dom(props=[('TITLE', '03-Links'), + # ('DESCRIPTION', 'Simple org file to test links'), + # ('TODO', 'TODO(t) PAUSED(p) | DONE(d)')], + # children=(HL('First level', + # props=[ + # ('ID', '03-markup-first-level-id'), + # ('CREATED', DT(2020, 1, 1, 1, 1)), + # ], + # content=[ + # SPAN(" This is a ", WEB_LINK("web link", "https://codigoparallevar.com"), + # "."), + # ]))) + + # ex.assert_matches(self, doc) diff --git a/tests/utils/dom_assertions.py b/tests/utils/dom_assertions.py index 9bd77e7..b8aafa8 100644 --- a/tests/utils/dom_assertions.py +++ b/tests/utils/dom_assertions.py @@ -9,6 +9,15 @@ def 
timestamp_to_datetime(ts): return datetime(ts.year, ts.month, ts.day, ts.hour, ts.minute) +def get_raw(doc): + if isinstance(doc, str): + return doc + elif isinstance(doc, list): + return ''.join([get_raw(e) for e in doc]) + else: + return doc.get_raw() + + class Dom: def __init__(self, *, props=None, children=None): self.props = props @@ -65,15 +74,7 @@ class HL: test_case.assertEqual( timestamp_to_datetime(doc_props[i].value), prop[1]) - if isinstance(self.content, str): - test_case.assertEqual(get_raw_contents(doc), self.content) - else: - if len(doc.contents) != len(self.content): - print("Contents:", doc.contents) - print("Expected:", self.content) - test_case.assertEqual(len(doc.contents), len(self.content)) - for i, content in enumerate(self.content): - content.assert_matches(test_case, doc.contents[i]) + test_case.assertEqual(get_raw_contents(doc), self.get_raw()) # Check children if self.children is None: @@ -86,18 +87,21 @@ class HL: for i, children in enumerate(self.children): children.assert_matches(test_case, doc_headlines[i]) + def get_raw(self): + return ''.join(map(get_raw, self.content)) + class SPAN: def __init__(self, *kwargs): self.contents = kwargs - def to_raw(self): + def get_raw(self): chunks = [] for section in self.contents: if isinstance(section, str): chunks.append(section) else: - chunks.append(section.to_raw()) + chunks.append(section.get_raw()) return ''.join(chunks) @@ -116,8 +120,8 @@ class BOLD: def __init__(self, text): self.text = text - def to_raw(self): - return '*{}*'.format(self.text) + def get_raw(self): + return '*{}*'.format(get_raw(self.text)) def assertEqual(self, test_case, other): test_case.assertTrue(isinstance(other, Bold)) @@ -128,8 +132,8 @@ class CODE: def __init__(self, text): self.text = text - def to_raw(self): - return '~{}~'.format(self.text) + def get_raw(self): + return '~{}~'.format(get_raw(self.text)) def assertEqual(self, test_case, other): test_case.assertTrue(isinstance(other, Code)) @@ -139,8 +143,8 
@@ class ITALIC: def __init__(self, text): self.text = text - def to_raw(self): - return '/{}/'.format(self.text) + def get_raw(self): + return '/{}/'.format(get_raw(self.text)) def assertEqual(self, test_case, other): test_case.assertTrue(isinstance(other, Italic)) @@ -150,8 +154,8 @@ class STRIKE: def __init__(self, text): self.text = text - def to_raw(self): - return '+{}+'.format(self.text) + def get_raw(self): + return '+{}+'.format(get_raw(self.text)) def assertEqual(self, test_case, other): test_case.assertTrue(isinstance(other, Strike)) @@ -162,8 +166,8 @@ class UNDERLINED: def __init__(self, text): self.text = text - def to_raw(self): - return '_{}_'.format(self.text) + def get_raw(self): + return '_{}_'.format(get_raw(self.text)) def assertEqual(self, test_case, other): test_case.assertTrue(isinstance(other, Underlined)) @@ -173,9 +177,22 @@ class VERBATIM: def __init__(self, text): self.text = text - def to_raw(self): - return '={}='.format(self.text) + def get_raw(self): + return '={}='.format(get_raw(self.text)) def assertEqual(self, test_case, other): test_case.assertTrue(isinstance(other, Verbatim)) test_case.assertEqual(self.text, other.contents) + +class WEB_LINK: + def __init__(self, text, link): + self.text = text + self.link = link + + def get_raw(self): + return '[[{}][{}]]'.format(self.link, self.text) + + def assertEqual(self, test_case, other): + test_case.assertTrue(isinstance(other, WebLink)) + test_case.assertEqual(self.text, other.contents) + test_case.assertEqual(self.link, other.link)