Add simple support for nested markup.

2020-10-25 20:23:08 +01:00 · 2020-10-25 20:23:08 +01:00 · f6de69fd90
commit f6de69fd90
parent 5b886e5e24
6 changed files with 356 additions and 121 deletions
--- a/org_dom/org_dom.py
+++ b/org_dom/org_dom.py
@ -109,6 +109,11 @@ def timestamp_to_string(ts):
    else:
        return '[{}]'.format(base)
 def get_raw(doc):
    """Return the raw text of ``doc``.

    Plain strings are returned unchanged; any other object is asked for its
    own raw form through its ``get_raw()`` method.
    """
    return doc if isinstance(doc, str) else doc.get_raw()
 class Line:
    def __init__(self, linenum, contents):
@ -125,73 +130,190 @@ class Line:
        return ''.join(rawchunks) + '\n'
 class Text:
-    def __init__(self, contents):
+    def __init__(self, contents, line):
        self.contents = contents
        self.linenum = line
    def get_raw(self):
        raw = ''.join(self.contents)
        return raw
 class Bold:
-    def __init__(self, contents):
+    Marker = '*'
    def __init__(self, contents, line):
        self.contents = contents
    def get_raw(self):
-        raw = ''.join(self.contents)
+        raw = ''.join(map(get_raw, self.contents))
-        return f"*{raw}*"
+        return f"{self.Marker}{raw}{self.Marker}"
 class Code:
-    def __init__(self, contents):
+    Marker = '~'
    def __init__(self, contents, line):
        self.contents = contents
    def get_raw(self):
-        raw = ''.join(self.contents)
+        raw = ''.join(map(get_raw, self.contents))
-        return f"~{raw}~"
+        return f"{self.Marker}{raw}{self.Marker}"
 class Italic:
-    def __init__(self, contents):
+    Marker = '/'
    def __init__(self, contents, line):
        self.contents = contents
    def get_raw(self):
-        raw = ''.join(self.contents)
+        raw = ''.join(map(get_raw, self.contents))
-        return f"/{raw}/"
+        return f"{self.Marker}{raw}{self.Marker}"
 class Strike:
-    def __init__(self, contents):
+    Marker = '+'
    def __init__(self, contents, line):
        self.contents = contents
    def get_raw(self):
-        raw = ''.join(self.contents)
+        raw = ''.join(map(get_raw, self.contents))
-        return f"+{raw}+"
+        return f"{self.Marker}{raw}{self.Marker}"
 class Underlined:
-    def __init__(self, contents):
+    Marker = '_'
    def __init__(self, contents, line):
        self.contents = contents
    def get_raw(self):
-        raw = ''.join(self.contents)
+        raw = ''.join(map(get_raw, self.contents))
-        return f"_{raw}_"
+        return f"{self.Marker}{raw}{self.Marker}"
 class Verbatim:
-    def __init__(self, contents):
+    Marker = '='
    def __init__(self, contents, line):
        self.contents = contents
    def get_raw(self):
-        raw = ''.join(self.contents)
+        raw = ''.join(map(get_raw, self.contents))
-        return f"={raw}="
+        return f"{self.Marker}{raw}{self.Marker}"
 def is_pre(char: str) -> bool:
    """Tell whether ``char`` may come right before an opening markup marker.

    Any non-string value (for instance ``None``) is accepted as a valid
    predecessor.
    """
    if not isinstance(char, str):
        return True
    return char in '\n\r\t -({\'"'
 def is_marker(char: str) -> bool:
    """Tell whether ``char`` is one of the markup marker characters.

    Non-string values are never markers.
    """
    return isinstance(char, str) and char in '*=/+_~'
 def is_border(char: str) -> bool:
    """Tell whether ``char`` can sit at the edge of a markup body.

    A border is any string that is not found in the whitespace set
    (newline, carriage return, tab, space); non-strings are rejected.
    """
    return isinstance(char, str) and char not in '\n\r\t '
 def is_body(char: str) -> bool:
    """Tell whether ``char`` is acceptable inside a markup body.

    Every string qualifies; anything that is not a string does not.
    """
    return isinstance(char, str)
 def is_post(char: str) -> bool:
    """Tell whether ``char`` may come right after a closing markup marker.

    Non-string values are never accepted.
    """
    if not isinstance(char, str):
        return False
    return char in '-.,;:!?\')}["'
 # Token kinds produced by tokenize_contents(); each token is a
 # ``(kind, value)`` tuple.
 # A run of plain text (value: the text).
 TOKEN_TYPE_TEXT = 0
 # A marker character that opens a markup span (value: the marker).
 TOKEN_TYPE_OPEN_MARKER = 1
 # A marker character that closes a markup span (value: the marker).
 TOKEN_TYPE_CLOSE_MARKER = 2
 def _find_closing_marker(contents: str, open_pos: int):
    """Locate the marker that closes the markup opened at ``open_pos``.

    Scans forward from the first character after the opening marker and
    returns the index of the first identical marker character that directly
    follows a non-whitespace (border) character.  Returns ``None`` when an
    empty line is met first or the text ends without a match.
    """
    marker = contents[open_pos]
    text_in_line = True
    # Start one past the opening marker: the marker must not act as the
    # border of its own (empty) body, otherwise a doubled marker such as
    # "**" would be read as an opening immediately followed by a closing.
    for j in range(open_pos + 1, len(contents) - 1):
        current = contents[j]
        if current == '\n':
            if not text_in_line:
                # Two newlines in a row: markup cannot span an empty line.
                return None
            text_in_line = False
        elif is_border(current) and contents[j + 1] == marker:
            return j + 1
        else:
            text_in_line |= is_body(current)
    return None

 def tokenize_contents(contents: str):
    """Split ``contents`` into text and markup-marker tokens.

    A marker character opens markup when it follows a valid "pre" character
    (or starts the text), is directly followed by a non-whitespace character
    and a matching closing marker exists later on with a non-empty body in
    between.  Marker characters that do not meet these rules stay in the
    surrounding text.

    Returns a list of ``(token_type, value)`` tuples, where ``token_type``
    is one of ``TOKEN_TYPE_TEXT``, ``TOKEN_TYPE_OPEN_MARKER`` or
    ``TOKEN_TYPE_CLOSE_MARKER`` and ``value`` is the text run or the marker
    character.
    """
    tokens = []
    text = []  # Characters of the text run being accumulated.
    closes = set()  # Indexes already claimed as closing markers.
    last_char = None

    def flush_text():
        # Emit the pending text run, if any, before a marker token.
        if text:
            tokens.append((TOKEN_TYPE_TEXT, ''.join(text)))
            text.clear()

    for i, char in enumerate(contents):
        if i in closes:
            flush_text()
            tokens.append((TOKEN_TYPE_CLOSE_MARKER, char))
        else:
            close_pos = None
            if (is_marker(char)
                    and is_pre(last_char)
                    and i + 1 < len(contents)
                    and is_border(contents[i + 1])):
                # Only accept the opening if it is closed later on.
                close_pos = _find_closing_marker(contents, i)
            if close_pos is None:
                text.append(char)
            else:
                closes.add(close_pos)
                flush_text()
                tokens.append((TOKEN_TYPE_OPEN_MARKER, char))
        last_char = char

    flush_text()
    return tokens
 def parse_contents(raw_contents:List[RawLine]):
-    NO_MODE = 0
+    NO_MODE =         0b0
-    BOLD_MODE = 1
+    BOLD_MODE =       0b1
-    CODE_MODE = 2
+    CODE_MODE =       0b10
-    ITALIC_MODE = 3
+    ITALIC_MODE =     0b100
-    STRIKE_MODE = 4
+    STRIKE_MODE =     0b1000
-    UNDERLINED_MODE = 5
+    UNDERLINED_MODE = 0b10000
-    VERBATIM_MODE = 6
+    VERBATIM_MODE =   0b100000
-    MODE_CLASS = {
+    MARKERS = {
-        NO_MODE: Line,
+        '*': BOLD_MODE,
        '~': CODE_MODE,
        '/': ITALIC_MODE,
        '+': STRIKE_MODE,
        '_': UNDERLINED_MODE,
        '=': VERBATIM_MODE,
    }
    MODES = (
        (BOLD_MODE, Bold),
        (CODE_MODE, Code),
        (ITALIC_MODE, Italic),
        (STRIKE_MODE, Strike),
        (UNDERLINED_MODE, Underlined),
        (VERBATIM_MODE, Verbatim),
    )
    _MODES = {
        BOLD_MODE: Bold,
        CODE_MODE: Code,
        ITALIC_MODE: Italic,
@ -213,68 +335,80 @@ def parse_contents(raw_contents:List[RawLine]):
        contents_buff.append(line.line)
    contents = '\n'.join(contents_buff)
    tokens = tokenize_contents(contents)
-    for c in contents:
+    # Use tokens to tag chunks of text with its container type
-        if mode == NO_MODE:
+    for (tok_type, tok_val) in tokens:
-            if escaped:
+        if tok_type == TOKEN_TYPE_TEXT:
-                chunk.append(c)
+            chunks.append((mode, tok_val))
-                escaped = False
+        elif tok_type == TOKEN_TYPE_OPEN_MARKER:
            mode = mode | MARKERS[tok_val]
        elif tok_type == TOKEN_TYPE_OPEN_MARKER:
            mode = mode ^ MARKERS[tok_val]
-            elif c == '\\':
+    # Convert those chunks to a tree
-                escaped = True
+    def tree_for_tag(tag, in_mode):
-            elif c == '*':
+        tree = []
-                mode = BOLD_MODE
+        for (mask, mode) in MODES:
-            elif c == '~':
+            if (mask & tag) and not (mask & in_mode):
-                mode = CODE_MODE
+                tree.append(mode)
-            elif c == '/':
+        print(tree)
-                mode = ITALIC_MODE
+        if len(tree) == 0:
-            elif c == '+':
+            return Text
-                mode = STRIKE_MODE
+
-            elif c == '_':
+
-                mode = UNDERLINED_MODE
+    if len(raw_contents) > 0:
-            elif c == '=':
+        current_line = raw_contents[0].linenum
-                mode = VERBATIM_MODE
+
-            elif c == '\n':
+    # tree = []
-                chunks.append(Line(linenum, inline + [Text(chunk)]))
+    # pos = []
-                chunk = []
+    # print('\n'.join(map(str, chunks)))
-                inline = []
+    # for (tag, chunk) in chunks:
    #     if pos == []:
    #         tree.append(tree_for_tag(tag, NO_MODE)(chunk, line=current_line))
    #         pos.append(tree[-1])
    #     else:
    #         raise NotImplementedError()
    #     current_line += chunk.count('\n')
    tree = []
    mode_tree = []
    contents = []
    # Use tokens to tag chunks of text with its container type
    for (tok_type, tok_val) in tokens:
        if tok_type == TOKEN_TYPE_TEXT:
            if len(mode_tree) == 0:
                tree.append(Text(tok_val, current_line))
            else:
-                chunk.append(c)
+                contents[-1].append(tok_val)
-            if mode != NO_MODE:
+            current_line += chunk.count('\n')
-                inline.append(Text([''.join(chunk)]))
+
-                chunk = []
+        elif tok_type == TOKEN_TYPE_OPEN_MARKER:
            mode_tree.append(_MODES[MARKERS[tok_val]])
            contents.append([])
        elif tok_type == TOKEN_TYPE_CLOSE_MARKER:
            mode = _MODES[MARKERS[tok_val]]
            matching_mode = mode_tree.pop()
            assert mode == matching_mode
            value = mode(contents.pop(), current_line)
            current_line += chunk.count('\n')
            if len(mode_tree) == 0:  # Closed branch of tree
                tree.append(value)
            else:
-            if escaped:
+                print("{} <- {}".format(mode_tree[-1], mode))
-                chunk.append(c)
+                contents[-1].append(value)
                escaped = False
-            was_mode = mode
+            current_line += chunk.count('\n')
            if mode == BOLD_MODE and c == '*':
                mode = NO_MODE
            elif mode == CODE_MODE and c == '~':
                mode = NO_MODE
            elif mode == ITALIC_MODE and c == '/':
                mode = NO_MODE
            elif mode == STRIKE_MODE and c == '+':
                mode = NO_MODE
            elif mode == UNDERLINED_MODE and c == '_':
                mode = NO_MODE
            elif mode == VERBATIM_MODE and c == '=':
                mode = NO_MODE
            elif c == '\n':
                raise NotImplementedError("[{} | {}]".format(c, chunk))
            else:
                chunk.append(c)
-            if mode == NO_MODE:
+    if len(tree) > 3:
-                inline.append(MODE_CLASS[was_mode](''.join(chunk)))
+        print("L", len(tree))
-                chunk = []
+    print("F:", tree)
-
+    return tree
    assert(len(chunk) == 0)
    assert(len(inline) == 0)
    return chunks
 def parse_headline(hl) -> Headline:
    stars = hl['orig'].group('stars')
@ -353,7 +487,7 @@ class OrgDom:
    def dump_contents(self, raw):
        if isinstance(raw, RawLine):
            return (raw.linenum, raw.line)
-        else:
+
        return (raw.linenum, raw.get_raw())
    def dump_structural(self, structural: Tuple):
--- a/org_dom/utils.py
+++ b/org_dom/utils.py
@ -1,5 +1,4 @@
-from .org_dom import Headline, Line, RawLine
+from .org_dom import Headline, Line, RawLine, Text, Bold, Code, Italic, Strike, Underlined, Verbatim
 def get_hl_raw_contents(doc: Headline) -> str:
    lines = []
@ -18,6 +17,8 @@ def get_rawline_contents(doc: RawLine) -> str:
 def get_span_contents(doc: Line) -> str:
    """Return the raw text of a line by delegating to its ``get_raw()``."""
    raw = doc.get_raw()
    return raw
 def get_text_contents(doc: Text) -> str:
    """Return the raw text of a text node by delegating to its ``get_raw()``."""
    raw = doc.get_raw()
    return raw
 def get_raw_contents(doc) -> str:
    if isinstance(doc, Headline):
@ -28,4 +29,7 @@ def get_raw_contents(doc) -> str:
        return get_span_contents(doc)
    if isinstance(doc, list):
        return ''.join([get_raw_contents(chunk) for chunk in doc])
    if isinstance(doc, (Text, Bold, Code, Italic, Strike, Underlined, Verbatim)):
        return doc.get_raw()
    print('Unhandled type: ' + str(doc))
    raise NotImplementedError('Unhandled type: ' + str(doc))
--- a/tests/02-markup.org
+++ b/tests/02-markup.org
@ -19,3 +19,18 @@
  This is a _underlined phrase_.
  This is a ~code phrase~.
  This is a nested *bold =verbatim /italic +strike _underlined ~code .~ ._ .+ ./ .= .*
  This is a _ non-underlined phrase because an incorrectly placed content _.
  This is a _ non-underlined phrase because an incorrectly placed content beginning_.
  This is a _non-underlined phrase because an incorrectly placed content end _.
  This is a _non-underlined phrase because the lack of an end.
  This is a _non-underlined phrase because an empty line between beginning and
  end._
--- a/tests/03-links.org
+++ b/tests/03-links.org
@ -0,0 +1,13 @@
 #+TITLE: 03-Links
 #+DESCRIPTION: Simple org file to test links
 #+TODO: TODO(t) PAUSED(p) |  DONE(d)
 * First level
  :PROPERTIES:
  :ID:       03-markup-first-level-id
  :CREATED:  [2020-01-01 Wed 01:01]
  :END:
  This is a [[https://codigoparallevar.com][web link]].
  This is an /italized [[https://codigoparallevar.com][web link]]/.
--- a/tests/test_dom.py
+++ b/tests/test_dom.py
@ -5,7 +5,7 @@ from datetime import datetime as DT
 from org_dom import dumps, load, loads
 from utils.dom_assertions import (BOLD, CODE, HL, ITALIC, SPAN, STRIKE,
-                                  UNDERLINED, VERBATIM, Dom)
+                                  UNDERLINED, VERBATIM, WEB_LINK, Dom,)
 DIR = os.path.dirname(os.path.abspath(__file__))
@ -47,6 +47,7 @@ class TestSerde(unittest.TestCase):
        self.assertEqual(dumps(doc), orig)
    def test_markup_file_02(self):
        self.maxDiff = 1024
        with open(os.path.join(DIR, '02-markup.org')) as f:
            doc = load(f)
@ -60,22 +61,73 @@ class TestSerde(unittest.TestCase):
                              ],
                              content=[
                                  SPAN("  This is a ", BOLD("bold phrase"),
-                                       "."),
+                                       ".\n"),
-                                  SPAN(""),
+                                  SPAN("\n"),
                                  SPAN("  This is a ",
-                                       VERBATIM("verbatim phrase"), "."),
+                                       VERBATIM("verbatim phrase"), ".\n"),
-                                  SPAN(""),
+                                  SPAN("\n"),
                                  SPAN("  This is a ", ITALIC("italic phrase"),
-                                       "."),
+                                       ".\n"),
-                                  SPAN(""),
+                                  SPAN("\n"),
                                  SPAN("  This is a ",
-                                       STRIKE("strike-through phrase"), "."),
+                                       STRIKE("strike-through phrase"), ".\n"),
-                                  SPAN(""),
+                                  SPAN("\n"),
                                  SPAN("  This is a ",
-                                       UNDERLINED("underlined phrase"), "."),
+                                       UNDERLINED("underlined phrase"), ".\n"),
-                                  SPAN(""),
+                                  SPAN("\n"),
                                  SPAN("  This is a ", CODE("code phrase"),
-                                       "."),
+                                       ".\n"),
                                  SPAN("\n"),
                                  SPAN("  This is a nested ", BOLD(["bold ", VERBATIM(["verbatim ", ITALIC(["italic ", STRIKE(["strike ", UNDERLINED(["underlined ", CODE("code ."), " ."]), " ."]), " ."]), " ."]), " ."])),
                                  SPAN("\n"),
                                  # SPAN(""),
                                  # # TODO: THIS IS INTERLEAVED, not nested
                                  # In ORG:   This is a interleaved *bold =verbatim /italic +strike _underlined ~code .* .= ./ .+ ._ .~
                                  # SPAN("  This is a nested ", BOLD(["bold ", VERBATIM(["verbatim ", ITALIC(["italic ", STRIKE(["strike ", UNDERLINED(["underlined ", CODE("code ."), " ."]), " ."]), " ."]), " ."]), " ."])),
                                  # SPAN(""),
                                  SPAN("\n"),
                                  SPAN("  This is a _ non-underlined phrase because an incorrectly placed content _.\n"),
                                  SPAN("\n"),
                                  SPAN("  This is a _ non-underlined phrase because an incorrectly placed content beginning_.\n"),
                                  SPAN("\n"),
                                  SPAN(""),
                                  SPAN("  This is a _non-underlined phrase because an incorrectly placed content end _.\n"),
                                  SPAN("\n"),
                                  SPAN(""),
                                  SPAN("  This is a _non-underlined phrase because the lack of an end.\n"),
                                  SPAN("\n"),
                                  SPAN("\n"),
                                  SPAN("  This is a _non-underlined phrase because an empty line between beginning and\n"),
                                  SPAN("\n"),
                                  SPAN(""),
                                  SPAN("  end._\n"),
                              ])))
        ex.assert_matches(self, doc)
    # def test_links_file_03(self):
    #     with open(os.path.join(DIR, '03-links.org')) as f:
    #         doc = load(f)
    #     ex = Dom(props=[('TITLE', '03-Links'),
    #                     ('DESCRIPTION', 'Simple org file to test links'),
    #                     ('TODO', 'TODO(t) PAUSED(p) |  DONE(d)')],
    #              children=(HL('First level',
    #                           props=[
    #                               ('ID', '03-markup-first-level-id'),
    #                               ('CREATED', DT(2020, 1, 1, 1, 1)),
    #                           ],
    #                           content=[
    #                               SPAN("  This is a ", WEB_LINK("web link", "https://codigoparallevar.com"),
    #                                    "."),
    #                           ])))
    #     ex.assert_matches(self, doc)
--- a/tests/utils/dom_assertions.py
+++ b/tests/utils/dom_assertions.py
@ -9,6 +9,15 @@ def timestamp_to_datetime(ts):
    return datetime(ts.year, ts.month, ts.day, ts.hour, ts.minute)
 def get_raw(doc):
    """Return the raw text of an expected-DOM fragment.

    Lists are flattened recursively and concatenated, strings are returned
    unchanged, and any other object is asked for its own ``get_raw()``.
    """
    if isinstance(doc, list):
        return ''.join(get_raw(item) for item in doc)
    if isinstance(doc, str):
        return doc
    return doc.get_raw()
 class Dom:
    def __init__(self, *, props=None, children=None):
        self.props = props
@ -65,15 +74,7 @@ class HL:
                    test_case.assertEqual(
                        timestamp_to_datetime(doc_props[i].value), prop[1])
-        if isinstance(self.content, str):
+        test_case.assertEqual(get_raw_contents(doc), self.get_raw())
            test_case.assertEqual(get_raw_contents(doc), self.content)
        else:
            if len(doc.contents) != len(self.content):
                print("Contents:", doc.contents)
                print("Expected:", self.content)
            test_case.assertEqual(len(doc.contents), len(self.content))
            for i, content in enumerate(self.content):
                content.assert_matches(test_case, doc.contents[i])
        # Check children
        if self.children is None:
@ -86,18 +87,21 @@ class HL:
            for i, children in enumerate(self.children):
                children.assert_matches(test_case, doc_headlines[i])
    def get_raw(self):
        """Concatenate the raw form of every entry in ``self.content``."""
        pieces = [get_raw(entry) for entry in self.content]
        return ''.join(pieces)
 class SPAN:
    def __init__(self, *kwargs):
        self.contents = kwargs
-    def to_raw(self):
+    def get_raw(self):
        chunks = []
        for section in self.contents:
            if isinstance(section, str):
                chunks.append(section)
            else:
-                chunks.append(section.to_raw())
+                chunks.append(section.get_raw())
        return ''.join(chunks)
@ -116,8 +120,8 @@ class BOLD:
    def __init__(self, text):
        self.text = text
-    def to_raw(self):
+    def get_raw(self):
-        return '*{}*'.format(self.text)
+        return '*{}*'.format(get_raw(self.text))
    def assertEqual(self, test_case, other):
        test_case.assertTrue(isinstance(other, Bold))
@ -128,8 +132,8 @@ class CODE:
    def __init__(self, text):
        self.text = text
-    def to_raw(self):
+    def get_raw(self):
-        return '~{}~'.format(self.text)
+        return '~{}~'.format(get_raw(self.text))
    def assertEqual(self, test_case, other):
        test_case.assertTrue(isinstance(other, Code))
@ -139,8 +143,8 @@ class ITALIC:
    def __init__(self, text):
        self.text = text
-    def to_raw(self):
+    def get_raw(self):
-        return '/{}/'.format(self.text)
+        return '/{}/'.format(get_raw(self.text))
    def assertEqual(self, test_case, other):
        test_case.assertTrue(isinstance(other, Italic))
@ -150,8 +154,8 @@ class STRIKE:
    def __init__(self, text):
        self.text = text
-    def to_raw(self):
+    def get_raw(self):
-        return '+{}+'.format(self.text)
+        return '+{}+'.format(get_raw(self.text))
    def assertEqual(self, test_case, other):
        test_case.assertTrue(isinstance(other, Strike))
@ -162,8 +166,8 @@ class UNDERLINED:
    def __init__(self, text):
        self.text = text
-    def to_raw(self):
+    def get_raw(self):
-        return '_{}_'.format(self.text)
+        return '_{}_'.format(get_raw(self.text))
    def assertEqual(self, test_case, other):
        test_case.assertTrue(isinstance(other, Underlined))
@ -173,9 +177,22 @@ class VERBATIM:
    def __init__(self, text):
        self.text = text
-    def to_raw(self):
+    def get_raw(self):
-        return '={}='.format(self.text)
+        return '={}='.format(get_raw(self.text))
    def assertEqual(self, test_case, other):
        test_case.assertTrue(isinstance(other, Verbatim))
        test_case.assertEqual(self.text, other.contents)
 class WEB_LINK:
    """Expected-DOM node describing a link with a target and a description."""

    def __init__(self, text, link):
        # Keep the description and the target exactly as given.
        self.link = link
        self.text = text

    def get_raw(self):
        """Render the link as ``[[link][text]]``."""
        template = '[[{}][{}]]'
        return template.format(self.link, self.text)

    def assertEqual(self, test_case, other):
        """Check that ``other`` is a WebLink with the same text and link."""
        same_type = isinstance(other, WebLink)
        test_case.assertTrue(same_type)
        # Compare the description first, then the target.
        test_case.assertEqual(self.text, other.contents)
        test_case.assertEqual(self.link, other.link)