Add base link parsing.

2020-11-02 23:51:11 +01:00 · 2020-11-02 23:51:11 +01:00 · 2372fc597c
commit 2372fc597c
parent e73ce5d480
3 changed files with 195 additions and 46 deletions
--- a/org_dom/org_dom.py
+++ b/org_dom/org_dom.py
@ -60,25 +60,57 @@ INACTIVE_TIME_STAMP_RE = re.compile(r"\[{}\]".format(BASE_TIME_STAMP_RE))
 # BASE_TIME_RANGE_RE = (r'(?P<start_year>\d{4})-(?P<start_month>\d{2})-(?P<start_day>\d{2}) (?P<start_dow>[^ ]+)((?P<start_hour>\d{1,2}):(?P<start_minute>\d{1,2}))?',
 #                       r'(?P<end_year>\d{4})-(?P<end_month>\d{2})-(?P<end_day>\d{2}) (?P<end_dow>[^ ]+)((?P<end_hour>\d{1,2}):(?P<end_minute>\d{1,2}))?')
-Headline = collections.namedtuple(
+def get_tokens(value):
-    "Headline",
+    if isinstance(value, Text):
-    (
+        return value.contents
-        "start_line",
+    if isinstance(value, RawLine):
-        "depth",
+        return [value.line]
-        "orig",
+    raise Exception("Unknown how to get tokens from: {}".format(value))
-        "properties",
+
-        "keywords",
+def get_links_from_content(content):
-        "priority_start",
+    in_link = False
-        "priority",
+    in_description = False
-        "title_start",
+    link_value = []
-        "title",
+    link_description = []
-        "tags_start",
+
-        "tags",
+    for tok in get_tokens(content):
-        "contents",
+        if isinstance(tok, LinkToken):
-        "children",
+            if tok.tok_type == LinkTokenType.OPEN_LINK:
-        "structural",
+                in_link = True
-    ),
+            elif tok.tok_type == LinkTokenType.OPEN_DESCRIPTION:
-)
+                in_description = True
            elif tok.tok_type == LinkTokenType.CLOSE:
                in_link = False
                in_description = False
                yield Link(''.join(link_value), ''.join(link_description))
                link_value = []
                link_description = []
        elif isinstance(tok, str) and in_link:
            if in_description:
                link_description.append(tok)
            else:
                link_value.append(tok)
 class Headline:
    """Plain attribute container for the pieces of a headline.

    Every constructor argument is stored unchanged on the instance under
    the same name. Being a regular class, it can also carry behaviour such
    as :meth:`get_links`.
    """

    def __init__(
        self,
        start_line,
        depth,
        orig,
        properties,
        keywords,
        priority_start,
        priority,
        title_start,
        title,
        tags_start,
        tags,
        contents,
        children,
        structural,
    ):
        # Location of the headline and its original form.
        self.start_line = start_line
        self.depth = depth
        self.orig = orig

        # Metadata attached to the headline.
        self.properties = properties
        self.keywords = keywords

        # Header-line components, each paired with its start position.
        self.priority_start = priority_start
        self.priority = priority
        self.title_start = title_start
        self.title = title
        self.tags_start = tags_start
        self.tags = tags

        # Body and nested elements.
        self.contents = contents
        self.children = children
        self.structural = structural

    def get_links(self):
        """Yield each link found in this headline's own ``contents``.

        Only ``self.contents`` is scanned; ``self.children`` is not
        traversed here.
        """
        for chunk in self.contents:
            for link in get_links_from_content(chunk):
                yield link
 RawLine = collections.namedtuple("RawLine", ("linenum", "line"))
 Keyword = collections.namedtuple(
@ -120,12 +152,17 @@ for tok, mode in MARKERS.items():
    ModeToMarker[mode] = tok
 MarkerToken = collections.namedtuple("MarkerToken", ("closing", "tok_type"))
 # Structural token of a link inside parsed text; tok_type holds a LinkTokenType.
 # The field list is a real 1-tuple (note the trailing comma), matching MarkerToken.
 LinkToken = collections.namedtuple("LinkToken", ("tok_type",))
 class LinkTokenType(Enum):
    """Kinds of structural token that delimit a link inside text.

    The numeric values coincide with the TOKEN_TYPE_OPEN_LINK,
    TOKEN_TYPE_OPEN_DESCRIPTION and TOKEN_TYPE_CLOSE_LINK constants.
    """

    # Serialised as '[['.
    OPEN_LINK = 3
    # Serialised as ']['; separates the link value from its description.
    OPEN_DESCRIPTION = 5
    # Serialised as ']]'.
    CLOSE = 4
 BEGIN_PROPERTIES = "OPEN_PROPERTIES"
 END_PROPERTIES = "CLOSE_PROPERTIES"
 def token_from_type(tok_type):
    """Return the marker string registered for ``tok_type``.

    ``ModeToMarker`` is the inverse of ``MARKERS`` (mode -> marker text).
    A ``tok_type`` that is not registered propagates the lookup error
    from ``ModeToMarker``.
    """
    # No debug printing here: this runs once per marker while serialising
    # text, and library code should not write to stdout.
    return ModeToMarker[tok_type]
@ -211,16 +248,39 @@ class Line:
        return "".join(rawchunks) + "\n"
 class Link:
    """A link made of a target ``value`` and an optional ``description``."""

    def __init__(self, value, description):
        self.description = description
        self.value = value

    def get_raw(self):
        """Return the bracketed textual form of the link.

        A falsy description (``None`` or an empty string) produces the short
        ``[[value]]`` form; otherwise ``[[value][description]]`` is returned.
        """
        if not self.description:
            return '[[{}]]'.format(self.value)
        return '[[{}][{}]]'.format(self.value, self.description)
 class Text:
    def __init__(self, contents, line):
        self.contents = contents
        self.linenum = line
    def __repr__(self):
        return "{{Text line: {}; content: {} }}".format(self.linenum, self.contents)
    def get_raw(self):
        contents = []
        for chunk in self.contents:
            if isinstance(chunk, str):
                contents.append(chunk)
            elif isinstance(chunk, LinkToken):
                if chunk.tok_type == LinkTokenType.OPEN_LINK:
                    contents.append('[[')
                elif chunk.tok_type == LinkTokenType.OPEN_DESCRIPTION:
                    contents.append('][')
                else:
                    assert chunk.tok_type == LinkTokenType.CLOSE
                    contents.append(']]')
            else:
                assert isinstance(chunk, MarkerToken)
                contents.append(token_from_type(chunk.tok_type))
@ -331,6 +391,9 @@ def is_post(char: str) -> bool:
 # Type tags for the (type, value) pairs that tokenize_contents appends to its token list.
 TOKEN_TYPE_TEXT = 0
 TOKEN_TYPE_OPEN_MARKER = 1
 TOKEN_TYPE_CLOSE_MARKER = 2
 # Link delimiters; these values coincide with the members of LinkTokenType.
 TOKEN_TYPE_OPEN_LINK = 3
 TOKEN_TYPE_CLOSE_LINK = 4
 TOKEN_TYPE_OPEN_DESCRIPTION = 5
 def tokenize_contents(contents: str):
@ -339,11 +402,78 @@ def tokenize_contents(contents: str):
    text = []
    closes = set()
    in_link = False
    in_link_description = False
    last_link_start = 0
-    for i, char in enumerate(contents):
+    def cut_string():
        nonlocal text
        nonlocal tokens
        if len(text) > 0:
            tokens.append((TOKEN_TYPE_TEXT, "".join(text)))
            text = []
    cursor = enumerate(contents)
    for i, char in cursor:
        has_changed = False
-        if (
+        # Possible link opening
        if char == '[':
            if (len(contents) > i + 3
                # At least 3 characters more to open and close a link
                and contents[i + 1] == '['):
                close = contents.find(']', i)
                if close != -1 and contents[close + 1] == ']':
                    # Link with no description
                    cut_string()
                    in_link = True
                    tokens.append((TOKEN_TYPE_OPEN_LINK, None))
                    assert '[' == (next(cursor)[1])
                    last_link_start = i
                    continue
                if close != -1 and contents[close + 1] == '[':
                    # Link with description?
                    close = contents.find(']', close + 1)
                    if close != -1 and contents[close + 1] == ']':
                        # No match here means this is not an Org link
                        cut_string()
                        in_link = True
                        tokens.append((TOKEN_TYPE_OPEN_LINK, None))
                        assert '[' == (next(cursor)[1])
                        last_link_start = i
                        continue
        # Possible link close or open of description
        if char == ']' and in_link:
            if contents[i + 1] == ']':
                cut_string()
                tokens.append((TOKEN_TYPE_CLOSE_LINK, None))
                assert ']' == (next(cursor)[1])
                in_link = False
                in_link_description = False
                continue
            if contents[i + 1] == '[' and not in_link_description:
                cut_string()
                tokens.append((TOKEN_TYPE_OPEN_DESCRIPTION, None))
                assert '[' == (next(cursor)[1])
                continue
            raise Exception("Link cannot contain ']' not followed by '[' or ']'. Starting with {}".format(contents[last_link_start:i + 10]))
        if (in_link and not in_link_description):
            # Link's pointer have no formatting
            pass
        elif (
            (i not in closes)
            and is_marker(char)
            and is_pre(last_char)
@ -366,15 +496,11 @@ def tokenize_contents(contents: str):
                    text_in_line |= is_body(contents[j])
            if is_valid_mark:
-                if len(text) > 0:
+                cut_string()
                    tokens.append((TOKEN_TYPE_TEXT, "".join(text)))
                    text = []
                tokens.append((TOKEN_TYPE_OPEN_MARKER, char))
                has_changed = True
        elif i in closes:
-            if len(text) > 0:
+            cut_string()
                tokens.append((TOKEN_TYPE_TEXT, "".join(text)))
                text = []
            tokens.append((TOKEN_TYPE_CLOSE_MARKER, char))
            has_changed = True
@ -406,6 +532,12 @@ def parse_contents(raw_contents: List[RawLine]):
            contents.append(MarkerToken(False, MARKERS[tok_val]))
        elif tok_type == TOKEN_TYPE_CLOSE_MARKER:
            contents.append(MarkerToken(True, MARKERS[tok_val]))
        elif tok_type == TOKEN_TYPE_OPEN_LINK:
            contents.append(LinkToken(LinkTokenType.OPEN_LINK))
        elif tok_type == TOKEN_TYPE_OPEN_DESCRIPTION:
            contents.append(LinkToken(LinkTokenType.OPEN_DESCRIPTION))
        elif tok_type == TOKEN_TYPE_CLOSE_LINK:
            contents.append(LinkToken(LinkTokenType.CLOSE))
    return [Text(contents, current_line)]
@ -447,6 +579,13 @@ class OrgDom:
        raise NotImplementedError()
    ## Querying
    def get_links(self):
        """Yield every link in the document.

        Links reported by each headline come first, in headline order,
        followed by links found in the document's own ``contents``.
        """
        for hl in self.headlines:
            for link in hl.get_links():
                yield link
        for chunk in self.contents:
            for link in get_links_from_content(chunk):
                yield link
    def getProperties(self):
        """Return the object stored in ``self.keywords`` (not a copy)."""
        keywords = self.keywords
        return keywords
--- a/tests/03-links.org
+++ b/tests/03-links.org
@ -8,6 +8,6 @@
  :ID:       03-markup-first-level-id
  :CREATED:  [2020-01-01 Wed 01:01]
  :END:
-  This is a [[https://codigoparallevar.com][web link]].
+  This is a [[https://codigoparallevar.com/1][web link]].
-  This is an /italized [[https://codigoparallevar.com][web link]]/.
+  This is a /italized [[https://codigoparallevar.com/2][web link]]/.
--- a/tests/test_dom.py
+++ b/tests/test_dom.py
@ -135,21 +135,31 @@ class TestSerde(unittest.TestCase):
        ex.assert_matches(self, doc)
-    # def test_links_file_03(self):
+    def test_links_file_03(self):
-    #     with open(os.path.join(DIR, '03-links.org')) as f:
+        with open(os.path.join(DIR, '03-links.org')) as f:
-    #         doc = load(f)
+            doc = load(f)
-    #     ex = Dom(props=[('TITLE', '03-Links'),
+        links = list(doc.get_links())
-    #                     ('DESCRIPTION', 'Simple org file to test links'),
+        self.assertEqual(len(links), 2)
-    #                     ('TODO', 'TODO(t) PAUSED(p) |  DONE(d)')],
+        self.assertEqual(links[0].value, 'https://codigoparallevar.com/1')
-    #              children=(HL('First level',
+        self.assertEqual(links[0].description, 'web link')
    #                           props=[
    #                               ('ID', '03-markup-first-level-id'),
    #                               ('CREATED', DT(2020, 1, 1, 1, 1)),
    #                           ],
    #                           content=[
    #                               SPAN("  This is a ", WEB_LINK("web link", "https://codigoparallevar.com"),
    #                                    "."),
    #                           ])))
-    #     ex.assert_matches(self, doc)
+        self.assertEqual(links[1].value, 'https://codigoparallevar.com/2')
        self.assertEqual(links[1].description, 'web link')
        ex = Dom(props=[('TITLE', '03-Links'),
                        ('DESCRIPTION', 'Simple org file to test links'),
                        ('TODO', 'TODO(t) PAUSED(p) |  DONE(d)')],
                 children=(HL('First level',
                              props=[
                                  ('ID', '03-markup-first-level-id'),
                                  ('CREATED', DT(2020, 1, 1, 1, 1)),
                              ],
                              content=[
                                  SPAN("  This is a ", WEB_LINK("web link", "https://codigoparallevar.com/1"),
                                       ".\n"),
                                  SPAN("\n"),
                                  SPAN("  This is a ",  ITALIC(["italized ", WEB_LINK("web link", "https://codigoparallevar.com/2")]),
                                       ".\n"),
                              ])))
        ex.assert_matches(self, doc)