From 2372fc597c16f4bc4e8da24a04e27e082ef732d8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sergio=20Mart=C3=ADnez=20Portela?=
 <sergio@codigoparallevar.com>
Date: Mon, 2 Nov 2020 23:51:11 +0100
Subject: [PATCH] Add base link parsing.

---
 org_dom/org_dom.py | 195 ++++++++++++++++++++++++++++++++++++++-------
 tests/03-links.org |   4 +-
 tests/test_dom.py  |  42 ++++++----
 3 files changed, 195 insertions(+), 46 deletions(-)

diff --git a/org_dom/org_dom.py b/org_dom/org_dom.py
index 98fa5ff..36b905b 100644
--- a/org_dom/org_dom.py
+++ b/org_dom/org_dom.py
@@ -60,25 +60,57 @@ INACTIVE_TIME_STAMP_RE = re.compile(r"\[{}\]".format(BASE_TIME_STAMP_RE))
 # BASE_TIME_RANGE_RE = (r'(?P<start_year>\d{4})-(?P<start_month>\d{2})-(?P<start_day>\d{2}) (?P<start_dow>[^ ]+)((?P<start_hour>\d{1,2}):(?P<start_minute>\d{1,2}))?',
 #                       r'(?P<end_year>\d{4})-(?P<end_month>\d{2})-(?P<end_day>\d{2}) (?P<end_dow>[^ ]+)((?P<end_hour>\d{1,2}):(?P<end_minute>\d{1,2}))?')
 
-Headline = collections.namedtuple(
-    "Headline",
-    (
-        "start_line",
-        "depth",
-        "orig",
-        "properties",
-        "keywords",
-        "priority_start",
-        "priority",
-        "title_start",
-        "title",
-        "tags_start",
-        "tags",
-        "contents",
-        "children",
-        "structural",
-    ),
-)
+def get_tokens(value):
+    if isinstance(value, Text):
+        return value.contents
+    if isinstance(value, RawLine):
+        return [value.line]
+    raise Exception("Unknown how to get tokens from: {}".format(value))
+
+def get_links_from_content(content):
+    in_link = False
+    in_description = False
+    link_value = []
+    link_description = []
+
+    for tok in get_tokens(content):
+        if isinstance(tok, LinkToken):
+            if tok.tok_type == LinkTokenType.OPEN_LINK:
+                in_link = True
+            elif tok.tok_type == LinkTokenType.OPEN_DESCRIPTION:
+                in_description = True
+            elif tok.tok_type == LinkTokenType.CLOSE:
+                in_link = False
+                in_description = False
+                yield Link(''.join(link_value), ''.join(link_description))
+                link_value = []
+                link_description = []
+        elif isinstance(tok, str) and in_link:
+            if in_description:
+                link_description.append(tok)
+            else:
+                link_value.append(tok)
+
+class Headline:
+    def __init__(self, start_line, depth, orig, properties, keywords, priority_start, priority, title_start, title, tags_start, tags, contents, children, structural):
+        self.start_line = start_line
+        self.depth = depth
+        self.orig = orig
+        self.properties = properties
+        self.keywords = keywords
+        self.priority_start = priority_start
+        self.priority = priority
+        self.title_start = title_start
+        self.title = title
+        self.tags_start = tags_start
+        self.tags = tags
+        self.contents = contents
+        self.children = children
+        self.structural = structural
+
+    def get_links(self):
+        for content in self.contents:
+            yield from get_links_from_content(content)
 
 RawLine = collections.namedtuple("RawLine", ("linenum", "line"))
 Keyword = collections.namedtuple(
@@ -120,12 +152,17 @@ for tok, mode in MARKERS.items():
     ModeToMarker[mode] = tok
 
 MarkerToken = collections.namedtuple("MarkerToken", ("closing", "tok_type"))
+LinkToken = collections.namedtuple("LinkToken", ("tok_type"))
+
+class LinkTokenType(Enum):
+    OPEN_LINK = 3
+    OPEN_DESCRIPTION = 5
+    CLOSE = 4
 
 BEGIN_PROPERTIES = "OPEN_PROPERTIES"
 END_PROPERTIES = "CLOSE_PROPERTIES"
 
 def token_from_type(tok_type):
-    print(ModeToMarker, tok_type)
     return ModeToMarker[tok_type]
 
 
@@ -211,16 +248,39 @@ class Line:
         return "".join(rawchunks) + "\n"
 
 
+class Link:
+    def __init__(self, value, description):
+        self.value = value
+        self.description = description
+
+    def get_raw(self):
+        if self.description:
+            return '[[{}][{}]]'.format(self.value, self.description)
+        else:
+            return '[[{}]]'.format(self.value)
+
+
 class Text:
     def __init__(self, contents, line):
         self.contents = contents
         self.linenum = line
 
+    def __repr__(self):
+        return "{{Text line: {}; content: {} }}".format(self.linenum, self.contents)
+
     def get_raw(self):
         contents = []
         for chunk in self.contents:
             if isinstance(chunk, str):
                 contents.append(chunk)
+            elif isinstance(chunk, LinkToken):
+                if chunk.tok_type == LinkTokenType.OPEN_LINK:
+                    contents.append('[[')
+                elif chunk.tok_type == LinkTokenType.OPEN_DESCRIPTION:
+                    contents.append('][')
+                else:
+                    assert chunk.tok_type == LinkTokenType.CLOSE
+                    contents.append(']]')
             else:
                 assert isinstance(chunk, MarkerToken)
                 contents.append(token_from_type(chunk.tok_type))
@@ -331,6 +391,9 @@ def is_post(char: str) -> bool:
 TOKEN_TYPE_TEXT = 0
 TOKEN_TYPE_OPEN_MARKER = 1
 TOKEN_TYPE_CLOSE_MARKER = 2
+TOKEN_TYPE_OPEN_LINK = 3
+TOKEN_TYPE_CLOSE_LINK = 4
+TOKEN_TYPE_OPEN_DESCRIPTION = 5
 
 
 def tokenize_contents(contents: str):
@@ -339,11 +402,78 @@ def tokenize_contents(contents: str):
 
     text = []
     closes = set()
+    in_link = False
+    in_link_description = False
+    last_link_start = 0
 
-    for i, char in enumerate(contents):
+    def cut_string():
+        nonlocal text
+        nonlocal tokens
+
+        if len(text) > 0:
+            tokens.append((TOKEN_TYPE_TEXT, "".join(text)))
+            text = []
+
+
+    cursor = enumerate(contents)
+    for i, char in cursor:
         has_changed = False
 
-        if (
+        # Possible link opening
+        if char == '[':
+            if (len(contents) > i + 3
+                # At least 3 characters more to open and close a link
+                and contents[i + 1] == '['):
+                close = contents.find(']', i)
+
+                if close != -1 and contents[close + 1] == ']':
+                    # Link with no description
+                    cut_string()
+
+                    in_link = True
+                    tokens.append((TOKEN_TYPE_OPEN_LINK, None))
+                    assert '[' == (next(cursor)[1])
+                    last_link_start = i
+                    continue
+                if close != -1 and contents[close + 1] == '[':
+                    # Link with description?
+
+                    close = contents.find(']', close + 1)
+                    if close != -1 and contents[close + 1] == ']':
+                        # No match here means this is not an Org link
+                        cut_string()
+
+                        in_link = True
+                        tokens.append((TOKEN_TYPE_OPEN_LINK, None))
+                        assert '[' == (next(cursor)[1])
+                        last_link_start = i
+                        continue
+
+        # Possible link close or open of description
+        if char == ']' and in_link:
+            if contents[i + 1] == ']':
+                cut_string()
+
+                tokens.append((TOKEN_TYPE_CLOSE_LINK, None))
+                assert ']' == (next(cursor)[1])
+                in_link = False
+                in_link_description = False
+                continue
+
+            if contents[i + 1] == '[' and not in_link_description:
+                cut_string()
+
+                tokens.append((TOKEN_TYPE_OPEN_DESCRIPTION, None))
+                assert '[' == (next(cursor)[1])
+                continue
+
+            raise Exception("Link cannot contain ']' not followed by '[' or ']'. Starting with {}".format(contents[last_link_start:i + 10]))
+
+        if (in_link and not in_link_description):
+            # Link's pointer have no formatting
+            pass
+
+        elif (
             (i not in closes)
             and is_marker(char)
             and is_pre(last_char)
@@ -366,15 +496,11 @@ def tokenize_contents(contents: str):
                     text_in_line |= is_body(contents[j])
 
             if is_valid_mark:
-                if len(text) > 0:
-                    tokens.append((TOKEN_TYPE_TEXT, "".join(text)))
-                    text = []
+                cut_string()
                 tokens.append((TOKEN_TYPE_OPEN_MARKER, char))
                 has_changed = True
         elif i in closes:
-            if len(text) > 0:
-                tokens.append((TOKEN_TYPE_TEXT, "".join(text)))
-                text = []
+            cut_string()
             tokens.append((TOKEN_TYPE_CLOSE_MARKER, char))
             has_changed = True
 
@@ -406,6 +532,12 @@ def parse_contents(raw_contents: List[RawLine]):
             contents.append(MarkerToken(False, MARKERS[tok_val]))
         elif tok_type == TOKEN_TYPE_CLOSE_MARKER:
             contents.append(MarkerToken(True, MARKERS[tok_val]))
+        elif tok_type == TOKEN_TYPE_OPEN_LINK:
+            contents.append(LinkToken(LinkTokenType.OPEN_LINK))
+        elif tok_type == TOKEN_TYPE_OPEN_DESCRIPTION:
+            contents.append(LinkToken(LinkTokenType.OPEN_DESCRIPTION))
+        elif tok_type == TOKEN_TYPE_CLOSE_LINK:
+            contents.append(LinkToken(LinkTokenType.CLOSE))
 
     return [Text(contents, current_line)]
 
@@ -447,6 +579,13 @@ class OrgDom:
         raise NotImplementedError()
 
     ## Querying
+    def get_links(self):
+        for headline in self.headlines:
+            yield from headline.get_links()
+
+        for content in self.contents:
+            yield from get_links_from_content(content)
+
     def getProperties(self):
         return self.keywords
 
diff --git a/tests/03-links.org b/tests/03-links.org
index e31e9ad..cea26fb 100644
--- a/tests/03-links.org
+++ b/tests/03-links.org
@@ -8,6 +8,6 @@
   :ID:       03-markup-first-level-id
   :CREATED:  [2020-01-01 Wed 01:01]
   :END:
-  This is a [[https://codigoparallevar.com][web link]].
+  This is a [[https://codigoparallevar.com/1][web link]].
 
-  This is an /italized [[https://codigoparallevar.com][web link]]/.
+  This is a /italized [[https://codigoparallevar.com/2][web link]]/.
diff --git a/tests/test_dom.py b/tests/test_dom.py
index 9321502..a0010cf 100644
--- a/tests/test_dom.py
+++ b/tests/test_dom.py
@@ -135,21 +135,31 @@ class TestSerde(unittest.TestCase):
 
         ex.assert_matches(self, doc)
 
-    # def test_links_file_03(self):
-    #     with open(os.path.join(DIR, '03-links.org')) as f:
-    #         doc = load(f)
+    def test_links_file_03(self):
+        with open(os.path.join(DIR, '03-links.org')) as f:
+            doc = load(f)
 
-    #     ex = Dom(props=[('TITLE', '03-Links'),
-    #                     ('DESCRIPTION', 'Simple org file to test links'),
-    #                     ('TODO', 'TODO(t) PAUSED(p) |  DONE(d)')],
-    #              children=(HL('First level',
-    #                           props=[
-    #                               ('ID', '03-markup-first-level-id'),
-    #                               ('CREATED', DT(2020, 1, 1, 1, 1)),
-    #                           ],
-    #                           content=[
-    #                               SPAN("  This is a ", WEB_LINK("web link", "https://codigoparallevar.com"),
-    #                                    "."),
-    #                           ])))
+        links = list(doc.get_links())
+        self.assertEqual(len(links), 2)
+        self.assertEqual(links[0].value, 'https://codigoparallevar.com/1')
+        self.assertEqual(links[0].description, 'web link')
 
-    #     ex.assert_matches(self, doc)
+        self.assertEqual(links[1].value, 'https://codigoparallevar.com/2')
+        self.assertEqual(links[1].description, 'web link')
+        ex = Dom(props=[('TITLE', '03-Links'),
+                        ('DESCRIPTION', 'Simple org file to test links'),
+                        ('TODO', 'TODO(t) PAUSED(p) |  DONE(d)')],
+                 children=(HL('First level',
+                              props=[
+                                  ('ID', '03-markup-first-level-id'),
+                                  ('CREATED', DT(2020, 1, 1, 1, 1)),
+                              ],
+                              content=[
+                                  SPAN("  This is a ", WEB_LINK("web link", "https://codigoparallevar.com/1"),
+                                       ".\n"),
+                                  SPAN("\n"),
+                                  SPAN("  This is a ",  ITALIC(["italized ", WEB_LINK("web link", "https://codigoparallevar.com/2")]),
+                                       ".\n"),
+                              ])))
+
+        ex.assert_matches(self, doc)