Add base link parsing.

This commit is contained in:
Sergio Martínez Portela 2020-11-02 23:51:11 +01:00
parent e73ce5d480
commit 2372fc597c
3 changed files with 195 additions and 46 deletions

View File

@ -60,25 +60,57 @@ INACTIVE_TIME_STAMP_RE = re.compile(r"\[{}\]".format(BASE_TIME_STAMP_RE))
# BASE_TIME_RANGE_RE = (r'(?P<start_year>\d{4})-(?P<start_month>\d{2})-(?P<start_day>\d{2}) (?P<start_dow>[^ ]+)((?P<start_hour>\d{1,2}):(?P<start_minute>\d{1,2}))?', # BASE_TIME_RANGE_RE = (r'(?P<start_year>\d{4})-(?P<start_month>\d{2})-(?P<start_day>\d{2}) (?P<start_dow>[^ ]+)((?P<start_hour>\d{1,2}):(?P<start_minute>\d{1,2}))?',
# r'(?P<end_year>\d{4})-(?P<end_month>\d{2})-(?P<end_day>\d{2}) (?P<end_dow>[^ ]+)((?P<end_hour>\d{1,2}):(?P<end_minute>\d{1,2}))?') # r'(?P<end_year>\d{4})-(?P<end_month>\d{2})-(?P<end_day>\d{2}) (?P<end_dow>[^ ]+)((?P<end_hour>\d{1,2}):(?P<end_minute>\d{1,2}))?')
Headline = collections.namedtuple( def get_tokens(value):
"Headline", if isinstance(value, Text):
( return value.contents
"start_line", if isinstance(value, RawLine):
"depth", return [value.line]
"orig", raise Exception("Unknown how to get tokens from: {}".format(value))
"properties",
"keywords", def get_links_from_content(content):
"priority_start", in_link = False
"priority", in_description = False
"title_start", link_value = []
"title", link_description = []
"tags_start",
"tags", for tok in get_tokens(content):
"contents", if isinstance(tok, LinkToken):
"children", if tok.tok_type == LinkTokenType.OPEN_LINK:
"structural", in_link = True
), elif tok.tok_type == LinkTokenType.OPEN_DESCRIPTION:
) in_description = True
elif tok.tok_type == LinkTokenType.CLOSE:
in_link = False
in_description = False
yield Link(''.join(link_value), ''.join(link_description))
link_value = []
link_description = []
elif isinstance(tok, str) and in_link:
if in_description:
link_description.append(tok)
else:
link_value.append(tok)
class Headline:
    """A parsed org-mode headline node.

    Replaces an earlier ``collections.namedtuple`` representation; the field
    names are unchanged, so existing attribute access by callers keeps
    working, and the class form allows methods such as ``get_links``.
    """

    def __init__(self, start_line, depth, orig, properties, keywords, priority_start, priority, title_start, title, tags_start, tags, contents, children, structural):
        # Position / structure. `start_line` is presumably the 1-based line
        # number where the headline begins — TODO confirm against the parser.
        self.start_line = start_line
        self.depth = depth          # headline nesting level (number of leading '*'s, presumably)
        self.orig = orig            # original raw form this node was parsed from
        # Metadata attached to the headline.
        self.properties = properties
        self.keywords = keywords
        # `*_start` fields look like column/offset positions of the
        # corresponding element within the headline line — verify at call site.
        self.priority_start = priority_start
        self.priority = priority
        self.title_start = title_start
        self.title = title
        self.tags_start = tags_start
        self.tags = tags
        # Body and sub-tree.
        self.contents = contents    # body elements (Text / RawLine items)
        self.children = children    # nested Headline nodes
        self.structural = structural

    def get_links(self):
        """Yield every Link found in this headline's body contents."""
        for content in self.contents:
            yield from get_links_from_content(content)
RawLine = collections.namedtuple("RawLine", ("linenum", "line")) RawLine = collections.namedtuple("RawLine", ("linenum", "line"))
Keyword = collections.namedtuple( Keyword = collections.namedtuple(
@ -120,12 +152,17 @@ for tok, mode in MARKERS.items():
ModeToMarker[mode] = tok ModeToMarker[mode] = tok
MarkerToken = collections.namedtuple("MarkerToken", ("closing", "tok_type")) MarkerToken = collections.namedtuple("MarkerToken", ("closing", "tok_type"))
# Marker token delimiting link structure inside parsed text content; the
# single field `tok_type` holds a LinkTokenType value.
# NOTE: the original passed ("tok_type") — a parenthesised *string*, not a
# 1-tuple.  namedtuple happens to accept a whitespace/comma-separated string
# of field names, so it worked by accident; the trailing comma makes the
# intended single-field tuple explicit.
LinkToken = collections.namedtuple("LinkToken", ("tok_type",))
class LinkTokenType(Enum):
    # Values mirror the module-level tokenizer constants
    # TOKEN_TYPE_OPEN_LINK (3), TOKEN_TYPE_CLOSE_LINK (4) and
    # TOKEN_TYPE_OPEN_DESCRIPTION (5).
    OPEN_LINK = 3         # start of a link: '[['
    OPEN_DESCRIPTION = 5  # separator before the description: ']['
    CLOSE = 4             # end of the link: ']]'
BEGIN_PROPERTIES = "OPEN_PROPERTIES" BEGIN_PROPERTIES = "OPEN_PROPERTIES"
END_PROPERTIES = "CLOSE_PROPERTIES" END_PROPERTIES = "CLOSE_PROPERTIES"
def token_from_type(tok_type):
    """Return the marker character for a formatting mode.

    Inverse lookup of the MARKERS table via ModeToMarker; raises KeyError
    if `tok_type` has no registered marker.
    """
    # Removed a stray debug `print(ModeToMarker, tok_type)` left in during
    # development — it wrote to stdout on every call.
    return ModeToMarker[tok_type]
@ -211,16 +248,39 @@ class Line:
return "".join(rawchunks) + "\n" return "".join(rawchunks) + "\n"
class Link:
    """An org-mode hyperlink: a target plus an optional description."""

    def __init__(self, value, description):
        self.value = value              # link target (e.g. a URL)
        self.description = description  # human-readable text; may be empty/None

    def get_raw(self):
        """Serialize back to org syntax.

        Produces ``[[value][description]]`` when a description is present,
        ``[[value]]`` otherwise.
        """
        # Guard-clause form: handle the description-less case first.
        if not self.description:
            return '[[{}]]'.format(self.value)
        return '[[{}][{}]]'.format(self.value, self.description)
class Text: class Text:
def __init__(self, contents, line): def __init__(self, contents, line):
self.contents = contents self.contents = contents
self.linenum = line self.linenum = line
    def __repr__(self):
        # Debug representation: line number plus the raw token list.
        return "{{Text line: {}; content: {} }}".format(self.linenum, self.contents)
def get_raw(self): def get_raw(self):
contents = [] contents = []
for chunk in self.contents: for chunk in self.contents:
if isinstance(chunk, str): if isinstance(chunk, str):
contents.append(chunk) contents.append(chunk)
elif isinstance(chunk, LinkToken):
if chunk.tok_type == LinkTokenType.OPEN_LINK:
contents.append('[[')
elif chunk.tok_type == LinkTokenType.OPEN_DESCRIPTION:
contents.append('][')
else:
assert chunk.tok_type == LinkTokenType.CLOSE
contents.append(']]')
else: else:
assert isinstance(chunk, MarkerToken) assert isinstance(chunk, MarkerToken)
contents.append(token_from_type(chunk.tok_type)) contents.append(token_from_type(chunk.tok_type))
@ -331,6 +391,9 @@ def is_post(char: str) -> bool:
TOKEN_TYPE_TEXT = 0 TOKEN_TYPE_TEXT = 0
TOKEN_TYPE_OPEN_MARKER = 1 TOKEN_TYPE_OPEN_MARKER = 1
TOKEN_TYPE_CLOSE_MARKER = 2 TOKEN_TYPE_CLOSE_MARKER = 2
TOKEN_TYPE_OPEN_LINK = 3
TOKEN_TYPE_CLOSE_LINK = 4
TOKEN_TYPE_OPEN_DESCRIPTION = 5
def tokenize_contents(contents: str): def tokenize_contents(contents: str):
@ -339,11 +402,78 @@ def tokenize_contents(contents: str):
text = [] text = []
closes = set() closes = set()
in_link = False
in_link_description = False
last_link_start = 0
for i, char in enumerate(contents): def cut_string():
nonlocal text
nonlocal tokens
if len(text) > 0:
tokens.append((TOKEN_TYPE_TEXT, "".join(text)))
text = []
cursor = enumerate(contents)
for i, char in cursor:
has_changed = False has_changed = False
if ( # Possible link opening
if char == '[':
if (len(contents) > i + 3
# At least 3 characters more to open and close a link
and contents[i + 1] == '['):
close = contents.find(']', i)
if close != -1 and contents[close + 1] == ']':
# Link with no description
cut_string()
in_link = True
tokens.append((TOKEN_TYPE_OPEN_LINK, None))
assert '[' == (next(cursor)[1])
last_link_start = i
continue
if close != -1 and contents[close + 1] == '[':
# Link with description?
close = contents.find(']', close + 1)
if close != -1 and contents[close + 1] == ']':
# No match here means this is not an Org link
cut_string()
in_link = True
tokens.append((TOKEN_TYPE_OPEN_LINK, None))
assert '[' == (next(cursor)[1])
last_link_start = i
continue
# Possible link close or open of description
if char == ']' and in_link:
if contents[i + 1] == ']':
cut_string()
tokens.append((TOKEN_TYPE_CLOSE_LINK, None))
assert ']' == (next(cursor)[1])
in_link = False
in_link_description = False
continue
if contents[i + 1] == '[' and not in_link_description:
cut_string()
tokens.append((TOKEN_TYPE_OPEN_DESCRIPTION, None))
assert '[' == (next(cursor)[1])
continue
raise Exception("Link cannot contain ']' not followed by '[' or ']'. Starting with {}".format(contents[last_link_start:i + 10]))
if (in_link and not in_link_description):
# Link's pointer have no formatting
pass
elif (
(i not in closes) (i not in closes)
and is_marker(char) and is_marker(char)
and is_pre(last_char) and is_pre(last_char)
@ -366,15 +496,11 @@ def tokenize_contents(contents: str):
text_in_line |= is_body(contents[j]) text_in_line |= is_body(contents[j])
if is_valid_mark: if is_valid_mark:
if len(text) > 0: cut_string()
tokens.append((TOKEN_TYPE_TEXT, "".join(text)))
text = []
tokens.append((TOKEN_TYPE_OPEN_MARKER, char)) tokens.append((TOKEN_TYPE_OPEN_MARKER, char))
has_changed = True has_changed = True
elif i in closes: elif i in closes:
if len(text) > 0: cut_string()
tokens.append((TOKEN_TYPE_TEXT, "".join(text)))
text = []
tokens.append((TOKEN_TYPE_CLOSE_MARKER, char)) tokens.append((TOKEN_TYPE_CLOSE_MARKER, char))
has_changed = True has_changed = True
@ -406,6 +532,12 @@ def parse_contents(raw_contents: List[RawLine]):
contents.append(MarkerToken(False, MARKERS[tok_val])) contents.append(MarkerToken(False, MARKERS[tok_val]))
elif tok_type == TOKEN_TYPE_CLOSE_MARKER: elif tok_type == TOKEN_TYPE_CLOSE_MARKER:
contents.append(MarkerToken(True, MARKERS[tok_val])) contents.append(MarkerToken(True, MARKERS[tok_val]))
elif tok_type == TOKEN_TYPE_OPEN_LINK:
contents.append(LinkToken(LinkTokenType.OPEN_LINK))
elif tok_type == TOKEN_TYPE_OPEN_DESCRIPTION:
contents.append(LinkToken(LinkTokenType.OPEN_DESCRIPTION))
elif tok_type == TOKEN_TYPE_CLOSE_LINK:
contents.append(LinkToken(LinkTokenType.CLOSE))
return [Text(contents, current_line)] return [Text(contents, current_line)]
@ -447,6 +579,13 @@ class OrgDom:
raise NotImplementedError() raise NotImplementedError()
## Querying ## Querying
    def get_links(self):
        """Yield every Link in the document.

        Walks all headlines (recursively, via Headline.get_links), then the
        top-level contents that sit outside any headline.
        """
        for headline in self.headlines:
            yield from headline.get_links()

        for content in self.contents:
            yield from get_links_from_content(content)
def getProperties(self): def getProperties(self):
return self.keywords return self.keywords

View File

@ -8,6 +8,6 @@
:ID: 03-markup-first-level-id :ID: 03-markup-first-level-id
:CREATED: [2020-01-01 Wed 01:01] :CREATED: [2020-01-01 Wed 01:01]
:END: :END:
This is a [[https://codigoparallevar.com][web link]]. This is a [[https://codigoparallevar.com/1][web link]].
This is an /italized [[https://codigoparallevar.com][web link]]/. This is a /italized [[https://codigoparallevar.com/2][web link]]/.

View File

@ -135,21 +135,31 @@ class TestSerde(unittest.TestCase):
ex.assert_matches(self, doc) ex.assert_matches(self, doc)
# def test_links_file_03(self): def test_links_file_03(self):
# with open(os.path.join(DIR, '03-links.org')) as f: with open(os.path.join(DIR, '03-links.org')) as f:
# doc = load(f) doc = load(f)
# ex = Dom(props=[('TITLE', '03-Links'), links = list(doc.get_links())
# ('DESCRIPTION', 'Simple org file to test links'), self.assertEqual(len(links), 2)
# ('TODO', 'TODO(t) PAUSED(p) | DONE(d)')], self.assertEqual(links[0].value, 'https://codigoparallevar.com/1')
# children=(HL('First level', self.assertEqual(links[0].description, 'web link')
# props=[
# ('ID', '03-markup-first-level-id'),
# ('CREATED', DT(2020, 1, 1, 1, 1)),
# ],
# content=[
# SPAN(" This is a ", WEB_LINK("web link", "https://codigoparallevar.com"),
# "."),
# ])))
# ex.assert_matches(self, doc) self.assertEqual(links[1].value, 'https://codigoparallevar.com/2')
self.assertEqual(links[1].description, 'web link')
ex = Dom(props=[('TITLE', '03-Links'),
('DESCRIPTION', 'Simple org file to test links'),
('TODO', 'TODO(t) PAUSED(p) | DONE(d)')],
children=(HL('First level',
props=[
('ID', '03-markup-first-level-id'),
('CREATED', DT(2020, 1, 1, 1, 1)),
],
content=[
SPAN(" This is a ", WEB_LINK("web link", "https://codigoparallevar.com/1"),
".\n"),
SPAN("\n"),
SPAN(" This is a ", ITALIC(["italized ", WEB_LINK("web link", "https://codigoparallevar.com/2")]),
".\n"),
])))
ex.assert_matches(self, doc)