Add base link parsing.
This commit is contained in:
parent
e73ce5d480
commit
2372fc597c
@ -60,25 +60,57 @@ INACTIVE_TIME_STAMP_RE = re.compile(r"\[{}\]".format(BASE_TIME_STAMP_RE))
|
|||||||
# BASE_TIME_RANGE_RE = (r'(?P<start_year>\d{4})-(?P<start_month>\d{2})-(?P<start_day>\d{2}) (?P<start_dow>[^ ]+)((?P<start_hour>\d{1,2}):(?P<start_minute>\d{1,2}))?',
|
# BASE_TIME_RANGE_RE = (r'(?P<start_year>\d{4})-(?P<start_month>\d{2})-(?P<start_day>\d{2}) (?P<start_dow>[^ ]+)((?P<start_hour>\d{1,2}):(?P<start_minute>\d{1,2}))?',
|
||||||
# r'(?P<end_year>\d{4})-(?P<end_month>\d{2})-(?P<end_day>\d{2}) (?P<end_dow>[^ ]+)((?P<end_hour>\d{1,2}):(?P<end_minute>\d{1,2}))?')
|
# r'(?P<end_year>\d{4})-(?P<end_month>\d{2})-(?P<end_day>\d{2}) (?P<end_dow>[^ ]+)((?P<end_hour>\d{1,2}):(?P<end_minute>\d{1,2}))?')
|
||||||
|
|
||||||
def get_tokens(value):
    """Return the flat token list held by a parsed content item.

    A ``Text`` exposes its ``contents`` list directly; a ``RawLine`` is
    treated as a single token, its ``line``.

    Raises:
        TypeError: if ``value`` is neither a ``Text`` nor a ``RawLine``.
    """
    if isinstance(value, Text):
        return value.contents
    if isinstance(value, RawLine):
        return [value.line]
    # TypeError is still an Exception, so existing broad handlers keep working.
    raise TypeError("Unknown how to get tokens from: {}".format(value))


def get_links_from_content(content):
    """Yield a ``Link`` for every closed link found in ``content``.

    Walks the tokens returned by ``get_tokens(content)``.  String tokens seen
    between an OPEN_LINK and an OPEN_DESCRIPTION token accumulate into the
    link value; string tokens seen after OPEN_DESCRIPTION accumulate into the
    description.  A ``Link`` is emitted only when the CLOSE token arrives, so
    a link that is opened but never closed yields nothing.
    """
    in_link = False
    in_description = False
    link_value = []
    link_description = []

    for tok in get_tokens(content):
        if isinstance(tok, LinkToken):
            if tok.tok_type == LinkTokenType.OPEN_LINK:
                in_link = True
            elif tok.tok_type == LinkTokenType.OPEN_DESCRIPTION:
                in_description = True
            elif tok.tok_type == LinkTokenType.CLOSE:
                in_link = False
                in_description = False
                yield Link(''.join(link_value), ''.join(link_description))
                # Start the next link with clean buffers.
                link_value = []
                link_description = []
        elif isinstance(tok, str) and in_link:
            # Only plain strings are collected; any other token type that
            # appears inside a link is skipped.
            if in_description:
                link_description.append(tok)
            else:
                link_value.append(tok)
|
||||||
|
|
||||||
|
class Headline:
    """Attribute container for one parsed headline.

    Takes the same fields, in the same order, as the ``Headline`` namedtuple
    it replaces, and adds link querying on top.
    """

    def __init__(self, start_line, depth, orig, properties, keywords, priority_start, priority, title_start, title, tags_start, tags, contents, children, structural):
        self.start_line = start_line
        self.depth = depth
        self.orig = orig
        self.properties = properties
        self.keywords = keywords
        self.priority_start = priority_start
        self.priority = priority
        self.title_start = title_start
        self.title = title
        self.tags_start = tags_start
        self.tags = tags
        self.contents = contents
        self.children = children
        self.structural = structural

    def __repr__(self):
        # The namedtuple printed its fields for free; keep a short, readable
        # form so test failures and debugging output stay useful.
        return "{{Headline line: {}; depth: {}; title: {} }}".format(
            self.start_line, self.depth, self.title
        )

    def get_links(self):
        """Yield the links found in this headline's own ``contents``.

        NOTE(review): ``self.children`` is not visited here -- confirm
        whether callers expect links from nested headlines as well.
        """
        for content in self.contents:
            yield from get_links_from_content(content)
|
||||||
|
|
||||||
RawLine = collections.namedtuple("RawLine", ("linenum", "line"))
|
RawLine = collections.namedtuple("RawLine", ("linenum", "line"))
|
||||||
Keyword = collections.namedtuple(
|
Keyword = collections.namedtuple(
|
||||||
@ -120,12 +152,17 @@ for tok, mode in MARKERS.items():
|
|||||||
ModeToMarker[mode] = tok
|
ModeToMarker[mode] = tok
|
||||||
|
|
||||||
MarkerToken = collections.namedtuple("MarkerToken", ("closing", "tok_type"))
|
MarkerToken = collections.namedtuple("MarkerToken", ("closing", "tok_type"))
|
||||||
|
# Marks a link boundary inside tokenized contents; ``tok_type`` holds a
# LinkTokenType member.  The field list is a real one-element tuple:
# ("tok_type") without the comma is just a string that namedtuple happens
# to accept.
LinkToken = collections.namedtuple("LinkToken", ("tok_type",))


class LinkTokenType(Enum):
    """Kinds of link boundary a ``LinkToken`` can mark.

    The numeric values match the tokenizer's TOKEN_TYPE_OPEN_LINK (3),
    TOKEN_TYPE_CLOSE_LINK (4) and TOKEN_TYPE_OPEN_DESCRIPTION (5) constants.
    """

    OPEN_LINK = 3
    OPEN_DESCRIPTION = 5
    CLOSE = 4
|
||||||
|
|
||||||
BEGIN_PROPERTIES = "OPEN_PROPERTIES"
|
BEGIN_PROPERTIES = "OPEN_PROPERTIES"
|
||||||
END_PROPERTIES = "CLOSE_PROPERTIES"
|
END_PROPERTIES = "CLOSE_PROPERTIES"
|
||||||
|
|
||||||
def token_from_type(tok_type):
    """Return the marker string registered for ``tok_type`` in ``ModeToMarker``.

    Raises:
        KeyError: if ``tok_type`` has no entry in ``ModeToMarker``.
    """
    # The debugging print of the whole mapping was dropped: it wrote to
    # stdout on every serialized marker.
    return ModeToMarker[tok_type]
|
||||||
|
|
||||||
|
|
||||||
@ -211,16 +248,39 @@ class Line:
|
|||||||
return "".join(rawchunks) + "\n"
|
return "".join(rawchunks) + "\n"
|
||||||
|
|
||||||
|
|
||||||
|
class Link:
    """An Org link: a target ``value`` plus an optional ``description``."""

    def __init__(self, value, description):
        self.value = value
        self.description = description

    def __repr__(self):
        # Same layout as Text.__repr__ so mixed debug output reads uniformly.
        return "{{Link value: {}; description: {} }}".format(
            self.value, self.description
        )

    def get_raw(self):
        """Serialize the link back to Org syntax.

        Returns ``[[value][description]]`` when the description is truthy,
        and the bare ``[[value]]`` form when it is empty or ``None``.
        """
        if self.description:
            return '[[{}][{}]]'.format(self.value, self.description)
        return '[[{}]]'.format(self.value)
|
||||||
|
|
||||||
|
|
||||||
class Text:
|
class Text:
|
||||||
def __init__(self, contents, line):
|
def __init__(self, contents, line):
|
||||||
self.contents = contents
|
self.contents = contents
|
||||||
self.linenum = line
|
self.linenum = line
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return "{{Text line: {}; content: {} }}".format(self.linenum, self.contents)
|
||||||
|
|
||||||
def get_raw(self):
|
def get_raw(self):
|
||||||
contents = []
|
contents = []
|
||||||
for chunk in self.contents:
|
for chunk in self.contents:
|
||||||
if isinstance(chunk, str):
|
if isinstance(chunk, str):
|
||||||
contents.append(chunk)
|
contents.append(chunk)
|
||||||
|
elif isinstance(chunk, LinkToken):
|
||||||
|
if chunk.tok_type == LinkTokenType.OPEN_LINK:
|
||||||
|
contents.append('[[')
|
||||||
|
elif chunk.tok_type == LinkTokenType.OPEN_DESCRIPTION:
|
||||||
|
contents.append('][')
|
||||||
|
else:
|
||||||
|
assert chunk.tok_type == LinkTokenType.CLOSE
|
||||||
|
contents.append(']]')
|
||||||
else:
|
else:
|
||||||
assert isinstance(chunk, MarkerToken)
|
assert isinstance(chunk, MarkerToken)
|
||||||
contents.append(token_from_type(chunk.tok_type))
|
contents.append(token_from_type(chunk.tok_type))
|
||||||
@ -331,6 +391,9 @@ def is_post(char: str) -> bool:
|
|||||||
TOKEN_TYPE_TEXT = 0
|
TOKEN_TYPE_TEXT = 0
|
||||||
TOKEN_TYPE_OPEN_MARKER = 1
|
TOKEN_TYPE_OPEN_MARKER = 1
|
||||||
TOKEN_TYPE_CLOSE_MARKER = 2
|
TOKEN_TYPE_CLOSE_MARKER = 2
|
||||||
|
TOKEN_TYPE_OPEN_LINK = 3
|
||||||
|
TOKEN_TYPE_CLOSE_LINK = 4
|
||||||
|
TOKEN_TYPE_OPEN_DESCRIPTION = 5
|
||||||
|
|
||||||
|
|
||||||
def tokenize_contents(contents: str):
|
def tokenize_contents(contents: str):
|
||||||
@ -339,11 +402,78 @@ def tokenize_contents(contents: str):
|
|||||||
|
|
||||||
text = []
|
text = []
|
||||||
closes = set()
|
closes = set()
|
||||||
|
in_link = False
|
||||||
|
in_link_description = False
|
||||||
|
last_link_start = 0
|
||||||
|
|
||||||
for i, char in enumerate(contents):
|
def cut_string():
|
||||||
|
nonlocal text
|
||||||
|
nonlocal tokens
|
||||||
|
|
||||||
|
if len(text) > 0:
|
||||||
|
tokens.append((TOKEN_TYPE_TEXT, "".join(text)))
|
||||||
|
text = []
|
||||||
|
|
||||||
|
|
||||||
|
cursor = enumerate(contents)
|
||||||
|
for i, char in cursor:
|
||||||
has_changed = False
|
has_changed = False
|
||||||
|
|
||||||
if (
|
# Possible link opening
|
||||||
|
if char == '[':
|
||||||
|
if (len(contents) > i + 3
|
||||||
|
# At least 3 characters more to open and close a link
|
||||||
|
and contents[i + 1] == '['):
|
||||||
|
close = contents.find(']', i)
|
||||||
|
|
||||||
|
if close != -1 and contents[close + 1] == ']':
|
||||||
|
# Link with no description
|
||||||
|
cut_string()
|
||||||
|
|
||||||
|
in_link = True
|
||||||
|
tokens.append((TOKEN_TYPE_OPEN_LINK, None))
|
||||||
|
assert '[' == (next(cursor)[1])
|
||||||
|
last_link_start = i
|
||||||
|
continue
|
||||||
|
if close != -1 and contents[close + 1] == '[':
|
||||||
|
# Link with description?
|
||||||
|
|
||||||
|
close = contents.find(']', close + 1)
|
||||||
|
if close != -1 and contents[close + 1] == ']':
|
||||||
|
# No match here means this is not an Org link
|
||||||
|
cut_string()
|
||||||
|
|
||||||
|
in_link = True
|
||||||
|
tokens.append((TOKEN_TYPE_OPEN_LINK, None))
|
||||||
|
assert '[' == (next(cursor)[1])
|
||||||
|
last_link_start = i
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Possible link close or open of description
|
||||||
|
if char == ']' and in_link:
|
||||||
|
if contents[i + 1] == ']':
|
||||||
|
cut_string()
|
||||||
|
|
||||||
|
tokens.append((TOKEN_TYPE_CLOSE_LINK, None))
|
||||||
|
assert ']' == (next(cursor)[1])
|
||||||
|
in_link = False
|
||||||
|
in_link_description = False
|
||||||
|
continue
|
||||||
|
|
||||||
|
if contents[i + 1] == '[' and not in_link_description:
|
||||||
|
cut_string()
|
||||||
|
|
||||||
|
tokens.append((TOKEN_TYPE_OPEN_DESCRIPTION, None))
|
||||||
|
assert '[' == (next(cursor)[1])
|
||||||
|
continue
|
||||||
|
|
||||||
|
raise Exception("Link cannot contain ']' not followed by '[' or ']'. Starting with {}".format(contents[last_link_start:i + 10]))
|
||||||
|
|
||||||
|
if (in_link and not in_link_description):
|
||||||
|
# Link's pointer have no formatting
|
||||||
|
pass
|
||||||
|
|
||||||
|
elif (
|
||||||
(i not in closes)
|
(i not in closes)
|
||||||
and is_marker(char)
|
and is_marker(char)
|
||||||
and is_pre(last_char)
|
and is_pre(last_char)
|
||||||
@ -366,15 +496,11 @@ def tokenize_contents(contents: str):
|
|||||||
text_in_line |= is_body(contents[j])
|
text_in_line |= is_body(contents[j])
|
||||||
|
|
||||||
if is_valid_mark:
|
if is_valid_mark:
|
||||||
if len(text) > 0:
|
cut_string()
|
||||||
tokens.append((TOKEN_TYPE_TEXT, "".join(text)))
|
|
||||||
text = []
|
|
||||||
tokens.append((TOKEN_TYPE_OPEN_MARKER, char))
|
tokens.append((TOKEN_TYPE_OPEN_MARKER, char))
|
||||||
has_changed = True
|
has_changed = True
|
||||||
elif i in closes:
|
elif i in closes:
|
||||||
if len(text) > 0:
|
cut_string()
|
||||||
tokens.append((TOKEN_TYPE_TEXT, "".join(text)))
|
|
||||||
text = []
|
|
||||||
tokens.append((TOKEN_TYPE_CLOSE_MARKER, char))
|
tokens.append((TOKEN_TYPE_CLOSE_MARKER, char))
|
||||||
has_changed = True
|
has_changed = True
|
||||||
|
|
||||||
@ -406,6 +532,12 @@ def parse_contents(raw_contents: List[RawLine]):
|
|||||||
contents.append(MarkerToken(False, MARKERS[tok_val]))
|
contents.append(MarkerToken(False, MARKERS[tok_val]))
|
||||||
elif tok_type == TOKEN_TYPE_CLOSE_MARKER:
|
elif tok_type == TOKEN_TYPE_CLOSE_MARKER:
|
||||||
contents.append(MarkerToken(True, MARKERS[tok_val]))
|
contents.append(MarkerToken(True, MARKERS[tok_val]))
|
||||||
|
elif tok_type == TOKEN_TYPE_OPEN_LINK:
|
||||||
|
contents.append(LinkToken(LinkTokenType.OPEN_LINK))
|
||||||
|
elif tok_type == TOKEN_TYPE_OPEN_DESCRIPTION:
|
||||||
|
contents.append(LinkToken(LinkTokenType.OPEN_DESCRIPTION))
|
||||||
|
elif tok_type == TOKEN_TYPE_CLOSE_LINK:
|
||||||
|
contents.append(LinkToken(LinkTokenType.CLOSE))
|
||||||
|
|
||||||
return [Text(contents, current_line)]
|
return [Text(contents, current_line)]
|
||||||
|
|
||||||
@ -447,6 +579,13 @@ class OrgDom:
|
|||||||
raise NotImplementedError()
|
raise NotImplementedError()
|
||||||
|
|
||||||
## Querying
|
## Querying
|
||||||
|
def get_links(self):
    """Yield every link reachable from this document.

    Links reported by each entry of ``self.headlines`` come first (in
    list order), followed by links found in the document's own
    ``self.contents``.
    """
    for headline in self.headlines:
        yield from headline.get_links()

    for content in self.contents:
        yield from get_links_from_content(content)
|
||||||
|
|
||||||
def getProperties(self):
|
def getProperties(self):
|
||||||
return self.keywords
|
return self.keywords
|
||||||
|
|
||||||
|
@ -8,6 +8,6 @@
|
|||||||
:ID: 03-markup-first-level-id
|
:ID: 03-markup-first-level-id
|
||||||
:CREATED: [2020-01-01 Wed 01:01]
|
:CREATED: [2020-01-01 Wed 01:01]
|
||||||
:END:
|
:END:
|
||||||
This is a [[https://codigoparallevar.com][web link]].
|
This is a [[https://codigoparallevar.com/1][web link]].
|
||||||
|
|
||||||
This is an /italized [[https://codigoparallevar.com][web link]]/.
|
This is a /italized [[https://codigoparallevar.com/2][web link]]/.
|
||||||
|
@ -135,21 +135,31 @@ class TestSerde(unittest.TestCase):
|
|||||||
|
|
||||||
ex.assert_matches(self, doc)
|
ex.assert_matches(self, doc)
|
||||||
|
|
||||||
def test_links_file_03(self):
    """03-links.org: get_links() reports both links and the DOM matches."""
    with open(os.path.join(DIR, '03-links.org')) as f:
        doc = load(f)

    # One structural comparison checks count, order, value and description
    # together, so a failure shows every mismatch instead of only the first.
    links = [(link.value, link.description) for link in doc.get_links()]
    self.assertEqual(links, [
        ('https://codigoparallevar.com/1', 'web link'),
        ('https://codigoparallevar.com/2', 'web link'),
    ])

    ex = Dom(props=[('TITLE', '03-Links'),
                    ('DESCRIPTION', 'Simple org file to test links'),
                    ('TODO', 'TODO(t) PAUSED(p) | DONE(d)')],
             children=(HL('First level',
                          props=[
                              ('ID', '03-markup-first-level-id'),
                              ('CREATED', DT(2020, 1, 1, 1, 1)),
                          ],
                          content=[
                              SPAN("  This is a ", WEB_LINK("web link", "https://codigoparallevar.com/1"),
                                   ".\n"),
                              SPAN("\n"),
                              SPAN("  This is a ", ITALIC(["italized ", WEB_LINK("web link", "https://codigoparallevar.com/2")]),
                                   ".\n"),
                          ])))

    ex.assert_matches(self, doc)
|
||||||
|
Loading…
Reference in New Issue
Block a user