From 6710775882e6649da3a2b2a6f05c150c849be516 Mon Sep 17 00:00:00 2001 From: Lyz Date: Sat, 25 Jan 2025 14:22:23 +0100 Subject: [PATCH 1/8] fix: strip token_list_to_plaintext otherwise when you do headline.title.get_text() you may have trailing whitespaces --- org_rw/org_rw.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/org_rw/org_rw.py b/org_rw/org_rw.py index 31b904c..4fc5da5 100644 --- a/org_rw/org_rw.py +++ b/org_rw/org_rw.py @@ -415,7 +415,6 @@ class Headline: isinstance(line, DelimiterLine) and line.delimiter_type == DelimiterLineType.END_BLOCK ): - start = current_node.header.linenum end = line.linenum @@ -815,7 +814,6 @@ class Headline: def set_property(self, name: str, value: str): for prop in self.properties: - # A matching property is found, update it if prop.key == name: prop.value = value @@ -1000,7 +998,6 @@ class Headline: and result_first[0] == "structural" and result_first[1].strip().upper() == ":RESULTS:" ): - (end_line, _) = self.get_structural_end_after( kword.linenum + 1 ) @@ -1795,7 +1792,7 @@ def token_list_to_plaintext(tok_list) -> str: else: assert isinstance(chunk, MarkerToken) - return "".join(contents) + return "".join(contents).strip() def token_list_to_raw(tok_list): @@ -2017,7 +2014,6 @@ def tokenize_contents(contents: str) -> List[TokenItems]: and is_pre(last_char) and ((i + 1 < len(contents)) and is_border(contents[i + 1])) ): - is_valid_mark = False # Check that is closed later text_in_line = True @@ -2408,7 +2404,6 @@ class OrgDoc: # Writing def dump_headline(self, headline, recursive=True): - tags = "" if len(headline.shallow_tags) > 0: tags = ":" + ":".join(headline.shallow_tags) + ":" @@ -2422,7 +2417,14 @@ class OrgDoc: if not (raw_title.endswith(" ") or raw_title.endswith("\t")) and tags: tags_padding = " " - yield "*" * headline.depth + headline.spacing + state + raw_title + tags_padding + tags + yield ( + "*" * headline.depth + + headline.spacing + + state + + raw_title + + 
tags_padding + + tags + ) planning = headline.get_planning_line() if planning is not None: From 123f5c911541928c3d40f26afb1feeb5f20dcc91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergio=20Mart=C3=ADnez=20Portela?= Date: Wed, 16 Apr 2025 00:05:24 +0200 Subject: [PATCH 2/8] test: Propose tests for title parsing changes. --- tests/14-titles.org | 12 ++++++++++++ tests/test_org.py | 18 ++++++++++++++++++ 2 files changed, 30 insertions(+) create mode 100644 tests/14-titles.org diff --git a/tests/14-titles.org b/tests/14-titles.org new file mode 100644 index 0000000..75b88a7 --- /dev/null +++ b/tests/14-titles.org @@ -0,0 +1,12 @@ +#+TITLE: 14-Simple +#+DESCRIPTION: Org file to evaluate titles +#+TODO: TODO(t) PAUSED(p) | DONE(d) + + +* Simple title + +* Simple title with tags :tag: + +* Simple title with trailing space + +* Simple title with leading space diff --git a/tests/test_org.py b/tests/test_org.py index a1fdff1..d6b4351 100644 --- a/tests/test_org.py +++ b/tests/test_org.py @@ -955,6 +955,24 @@ class TestSerde(unittest.TestCase): h1_2_h2 = h1_2.children[0] self.assertEqual(sorted(h1_2_h2.tags), ["otherh2tag"]) + def test_titles_file(self): + with open(os.path.join(DIR, "14-titles.org")) as f: + doc = load(f) + + h1, h2, h3, h4 = doc.getTopHeadlines() + self.assertEqual(h1.title.get_text(), "Simple title") + self.assertEqual(h2.title.get_text(), "Simple title with tags") + self.assertEqual(h3.title.get_text(), "Simple title with trailing space") + self.assertEqual(h4.title.get_text(), "Simple title with leading space") + + def test_mimic_write_file_14(self): + """A goal of this library is to be able to update a file without changing parts not directly modified.""" + with open(os.path.join(DIR, "14-titles.org")) as f: + orig = f.read() + doc = loads(orig) + + self.assertEqual(dumps(doc), orig) + def test_update_headline_from_none_to_todo(self): orig = "* First entry" doc = loads(orig) From 9c54f83ec7f4f868156bfc259a3e602e7d4fa083 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Sergio=20Mart=C3=ADnez=20Portela?= Date: Wed, 16 Apr 2025 00:06:00 +0200 Subject: [PATCH 3/8] revert: Remove old implementation change. This is reverted as it doesn't return accurately the information that's on the org-mode file. --- org_rw/org_rw.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/org_rw/org_rw.py b/org_rw/org_rw.py index a6ba0da..bc9657a 100644 --- a/org_rw/org_rw.py +++ b/org_rw/org_rw.py @@ -1816,7 +1816,7 @@ def token_list_to_plaintext(tok_list) -> str: else: assert isinstance(chunk, MarkerToken) - return "".join(contents).strip() + return "".join(contents) def token_list_to_raw(tok_list): From 527a9e7eb24599b15edadf5e41b66320ccaf5e85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergio=20Mart=C3=ADnez=20Portela?= Date: Wed, 16 Apr 2025 00:37:38 +0200 Subject: [PATCH 4/8] feat: Keep headline whitespaces info & remove them from title text. --- org_rw/org_rw.py | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/org_rw/org_rw.py b/org_rw/org_rw.py index bc9657a..5c00e75 100644 --- a/org_rw/org_rw.py +++ b/org_rw/org_rw.py @@ -67,7 +67,7 @@ BASE_ENVIRONMENT = { ), } -HEADLINE_TAGS_RE = re.compile(r"((:(\w|[0-9_@#%])+)+:)\s*$") +HEADLINE_TAGS_RE = re.compile(r"((?P<space_before_tags>\s+)(:(\w|[0-9_@#%])+)+:)(?P<space_after_tags>\s*)$") HEADLINE_RE = re.compile(r"^(?P\*+)(?P\s+)(?P.*?)$") KEYWORDS_RE = re.compile( r"^(?P\s*)#\+(?P[^:\[]+)(\[(?P[^\]]*)\])?:(?P\s*)(?P.*)$" @@ -315,6 +315,8 @@ class Headline: state, tags_start, tags, + space_before_tags, + space_after_tags, contents, children, structural, @@ -340,6 +342,8 @@ class Headline: self.title = parse_content_block([RawLine(linenum=start_line, line=title)]) self._state = state self.tags_start = tags_start + self.space_before_tags = space_before_tags + self.space_after_tags = space_after_tags self.shallow_tags = tags self.contents = contents self.children = children @@ -2182,8 +2186,11 @@ def parse_headline(hl, doc, parent) -> Headline: if hl_tags is None: tags = [] 
+ space_before_tags = space_after_tags = '' else: - tags = hl_tags.group(0)[1:-1].split(":") + tags = hl_tags.group(0).strip()[1:-1].split(":") + space_before_tags = hl_tags.group('space_before_tags') or '' + space_after_tags = hl_tags.group('space_after_tags') or '' line = HEADLINE_TAGS_RE.sub("", line) hl_state = None @@ -2203,6 +2210,13 @@ def parse_headline(hl, doc, parent) -> Headline: is_done = True break + if len(tags) == 0: + # No tags, so title might contain trailing whitespaces, handle it + title_ends_with_whitespace_match = re.search(r'\s+$', title) + if title_ends_with_whitespace_match is not None: + space_before_tags = title_ends_with_whitespace_match.group(0) + title = title[:-len(space_before_tags)] + contents = parse_contents(hl["contents"]) if not (isinstance(parent, OrgDoc) or depth > parent.depth): @@ -2229,6 +2243,8 @@ def parse_headline(hl, doc, parent) -> Headline: priority_start=None, tags_start=None, tags=tags, + space_before_tags=space_before_tags, + space_after_tags=space_after_tags, parent=parent, is_todo=is_todo, is_done=is_done, @@ -2430,25 +2446,21 @@ class OrgDoc: # Writing def dump_headline(self, headline, recursive=True): - tags = "" + tags = headline.space_before_tags if len(headline.shallow_tags) > 0: - tags = ":" + ":".join(headline.shallow_tags) + ":" + tags += ":" + ":".join(headline.shallow_tags) + ":" + headline.space_after_tags state = "" if headline._state: state = headline._state["name"] + " " raw_title = token_list_to_raw(headline.title.contents) - tags_padding = "" - if not (raw_title.endswith(" ") or raw_title.endswith("\t")) and tags: - tags_padding = " " yield ( "*" * headline.depth + headline.spacing + state + raw_title - + tags_padding + tags ) From 14e344981bca94cb91e751dfbc39a3bd616580cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergio=20Mart=C3=ADnez=20Portela?= Date: Wed, 16 Apr 2025 00:39:08 +0200 Subject: [PATCH 5/8] format: Apply black formatter. 
--- org_rw/org_rw.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/org_rw/org_rw.py b/org_rw/org_rw.py index 5c00e75..fdbd177 100644 --- a/org_rw/org_rw.py +++ b/org_rw/org_rw.py @@ -67,7 +67,9 @@ BASE_ENVIRONMENT = { ), } -HEADLINE_TAGS_RE = re.compile(r"((?P<space_before_tags>\s+)(:(\w|[0-9_@#%])+)+:)(?P<space_after_tags>\s*)$") +HEADLINE_TAGS_RE = re.compile( + r"((?P<space_before_tags>\s+)(:(\w|[0-9_@#%])+)+:)(?P<space_after_tags>\s*)$" +) HEADLINE_RE = re.compile(r"^(?P\*+)(?P\s+)(?P.*?)$") KEYWORDS_RE = re.compile( r"^(?P\s*)#\+(?P[^:\[]+)(\[(?P[^\]]*)\])?:(?P\s*)(?P.*)$" @@ -2186,11 +2188,11 @@ def parse_headline(hl, doc, parent) -> Headline: if hl_tags is None: tags = [] - space_before_tags = space_after_tags = '' + space_before_tags = space_after_tags = "" else: tags = hl_tags.group(0).strip()[1:-1].split(":") - space_before_tags = hl_tags.group('space_before_tags') or '' - space_after_tags = hl_tags.group('space_after_tags') or '' + space_before_tags = hl_tags.group("space_before_tags") or "" + space_after_tags = hl_tags.group("space_after_tags") or "" line = HEADLINE_TAGS_RE.sub("", line) hl_state = None @@ -2212,10 +2214,10 @@ def parse_headline(hl, doc, parent) -> Headline: if len(tags) == 0: # No tags, so title might contain trailing whitespaces, handle it - title_ends_with_whitespace_match = re.search(r'\s+$', title) + title_ends_with_whitespace_match = re.search(r"\s+$", title) if title_ends_with_whitespace_match is not None: space_before_tags = title_ends_with_whitespace_match.group(0) - title = title[:-len(space_before_tags)] + title = title[: -len(space_before_tags)] contents = parse_contents(hl["contents"]) @@ -2448,7 +2450,9 @@ class OrgDoc: def dump_headline(self, headline, recursive=True): tags = headline.space_before_tags if len(headline.shallow_tags) > 0: - tags += ":" + ":".join(headline.shallow_tags) + ":" + headline.space_after_tags + tags += ( + ":" + ":".join(headline.shallow_tags) + ":" + headline.space_after_tags + ) state = "" if headline._state: @@ 
-2456,13 +2460,7 @@ class OrgDoc: raw_title = token_list_to_raw(headline.title.contents) - yield ( - "*" * headline.depth - + headline.spacing - + state - + raw_title - + tags - ) + yield ("*" * headline.depth + headline.spacing + state + raw_title + tags) planning = headline.get_planning_line() if planning is not None: From 3193ecbc363fea3f4c7f0c163f67f6cc68825711 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergio=20Mart=C3=ADnez=20Portela?= Date: Wed, 16 Apr 2025 00:41:27 +0200 Subject: [PATCH 6/8] fix: Creation of new headlines. --- org_rw/org_rw.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/org_rw/org_rw.py b/org_rw/org_rw.py index fdbd177..77644b7 100644 --- a/org_rw/org_rw.py +++ b/org_rw/org_rw.py @@ -1078,6 +1078,8 @@ class Headline: state="", tags_start=None, tags=[], + space_before_tags="", + space_after_tags="", contents=[], children=[], structural=[], From f936bccf7f668d356bdf1cee596d94a6a2e567a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergio=20Mart=C3=ADnez=20Portela?= Date: Wed, 16 Apr 2025 00:46:52 +0200 Subject: [PATCH 7/8] doc: Add a small "Principles" section to README. --- README.org | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/README.org b/README.org index 95ec98a..253c8f6 100644 --- a/README.org +++ b/README.org @@ -7,6 +7,10 @@ A python library to parse, modify and save Org-mode files. - Modify these data and write it back to disk. - Keep the original structure intact (indentation, spaces, format, ...). +** Principles +- Data structures should be exposed as it's read on Emacs's org-mode or when in doubt as raw as possible. +- Data in the objects should be modificable, as a way to update the document itself. *Consider this a Object-oriented design.* +- *Modification of the original text if there's no change is considered a bug (see [[id:7363ba38-1662-4d3c-9e83-0999824975b7][Known issues]]).* ** Safety mechanism As this library is still in early development. Running it over files might produce unexpected changes on them. 
For this reason it's heavily recommended to @@ -21,6 +25,9 @@ Also, see [[id:76e77f7f-c9e0-4c83-ad2f-39a5a8894a83][Known issues:Structure modi not properly stored and can trigger this safety mechanism on a false-positive. * Known issues +:PROPERTIES: +:ID: 7363ba38-1662-4d3c-9e83-0999824975b7 +:END: ** Structure modifications :PROPERTIES: :ID: 76e77f7f-c9e0-4c83-ad2f-39a5a8894a83 From 55fc87cfdcef23eea402148c0a237976988107cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergio=20Mart=C3=ADnez=20Portela?= Date: Wed, 16 Apr 2025 01:00:09 +0200 Subject: [PATCH 8/8] Add absence of dependencies as principle. --- README.org | 6 ++++-- requirements.txt | 1 - 2 files changed, 4 insertions(+), 3 deletions(-) delete mode 100644 requirements.txt diff --git a/README.org b/README.org index 253c8f6..6f03720 100644 --- a/README.org +++ b/README.org @@ -8,9 +8,11 @@ A python library to parse, modify and save Org-mode files. - Keep the original structure intact (indentation, spaces, format, ...). ** Principles -- Data structures should be exposed as it's read on Emacs's org-mode or when in doubt as raw as possible. -- Data in the objects should be modificable, as a way to update the document itself. *Consider this a Object-oriented design.* +- Avoid any dependency outside of Python's standard library. +- Don't do anything outside of the scope of parsing/re-serializing Org-mode files. - *Modification of the original text if there's no change is considered a bug (see [[id:7363ba38-1662-4d3c-9e83-0999824975b7][Known issues]]).* +- Data structures should be exposed as they're read in Emacs's org-mode or, when in doubt, as raw as possible. +- Data in the objects should be modifiable as a way to update the document itself. *Consider this an object-oriented design.* ** Safety mechanism As this library is still in early development. Running it over files might produce unexpected changes on them. 
For this reason it's heavily recommended to diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 1c51c66..0000000 --- a/requirements.txt +++ /dev/null @@ -1 +0,0 @@ -# No external requirements at this point