commit d29058cb5ee0aa9950e793d96f8f6bf3b439f011 Author: Sergio Martínez Portela Date: Sun Jun 21 21:27:40 2020 +0200 Initial commit, simplistic parsing. diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a81c8ee --- /dev/null +++ b/.gitignore @@ -0,0 +1,138 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
diff --git a/org_dom/__init__.py b/org_dom/__init__.py
new file mode 100644
index 0000000..cff8cf0
--- /dev/null
+++ b/org_dom/__init__.py
@@ -0,0 +1 @@
+from .org_dom import OrgDom, load, loads
diff --git a/org_dom/org_dom.py b/org_dom/org_dom.py
new file mode 100644
index 0000000..c3e3f53
--- /dev/null
+++ b/org_dom/org_dom.py
@@ -0,0 +1,289 @@
+import re
+import collections
+from typing import List
+
+BASE_ENVIRONMENT = {
+    'org-footnote-section': 'Footnotes',
+    # Every entry needs its own comma: adjacent string literals are silently
+    # concatenated, which used to fuse "TAGS:" and "TITLE:" into one entry.
+    'org-options-keywords': (
+        "ARCHIVE:",
+        "AUTHOR:",
+        "BIND:",
+        "CATEGORY:",
+        "COLUMNS:",
+        "CREATOR:",
+        "DATE:",
+        "DESCRIPTION:",
+        "DRAWERS:",
+        "EMAIL:",
+        "EXCLUDE_TAGS:",
+        "FILETAGS:",
+        "INCLUDE:",
+        "INDEX:",
+        "KEYWORDS:",
+        "LANGUAGE:",
+        "MACRO:",
+        "OPTIONS:",
+        "PROPERTY:",
+        "PRIORITIES:",
+        "SELECT_TAGS:",
+        "SEQ_TODO:",
+        "SETUPFILE:",
+        "STARTUP:",
+        "TAGS:",
+        "TITLE:",
+        "TODO:",
+        "TYP_TODO:",
+        "SELECT_TAGS:",
+        "EXCLUDE_TAGS:"
+    ),
+}
+
+
+# One pattern per kind of line the reader understands. Only the groups the
+# code looks up by name matter (stars, line, key, options, value and the
+# timestamp fields); the other group names are descriptive labels.
+HEADLINE_RE = re.compile(r'^(?P<stars>\*+) (?P<spacing>\s*)(?P<line>.*)$')
+KEYWORDS_RE = re.compile(r'^(?P<indentation>\s*)#\+(?P<key>[^:\[]+)(\[(?P<options>[^\]]*)\])?:(?P<spacing>\s*)(?P<value>.*)$')
+PROPERTY_DRAWER_RE = re.compile(r'^(?P<indentation>\s*):PROPERTIES:(?P<end_indentation>\s*)$')
+DRAWER_END_RE = re.compile(r'^(?P<indentation>\s*):END:(?P<end_indentation>\s*)$')
+NODE_PROPERTIES_RE = re.compile(r'^(?P<indentation>\s*):(?P<key>[^+:]+)(?P<plus>\+)?:(?P<spacing>\s*)(?P<value>.*)$')
+RAW_LINE_RE = re.compile(r'^\s*([^\s#:*]|$)')
+BASE_TIME_STAMP_RE = r'(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2}) (?P<dow>[^ ]+)( (?P<start_hour>\d{1,2}):(?P<start_minute>\d{1,2})(--(?P<end_hour>\d{1,2}):(?P<end_minute>\d{1,2}))?)?'
+
+ACTIVE_TIME_STAMP_RE = re.compile(r'<{}>'.format(BASE_TIME_STAMP_RE))
+INACTIVE_TIME_STAMP_RE = re.compile(r'\[{}\]'.format(BASE_TIME_STAMP_RE))
+
+# TODO: dedicated pattern for ranges whose two ends carry different dates.
+
+Headline = collections.namedtuple('Headline', ('start_line', 'depth',
+                                               'keyword_start', 'keyword',
+                                               'priority_start', 'priority',
+                                               'title_start', 'title',
+                                               'tags_start', 'tags',
+                                               'content',
+                                               'children',
+))
+
+Property = collections.namedtuple('Property', ('name', 'value', 'options'))
+TimeRange = collections.namedtuple('TimeRange', ('start_time', 'end_time'))
+Timestamp = collections.namedtuple('Timestamp', ('year', 'month', 'day', 'dow', 'hour', 'minute'))
+
+
+def parse_org_time(value):
+    """Parse an org timestamp found at the start of ``value``.
+
+    Both active (``<...>``) and inactive (``[...]``) stamps are accepted.
+
+    Returns:
+        ``None`` when ``value`` does not start with a timestamp; a
+        ``TimeRange`` for a same-day ``HH:MM--HH:MM`` stamp; a ``Timestamp``
+        otherwise, with ``hour`` and ``minute`` set to ``None`` when the
+        stamp carries no time of day.
+    """
+    m = ACTIVE_TIME_STAMP_RE.match(value) or INACTIVE_TIME_STAMP_RE.match(value)
+    if m is None:
+        return None
+
+    year, month, day = (int(m.group(k)) for k in ('year', 'month', 'day'))
+    dow = m.group('dow')
+
+    if m.group('start_hour') is None:
+        # Date-only stamp: int(None) would raise, so leave the time unset.
+        return Timestamp(year, month, day, dow, None, None)
+
+    start = Timestamp(year, month, day, dow,
+                      int(m.group('start_hour')), int(m.group('start_minute')))
+    if m.group('end_hour') is None:
+        return start
+
+    end = Timestamp(year, month, day, dow,
+                    int(m.group('end_hour')), int(m.group('end_minute')))
+    return TimeRange(start, end)
+
+
+class OrgDom:
+    """Parsed document: top-level headlines plus file-level keywords."""
+
+    def __init__(self, headlines, keywords):
+        # Headlines are the dicts built by OrgDomReader.add_headline();
+        # keywords are the KEYWORDS_RE matches of the file-level `#+KEY:` lines.
+        self.headlines: List[dict] = headlines
+        self.keywords: List[re.Match] = keywords
+
+    def serialize(self):
+        """Not implemented yet."""
+        raise NotImplementedError()
+
+    ## Querying
+    def getProperties(self):
+        """Return the file-level keywords as a list of ``Property`` tuples."""
+        return [
+            Property(name=kw.group('key'),
+                     value=kw.group('value'),
+                     options=kw.group('options'),
+            )
+            for kw in self.keywords
+        ]
+
+    def getTopHeadlines(self):
+        """Return the list of top-level headlines."""
+        return self.headlines
+
+
+class OrgDomReader:
+    """Line-by-line builder that turns org text into an ``OrgDom``."""
+
+    def __init__(self):
+        self.headlines: List[dict] = []
+        self.keywords: List[re.Match] = []
+        # Stack of the currently open headlines, outermost first.
+        self.headline_hierarchy: List[dict] = []
+        # Raw lines found before the first headline, as (linenum, line).
+        self.contents: List[tuple] = []
+        # Property list of the drawer being read, None outside a drawer.
+        self.current_drawer = None
+
+    def finalize(self):
+        """Return the ``OrgDom`` for everything read so far."""
+        return OrgDom(self.headlines, self.keywords)
+
+    ## Construction
+    def add_headline(self, linenum: int, match: re.Match) -> None:
+        """Register a headline and make it the current one.
+
+        The headline is attached to the nearest open headline that is
+        shallower than it, or to the document when there is none, so levels
+        may be skipped (``*`` followed directly by ``***``).
+        """
+        depth = len(match.group('stars')) - 1
+
+        headline = {
+            'linenum': linenum,
+            'depth': depth,
+            'orig': match,
+            'title': match.group('line'),
+            'contents': [],
+            'children': [],
+            'keywords': [],
+            'properties': [],
+        }
+
+        # Close every open headline that cannot be an ancestor of this one.
+        hierarchy = self.headline_hierarchy
+        while hierarchy and hierarchy[-1]['depth'] >= depth:
+            hierarchy.pop()
+
+        if hierarchy:
+            hierarchy[-1]['children'].append(headline)
+        else:
+            self.headlines.append(headline)
+        hierarchy.append(headline)
+
+        # A drawer never spans headlines.
+        self.current_drawer = None
+
+    def add_keyword_line(self, linenum: int, match: re.Match) -> None:
+        """Store a ``#+KEY: value`` match on the document or current headline."""
+        if not self.headline_hierarchy:
+            self.keywords.append(match)
+        else:
+            self.headline_hierarchy[-1]['keywords'].append(match)
+
+    def add_raw_line(self, linenum: int, line: str) -> None:
+        """Store a plain text line as ``(linenum, line)`` content."""
+        if self.headline_hierarchy:
+            self.headline_hierarchy[-1]['contents'].append((linenum, line))
+        else:
+            self.contents.append((linenum, line))
+
+    def add_property_drawer_line(self, linenum: int, match: re.Match) -> None:
+        """Open the property drawer of the current headline.
+
+        Raises:
+            NotImplementedError: the drawer appears before any headline.
+        """
+        if not self.headline_hierarchy:
+            raise NotImplementedError(
+                '{}: property drawer outside of a headline'.format(linenum))
+        self.current_drawer = self.headline_hierarchy[-1]['properties']
+
+    def add_drawer_end_line(self, linenum: int, match: re.Match) -> None:
+        """Close the drawer being read, if any."""
+        self.current_drawer = None
+
+    def add_node_properties_line(self, linenum: int, match: re.Match) -> None:
+        """Append a ``:KEY: value`` line to the open drawer.
+
+        Timestamp values are stored as ``Timestamp``/``TimeRange``; anything
+        else is kept as the stripped string. Outside of a drawer the line is
+        ordinary text and is stored as raw content.
+        """
+        if self.current_drawer is None:
+            self.add_raw_line(linenum, match.group(0))
+            return
+
+        key = match.group('key')
+        value = match.group('value').strip()
+
+        if (value.count('>--<') == 1) or (value.count(']--[') == 1):
+            # Time ranges with two different dates
+            # @TODO properly consider "=> DURATION" section
+            separator = '>--<' if '>--<' in value else ']--['
+            head, _, tail = value.split('=')[0].strip().partition(separator)
+            start = parse_org_time(head + separator[0])
+            end = parse_org_time(separator[-1] + tail)
+            if (start is not None) and (end is not None):
+                value = TimeRange(start, end)
+        elif (as_time := parse_org_time(value)) is not None:
+            value = as_time
+
+        self.current_drawer.append(Property(key, value, None))
+
+    def read(self, s, environment):
+        """Feed every line of ``s`` to the matching ``add_*`` method.
+
+        ``environment`` is accepted for API compatibility and not used yet.
+
+        Raises:
+            NotImplementedError: a line matches none of the known patterns.
+        """
+        lines = s.split('\n')
+        reader = enumerate(lines)
+
+        # Order matters: :PROPERTIES: and :END: also match NODE_PROPERTIES_RE,
+        # so the more specific patterns are tried first.
+        for linenum, line in reader:
+            if m := RAW_LINE_RE.match(line):
+                # TODO: Parse line
+                self.add_raw_line(linenum, line)
+            elif m := HEADLINE_RE.match(line):
+                # TODO: Parse headline
+                self.add_headline(linenum, m)
+            elif m := KEYWORDS_RE.match(line):
+                # TODO: Parse line
+                self.add_keyword_line(linenum, m)
+            elif m := PROPERTY_DRAWER_RE.match(line):
+                # TODO: Parse line
+                self.add_property_drawer_line(linenum, m)
+            elif m := DRAWER_END_RE.match(line):
+                # TODO: Parse line
+                self.add_drawer_end_line(linenum, m)
+            elif m := NODE_PROPERTIES_RE.match(line):
+                # TODO: Parse line
+                self.add_node_properties_line(linenum, m)
+            else:
+                raise NotImplementedError('{}: ‘{}’'.format(linenum, line))
+
+
+def loads(s, environment=BASE_ENVIRONMENT):
+    """Parse the org document in string ``s`` and return an ``OrgDom``."""
+    doc = OrgDomReader()
+    doc.read(s, environment)
+    return doc.finalize()
+
+
+def load(f, environment=BASE_ENVIRONMENT):
+    """Parse the org document read from file object ``f``."""
+    return loads(f.read(), environment)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..1c51c66
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1 @@
+# No external requirements at this point
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..55a7d22
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,15 @@
+from setuptools import setup
+
+setup(
+    name='org-dom',
+    version='0.0.1',
+    description=
+    'Library to de/serialize org-files and manipulate them in a DOM-like manner.',
+    author='kenkeiras',
+    author_email='kenkeiras@codigoparallevar.com',
+    license='Apache License 2.0',
+    packages=['org_dom'],
+    scripts=[],
+    include_package_data=False,
+    install_requires=[],
+    zip_safe=True)
diff --git a/tests/01-simple.org b/tests/01-simple.org
new file mode 100644
index 0000000..edb3d27
--- /dev/null
+++ b/tests/01-simple.org
@@ -0,0 +1,25 @@
+#+TITLE: 01-Simple
+#+DESCRIPTION: Simple org file
+#+TODO: TODO(t) PAUSED(p) | DONE(d)
+
+
+* 
First level
+  :PROPERTIES:
+  :ID: 01-simple-first-level-id
+  :CREATED: [2020-01-01 Wed 01:01]
+  :END:
+  First level content
+
+** Second level
+   :PROPERTIES:
+   :ID: 01-simple-second-level-id
+   :END:
+
+   Second level content
+
+*** Third level
+    :PROPERTIES:
+    :ID: 01-simple-third-level-id
+    :END:
+
+    Third level content
diff --git a/tests/test_dom.py b/tests/test_dom.py
new file mode 100644
index 0000000..ddf4249
--- /dev/null
+++ b/tests/test_dom.py
@@ -0,0 +1,40 @@
+import logging
+import os
+import sys
+import unittest
+from datetime import datetime as DT
+
+from org_dom import load, loads
+from utils.dom_assertions import HL, Dom
+
+DIR = os.path.dirname(os.path.abspath(__file__))
+
+
+class TestSerde(unittest.TestCase):
+    def test_simple_file_01(self):
+        """The three nested headlines of 01-simple.org load as expected."""
+        with open(os.path.join(DIR, '01-simple.org')) as f:
+            doc = load(f)
+
+        ex = Dom(props=[('TITLE', '01-Simple'),
+                        ('DESCRIPTION', 'Simple org file'),
+                        ('TODO', 'TODO(t) PAUSED(p) | DONE(d)')],
+                 children=(HL(
+                     'First level',
+                     props=[
+                         ('ID', '01-simple-first-level-id'),
+                         ('CREATED', DT(2020, 1, 1, 1, 1)),
+                     ],
+                     content='First level content',
+                     children=[
+                         HL('Second level',
+                            props=[('ID', '01-simple-second-level-id')],
+                            content='Second level content',
+                            children=[
+                                HL('Third level',
+                                   props=[('ID', '01-simple-third-level-id')],
+                                   content='Third level content')
+                            ])
+                     ])))
+
+        ex.assert_matches(self, doc)
diff --git a/tests/utils/dom_assertions.py b/tests/utils/dom_assertions.py
new file mode 100644
index 0000000..61a33c1
--- /dev/null
+++ b/tests/utils/dom_assertions.py
@@ -0,0 +1,90 @@
+import collections
+import unittest
+from datetime import datetime
+
+
+def timestamp_to_datetime(ts):
+    """Convert a parsed timestamp (year/month/day/hour/minute) to datetime."""
+    return datetime(ts.year, ts.month, ts.day, ts.hour, ts.minute)
+
+
+class Dom:
+    """Expected shape of a whole document: keywords plus top headlines."""
+
+    def __init__(self, *, props=None, children=None):
+        self.props = props
+        self.children = children
+        # Allow a single HL to be passed without wrapping it in a list.
+        if isinstance(self.children, HL):
+            self.children = [self.children]
+
+    def assert_matches(self, test_case: unittest.TestCase, doc):
+        """Assert that ``doc`` has the expected keywords and headlines."""
+        # Check properties
+        if self.props is None:
+            test_case.assertEqual(len(doc.getProperties()), 0)
+        else:
+            doc_props = doc.getProperties()
+            test_case.assertEqual(len(doc_props), len(self.props))
+
+            for expected, actual in zip(self.props, doc_props):
+                test_case.assertEqual(actual.name, expected[0])
+                test_case.assertEqual(actual.value, expected[1])
+
+        # @TODO: Check properties
+
+        # Check children
+        if self.children is None:
+            test_case.assertEqual(len(doc.getTopHeadlines()), 0, "Top")
+        else:
+            doc_headlines = doc.getTopHeadlines()
+            test_case.assertEqual(len(doc_headlines), len(self.children),
+                                  "Top")
+
+            for expected, actual in zip(self.children, doc_headlines):
+                expected.assert_matches(test_case, actual)
+
+
+class HL:
+    """Expected shape of one headline and, recursively, of its children."""
+
+    def __init__(self, title, *, props=None, content=None, children=None):
+        self.title = title
+        self.props = props
+        # NOTE: kept so fixtures read naturally; it is not asserted yet.
+        self.content = content
+        self.children = children
+
+    def assert_matches(self, test_case: unittest.TestCase, doc):
+        """Assert title, properties and children of headline dict ``doc``."""
+        test_case.assertEqual(self.title, doc['title'])
+
+        # Check properties
+        if self.props is None:
+            test_case.assertEqual(len(doc['properties']), 0)
+        else:
+            doc_props = doc['properties']
+            test_case.assertEqual(len(doc_props), len(self.props))
+
+            for expected, actual in zip(self.props, doc_props):
+                test_case.assertEqual(actual.name, expected[0])
+                if isinstance(expected[1], datetime):
+                    test_case.assertEqual(
+                        timestamp_to_datetime(actual.value), expected[1])
+                else:
+                    # Plain values used to be skipped, so a wrong value
+                    # could never fail the test.
+                    test_case.assertEqual(actual.value, expected[1])
+
+        # @TODO: Check properties
+
+        # Check children
+        if self.children is None:
+            test_case.assertEqual(len(doc['children']), 0)
+        else:
+            doc_headlines = doc['children']
+            test_case.assertEqual(len(doc_headlines), len(self.children),
+                                  self.title)
+
+            for expected, actual in zip(self.children, doc_headlines):
+                expected.assert_matches(test_case, actual)
diff --git a/tox.ini b/tox.ini
new file mode 100644
index 0000000..e76fbb1
--- /dev/null
+++ b/tox.ini
@@ -0,0 +1,18 @@
+# Tox (http://tox.testrun.org/) is a tool for running tests
+# in multiple virtualenvs. 
This configuration file will run the +# test suite on all supported python versions. To use it, "pip install tox" +# and then run "tox" from this directory. + +[tox] +# envlist = py27,py34,py35,py36,py37 +envlist = py38 + +[testenv] +commands = + python -m pytest --cov-report term-missing --cov org_dom tests +deps = + -r requirements.txt + pytest + pytest-cov +setenv = + PYTHONPATH = {toxinidir} \ No newline at end of file