Initial commit, simplistic parsing.

2020-06-21 21:27:40 +02:00 · 2020-06-21 21:27:40 +02:00 · d29058cb5e
commit d29058cb5e
9 changed files with 525 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,138 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
--- a/org_dom/init.py
+++ b/org_dom/init.py
@ -0,0 +1 @@
+from .org_dom import OrgDom, load, loads
--- a/org_dom/org_dom.py
+++ b/org_dom/org_dom.py
@ -0,0 +1,211 @@
+import re
+import collections
+from typing import List
+
+# Default environment handed to the reader by `load`/`loads`.
+# NOTE(review): the keys look like the names of the Emacs Org mode variables
+# they stand in for -- confirm against the Org sources.
+BASE_ENVIRONMENT = {
+    'org-footnote-section': 'Footnotes',
+    # Keywords recognised on `#+KEY:` lines.  Every entry needs its own
+    # trailing comma: adjacent string literals are silently concatenated.
+    # The repeated SELECT_TAGS/EXCLUDE_TAGS entries are kept from the original.
+    'org-options-keywords': (
+        "ARCHIVE:",
+        "AUTHOR:",
+        "BIND:",
+        "CATEGORY:",
+        "COLUMNS:",
+        "CREATOR:",
+        "DATE:",
+        "DESCRIPTION:",
+        "DRAWERS:",
+        "EMAIL:",
+        "EXCLUDE_TAGS:",
+        "FILETAGS:",
+        "INCLUDE:",
+        "INDEX:",
+        "KEYWORDS:",
+        "LANGUAGE:",
+        "MACRO:",
+        "OPTIONS:",
+        "PROPERTY:",
+        "PRIORITIES:",
+        "SELECT_TAGS:",
+        "SEQ_TODO:",
+        "SETUPFILE:",
+        "STARTUP:",
+        "TAGS:",
+        "TITLE:",
+        "TODO:",
+        "TYP_TODO:",
+        "SELECT_TAGS:",
+        "EXCLUDE_TAGS:",
+    ),
+}
+
+
+# Headline: one or more leading stars, a space, optional extra spacing and the
+# remainder of the line.
+HEADLINE_RE = re.compile(r'^(?P<stars>\*+) (?P<spacing>\s*)(?P<line>.*)$')
+# Keyword line: `#+KEY: value`, with an optional `[options]` part after the key.
+KEYWORDS_RE = re.compile(r'^(?P<indentation>\s*)#\+(?P<key>[^:\[]+)(\[(?P<options>[^\]]*)\])?:(?P<spacing>\s*)(?P<value>.*)$')
+# Opening (`:PROPERTIES:`) and closing (`:END:`) lines of a property drawer.
+PROPERTY_DRAWER_RE = re.compile(r'^(?P<indentation>\s*):PROPERTIES:(?P<end_indentation>\s*)$')
+DRAWER_END_RE = re.compile(r'^(?P<indentation>\s*):END:(?P<end_indentation>\s*)$')
+# Drawer entry: `:KEY: value`, or `:KEY+: value` (captured by the `plus` group).
+NODE_PROPERTIES_RE = re.compile(r'^(?P<indentation>\s*):(?P<key>[^+:]+)(?P<plus>\+)?:(?P<spacing>\s*)(?P<value>.*)$')
+# Blank line, or a line whose first non-blank character is not `#`, `:` or `*`.
+RAW_LINE_RE = re.compile(r'^\s*([^\s#:*]|$)')
+# Body shared by both timestamp flavours: `YYYY-MM-DD DOW`, optionally followed
+# by ` H:MM` and then by `--H:MM`.  Kept as a plain string so that it can be
+# wrapped in the two bracket styles below.
+BASE_TIME_STAMP_RE = r'(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2}) (?P<dow>[^ ]+)( (?P<start_hour>\d{1,2}):(?P<start_minute>\d{1,2})(--(?P<end_hour>\d{1,2}):(?P<end_minute>\d{1,2}))?)?'
+
+# Active timestamps are delimited by `<...>`, inactive ones by `[...]`.
+ACTIVE_TIME_STAMP_RE = re.compile(r'<{}>'.format(BASE_TIME_STAMP_RE))
+INACTIVE_TIME_STAMP_RE = re.compile(r'\[{}\]'.format(BASE_TIME_STAMP_RE))
+
+# BASE_TIME_RANGE_RE = (r'(?P<start_year>\d{4})-(?P<start_month>\d{2})-(?P<start_day>\d{2}) (?P<start_dow>[^ ]+)((?P<start_hour>\d{1,2}):(?P<start_minute>\d{1,2}))?',
+#                       r'(?P<end_year>\d{4})-(?P<end_month>\d{2})-(?P<end_day>\d{2}) (?P<end_dow>[^ ]+)((?P<end_hour>\d{1,2}):(?P<end_minute>\d{1,2}))?')
+
+# Record types produced by the parser.  Field names are given as
+# whitespace-separated strings, which namedtuple accepts in place of a tuple.
+Headline = collections.namedtuple(
+    'Headline',
+    'start_line depth '
+    'keyword_start keyword '
+    'priority_start priority '
+    'title_start title '
+    'tags_start tags '
+    'content '
+    'children',
+)
+
+# A `#+KEY[options]: value` keyword or a `:KEY: value` drawer entry.
+Property = collections.namedtuple('Property', 'name value options')
+# A pair of timestamps delimiting an interval.
+TimeRange = collections.namedtuple('TimeRange', 'start_time end_time')
+# A single point in time; `dow` holds the day-of-week text as written.
+Timestamp = collections.namedtuple('Timestamp', 'year month day dow hour minute')
+
+
+def parse_org_time(value):
+    """Parse the Org timestamp found at the start of ``value``.
+
+    Both active (``<...>``) and inactive (``[...]``) timestamps are accepted;
+    the two flavours are not distinguished in the result.
+
+    Returns:
+        * ``None`` when ``value`` does not start with a timestamp.
+        * A ``TimeRange`` of two ``Timestamp`` values sharing the same date
+          when the timestamp carries an ``HH:MM--HH:MM`` clock range.
+        * A single ``Timestamp`` otherwise.  For date-only timestamps such as
+          ``<2020-01-01 Wed>`` its ``hour`` and ``minute`` are ``None``.
+    """
+    m = ACTIVE_TIME_STAMP_RE.match(value) or INACTIVE_TIME_STAMP_RE.match(value)
+    if m is None:
+        return None
+
+    date = (int(m.group('year')), int(m.group('month')), int(m.group('day')),
+            m.group('dow'))
+
+    def stamp(prefix):
+        # The clock part is optional in the pattern, so its groups may be None
+        # and must not be passed to int() unconditionally.
+        hour = m.group(prefix + '_hour')
+        minute = m.group(prefix + '_minute')
+        return Timestamp(*date,
+                         None if hour is None else int(hour),
+                         None if minute is None else int(minute))
+
+    if m.group('end_hour'):
+        return TimeRange(stamp('start'), stamp('end'))
+    return stamp('start')
+
+
+class OrgDom:
+    """Parsed org document: its top-level headlines plus file-level keywords."""
+
+    def __init__(self, headlines, keywords):
+        # Stored as given by the reader; `getProperties` relies on each
+        # keyword exposing `.group('key' | 'value' | 'options')`.
+        self.headlines: List[Headline] = headlines
+        self.keywords: List[Property] = keywords
+
+    def serialize(self):
+        """Not implemented yet; always raises NotImplementedError."""
+        raise NotImplementedError()
+
+
+    ## Querying
+    def getProperties(self):
+        """Return the file-level keywords as a list of `Property` tuples."""
+        properties = []
+        for keyword in self.keywords:
+            properties.append(Property(
+                name=keyword.group('key'),
+                value=keyword.group('value'),
+                options=keyword.group('options'),
+            ))
+        return properties
+
+    def getTopHeadlines(self):
+        """Return the top-level headlines, in the order they were stored."""
+        return self.headlines
+
+
+class OrgDomReader:
+    """Line-oriented reader that accumulates an org document into an OrgDom.
+
+    Headlines are plain dicts with the keys ``linenum``, ``orig``, ``title``,
+    ``contents``, ``children``, ``keywords`` and ``properties``.
+    """
+
+    def __init__(self):
+        # Top-level headlines, in document order.
+        self.headlines: List[dict] = []
+        # KEYWORDS_RE matches seen before the first headline.
+        self.keywords: List[re.Match] = []
+        # Ancestors of the current position, indexed by depth.  Levels that
+        # the document skipped are padded with None.
+        self.headline_hierarchy: List[dict] = []
+        # `properties` list of the drawer being read, or None outside drawers.
+        self.current_drawer = None
+
+    def finalize(self):
+        """Return an OrgDom holding everything read so far."""
+        return OrgDom(self.headlines, self.keywords)
+
+    ## Construction
+    def add_headline(self, linenum: int, match: re.Match) -> None:
+        """Register the headline matched by HEADLINE_RE at ``linenum``.
+
+        The headline becomes a child of the closest preceding headline with
+        fewer stars, or a top-level headline when there is none.
+        """
+        depth = len(match.group('stars')) - 1
+
+        headline = {
+            'linenum': linenum,
+            'orig': match,
+            'title': match.group('line'),
+            'contents': [],
+            'children': [],
+            'keywords': [],
+            'properties': [],
+        }
+
+        # Keep exactly `depth` ancestor slots: drop finished siblings and
+        # deeper levels, then pad levels the document skipped (`*` -> `***`).
+        del self.headline_hierarchy[depth:]
+        while len(self.headline_hierarchy) < depth:
+            self.headline_hierarchy.append(None)
+
+        parent = next(
+            (h for h in reversed(self.headline_hierarchy) if h is not None),
+            None)
+        if parent is None:
+            self.headlines.append(headline)
+        else:
+            parent['children'].append(headline)
+        self.headline_hierarchy.append(headline)
+
+        # A new headline implicitly closes any drawer left open.
+        self.current_drawer = None
+
+    def add_keyword_line(self, linenum: int, match: re.Match) -> None:
+        """Store a ``#+KEY: value`` match on the document or current headline."""
+        if len(self.headline_hierarchy) == 0:
+            self.keywords.append(match)
+        else:
+            # The last stack entry is always a real headline, never padding.
+            self.headline_hierarchy[-1]['keywords'].append(match)
+
+    def add_raw_line(self, linenum: int, line: str) -> None:
+        """Append a plain text line to the current headline's ``contents``.
+
+        Lines found before the first headline have nowhere to go yet and are
+        dropped.
+        """
+        if self.headline_hierarchy:
+            self.headline_hierarchy[-1]['contents'].append(line)
+
+    def add_property_drawer_line(self, linenum: int, match: re.Match) -> None:
+        """Open the property drawer of the current headline.
+
+        Raises:
+            IndexError: if no headline has been read yet.
+        """
+        self.current_drawer = self.headline_hierarchy[-1]['properties']
+
+    def add_drawer_end_line(self, linenum: int, match: re.Match) -> None:
+        """Close the drawer currently being read, if any."""
+        self.current_drawer = None
+
+    def add_node_properties_line(self, linenum: int, match: re.Match) -> None:
+        """Add a ``:KEY: value`` entry to the open drawer.
+
+        Values that parse as an Org timestamp, or as a ``<..>--<..>`` /
+        ``[..]--[..]`` range, are stored as Timestamp / TimeRange; anything
+        else is stored as the stripped string.  Outside a drawer the line is
+        treated as plain text.
+        """
+        if self.current_drawer is None:
+            self.add_raw_line(linenum, match.group(0))
+            return
+
+        key = match.group('key')
+        value = match.group('value').strip()
+
+        # Time ranges with two different dates.
+        # @TODO properly consider "=> DURATION" section; for now it is cut off.
+        span = value.partition('=>')[0].strip()
+        for separator in ('>--<', ']--['):
+            if span.count(separator) != 1:
+                continue
+            start_text, end_text = span.split(separator)
+            # Splitting consumed one bracket of each timestamp; restore them.
+            start = parse_org_time(start_text + separator[0])
+            end = parse_org_time(separator[-1] + end_text)
+            if (start is not None) and (end is not None):
+                value = TimeRange(start, end)
+            break
+        else:
+            as_time = parse_org_time(value)
+            if as_time is not None:
+                value = as_time
+
+        self.current_drawer.append(Property(key, value, None))
+
+    def read(self, s, environment):
+        """Feed the org text ``s`` to the reader, one line at a time.
+
+        ``environment`` is accepted for API compatibility and is not used yet.
+
+        Raises:
+            NotImplementedError: for a line that no pattern recognises.
+        """
+        # Order matters: the first matching pattern wins.
+        for linenum, line in enumerate(s.split('\n')):
+            if RAW_LINE_RE.match(line):
+                self.add_raw_line(linenum, line)
+            elif m := HEADLINE_RE.match(line):
+                self.add_headline(linenum, m)
+            elif m := KEYWORDS_RE.match(line):
+                self.add_keyword_line(linenum, m)
+            elif m := PROPERTY_DRAWER_RE.match(line):
+                self.add_property_drawer_line(linenum, m)
+            elif m := DRAWER_END_RE.match(line):
+                self.add_drawer_end_line(linenum, m)
+            elif m := NODE_PROPERTIES_RE.match(line):
+                self.add_node_properties_line(linenum, m)
+            else:
+                raise NotImplementedError('{}: ‘{}’'.format(linenum, line))
+
+
+def loads(s, environment=BASE_ENVIRONMENT):
+    """Parse the org document contained in the string ``s`` into an OrgDom."""
+    reader = OrgDomReader()
+    reader.read(s, environment)
+    dom = reader.finalize()
+    return dom
+
+
+def load(f, environment=BASE_ENVIRONMENT):
+    """Read all of the file-like object ``f`` and parse it with `loads`."""
+    text = f.read()
+    return loads(text, environment)
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1 @@
+# No external requirements at this point
--- a/setup.py
+++ b/setup.py
@ -0,0 +1,15 @@
+from setuptools import setup
+
+setup(
+    # Distribution metadata
+    name='org-dom',
+    version='0.0.1',
+    description=(
+        'Library to de/serialize org-files and manipulate them in a DOM-like manner.'
+    ),
+    license='Apache License 2.0',
+    author='kenkeiras',
+    author_email='kenkeiras@codigoparallevar.com',
+    # Contents: a single pure-Python package, no scripts and no data files
+    packages=['org_dom'],
+    scripts=[],
+    include_package_data=False,
+    # No runtime dependencies
+    install_requires=[],
+    zip_safe=True,
+)
--- a/tests/01-simple.org
+++ b/tests/01-simple.org
@ -0,0 +1,25 @@
+#+TITLE: 01-Simple
+#+DESCRIPTION: Simple org file
+#+TODO: TODO(t) PAUSED(p) |  DONE(d)
+
+
+* First level
+  :PROPERTIES:
+  :ID:       01-simple-first-level-id
+  :CREATED:  [2020-01-01 Wed 01:01]
+  :END:
+  First level content
+
+** Second level
+   :PROPERTIES:
+   :ID:       01-simple-second-level-id
+   :END:
+
+   Second level content
+
+*** Third level
+    :PROPERTIES:
+    :ID:       01-simple-third-level-id
+    :END:
+
+    Third level content
--- a/tests/test_dom.py
+++ b/tests/test_dom.py
@ -0,0 +1,39 @@
+import logging
+import os
+import sys
+import unittest
+from datetime import datetime as DT
+
+from org_dom import load, loads
+from utils.dom_assertions import HL, Dom
+
+DIR = os.path.dirname(os.path.abspath(__file__))
+
+
+class TestSerde(unittest.TestCase):
+    """Checks that fixture org files load into the expected structure."""
+
+    def test_simple_file_01(self):
+        # Load the fixture that lives next to this test module.
+        fixture = os.path.join(DIR, '01-simple.org')
+        with open(fixture) as f:
+            doc = load(f)
+
+        # Expected headlines, described from the innermost one outwards.
+        third = HL('Third level',
+                   props=[('ID', '01-simple-third-level-id')],
+                   content='Third level content')
+        second = HL('Second level',
+                    props=[('ID', '01-simple-second-level-id')],
+                    content='Second level content',
+                    children=[third])
+        first = HL('First level',
+                   props=[
+                       ('ID', '01-simple-first-level-id'),
+                       ('CREATED', DT(2020, 1, 1, 1, 1)),
+                   ],
+                   content='First level content',
+                   children=[second])
+
+        # A single HL is passed as-is; Dom wraps it into a list itself.
+        expected = Dom(props=[('TITLE', '01-Simple'),
+                              ('DESCRIPTION', 'Simple org file'),
+                              ('TODO', 'TODO(t) PAUSED(p) |  DONE(d)')],
+                       children=first)
+
+        expected.assert_matches(self, doc)
--- a/tests/utils/dom_assertions.py
+++ b/tests/utils/dom_assertions.py
@ -0,0 +1,77 @@
+import collections
+import unittest
+from datetime import datetime
+
+
+def timestamp_to_datetime(ts):
+    """Build a naive datetime from the date and clock attributes of ``ts``."""
+    fields = (ts.year, ts.month, ts.day, ts.hour, ts.minute)
+    return datetime(*fields)
+
+
+class Dom:
+    """Expected shape of a whole document: file keywords and top headlines."""
+
+    def __init__(self, *, props=None, children=None):
+        self.props = props
+        # A single HL may be given instead of a list of them.
+        self.children = [children] if isinstance(children, HL) else children
+
+    def assert_matches(self, test_case: unittest.TestCase, doc):
+        """Assert, through ``test_case``, that ``doc`` matches this description.
+
+        ``None`` for props or children means that none are expected.
+        """
+        # Check properties
+        actual_props = doc.getProperties()
+        if self.props is None:
+            test_case.assertEqual(len(actual_props), 0)
+        else:
+            test_case.assertEqual(len(actual_props), len(self.props))
+
+            # Lengths are equal at this point, so pairing up is safe.
+            for actual, expected in zip(actual_props, self.props):
+                test_case.assertEqual(actual.name, expected[0])
+                test_case.assertEqual(actual.value, expected[1])
+
+        # @TODO: Check properties
+
+        # Check children
+        actual_headlines = doc.getTopHeadlines()
+        if self.children is None:
+            test_case.assertEqual(len(actual_headlines), 0, "Top")
+        else:
+            test_case.assertEqual(len(actual_headlines), len(self.children),
+                                  "Top")
+
+            for expected_child, headline in zip(self.children,
+                                                actual_headlines):
+                expected_child.assert_matches(test_case, headline)
+
+
+class HL:
+    """Expected shape of one headline: title, properties and sub-headlines."""
+
+    def __init__(self, title, *, props=None, content=None, children=None):
+        self.title = title
+        self.props = props
+        # NOTE: `content` is recorded but not compared by `assert_matches` yet.
+        self.content = content
+        self.children = children
+
+    def assert_matches(self, test_case: unittest.TestCase, doc):
+        """Assert, through ``test_case``, that headline ``doc`` matches.
+
+        ``doc`` is indexed with the keys ``title``, ``properties`` and
+        ``children``.  ``None`` for props or children means none are expected.
+        """
+        test_case.assertEqual(self.title, doc['title'])
+
+        # Check properties
+        if self.props is None:
+            test_case.assertEqual(len(doc['properties']), 0)
+        else:
+            doc_props = doc['properties']
+            test_case.assertEqual(len(doc_props), len(self.props))
+
+            for i, prop in enumerate(self.props):
+                test_case.assertEqual(doc_props[i].name, prop[0])
+                if isinstance(prop[1], datetime):
+                    # Parsed timestamps are compared as datetimes.
+                    test_case.assertEqual(
+                        timestamp_to_datetime(doc_props[i].value), prop[1])
+                else:
+                    # Any other expected value must match as-is; without this
+                    # branch plain values were never checked at all.
+                    test_case.assertEqual(doc_props[i].value, prop[1])
+
+        # @TODO: Check content
+
+        # Check children
+        if self.children is None:
+            test_case.assertEqual(len(doc['children']), 0)
+        else:
+            doc_headlines = doc['children']
+            test_case.assertEqual(len(doc_headlines), len(self.children),
+                                  self.title)
+
+            for i, children in enumerate(self.children):
+                children.assert_matches(test_case, doc_headlines[i])
--- a/tox.ini
+++ b/tox.ini
@ -0,0 +1,18 @@
+# Tox (http://tox.testrun.org/) is a tool for running tests
+# in multiple virtualenvs. This configuration file will run the
+# test suite on all supported python versions. To use it, "pip install tox"
+# and then run "tox" from this directory.
+
+[tox]
+# envlist = py27,py34,py35,py36,py37
+envlist = py38
+
+[testenv]
+commands =
+    python -m pytest --cov-report term-missing --cov org_dom tests
+deps =
+    -r requirements.txt
+    pytest
+    pytest-cov
+setenv =
+    PYTHONPATH = {toxinidir}