Initial commit, simplistic parsing.

This commit is contained in:
Sergio Martínez Portela 2020-06-21 21:27:40 +02:00
commit d29058cb5e
9 changed files with 525 additions and 0 deletions

138
.gitignore vendored Normal file
View File

@ -0,0 +1,138 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/

1
org_dom/__init__.py Normal file
View File

@ -0,0 +1 @@
from .org_dom import OrgDom, load, loads

211
org_dom/org_dom.py Normal file
View File

@ -0,0 +1,211 @@
import re
import collections
from typing import List
# Default parsing environment handed to `load`/`loads`.
#
# 'org-options-keywords' lists the in-buffer option keywords, each with its
# trailing colon.  The two repeated entries at the end ("SELECT_TAGS:",
# "EXCLUDE_TAGS:") are kept on purpose so the tuple keeps its original order
# and contents apart from the fix below.
BASE_ENVIRONMENT = {
    'org-footnote-section': 'Footnotes',
    'org-options-keywords': (
        "ARCHIVE:",
        "AUTHOR:",
        "BIND:",
        "CATEGORY:",
        "COLUMNS:",
        "CREATOR:",
        "DATE:",
        "DESCRIPTION:",
        "DRAWERS:",
        "EMAIL:",
        "EXCLUDE_TAGS:",
        "FILETAGS:",
        "INCLUDE:",
        "INDEX:",
        "KEYWORDS:",
        "LANGUAGE:",
        "MACRO:",
        "OPTIONS:",
        "PROPERTY:",
        "PRIORITIES:",
        "SELECT_TAGS:",
        "SEQ_TODO:",
        "SETUPFILE:",
        "STARTUP:",
        # A missing comma here used to fuse this entry with the next one into
        # the bogus keyword "TAGS:TITLE:" (implicit string concatenation).
        "TAGS:",
        "TITLE:",
        "TODO:",
        "TYP_TODO:",
        "SELECT_TAGS:",
        "EXCLUDE_TAGS:",
    ),
}
# Line-classification patterns used by OrgDomReader.read().

# Headline: one or more '*', a mandatory space, optional extra whitespace,
# then the rest of the line.
HEADLINE_RE = re.compile(r'^(?P<stars>\*+) (?P<spacing>\s*)(?P<line>.*)$')
# Keyword line: optional indentation, '#+', the key (anything up to ':' or
# '['), an optional '[options]' part, ':', spacing and the value.
KEYWORDS_RE = re.compile(r'^(?P<indentation>\s*)#\+(?P<key>[^:\[]+)(\[(?P<options>[^\]]*)\])?:(?P<spacing>\s*)(?P<value>.*)$')
# ':PROPERTIES:' on its own line (surrounding whitespace allowed).
PROPERTY_DRAWER_RE = re.compile(r'^(?P<indentation>\s*):PROPERTIES:(?P<end_indentation>\s*)$')
# ':END:' on its own line (surrounding whitespace allowed).
DRAWER_END_RE = re.compile(r'^(?P<indentation>\s*):END:(?P<end_indentation>\s*)$')
# ':KEY: value' or ':KEY+: value'; the key may not contain '+' or ':'.
NODE_PROPERTIES_RE = re.compile(r'^(?P<indentation>\s*):(?P<key>[^+:]+)(?P<plus>\+)?:(?P<spacing>\s*)(?P<value>.*)$')
# Plain text: the first non-whitespace character is none of '#', ':' or '*'.
# Empty and whitespace-only lines match too (via the '$' alternative).
RAW_LINE_RE = re.compile(r'^\s*([^\s#:*]|$)')
# Body shared by both timestamp flavours: 'YYYY-MM-DD DOW', optionally
# followed by ' H:MM' and, after that, optionally '--H:MM' (same-day range).
BASE_TIME_STAMP_RE = r'(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2}) (?P<dow>[^ ]+)( (?P<start_hour>\d{1,2}):(?P<start_minute>\d{1,2})(--(?P<end_hour>\d{1,2}):(?P<end_minute>\d{1,2}))?)?'
# Active timestamps are wrapped in '<...>', inactive ones in '[...]'.
ACTIVE_TIME_STAMP_RE = re.compile(r'<{}>'.format(BASE_TIME_STAMP_RE))
INACTIVE_TIME_STAMP_RE = re.compile(r'\[{}\]'.format(BASE_TIME_STAMP_RE))
# Disabled draft of a two-date range pattern, kept for reference:
# BASE_TIME_RANGE_RE = (r'(?P<start_year>\d{4})-(?P<start_month>\d{2})-(?P<start_day>\d{2}) (?P<start_dow>[^ ]+)((?P<start_hour>\d{1,2}):(?P<start_minute>\d{1,2}))?',
# r'(?P<end_year>\d{4})-(?P<end_month>\d{2})-(?P<end_day>\d{2}) (?P<end_dow>[^ ]+)((?P<end_hour>\d{1,2}):(?P<end_minute>\d{1,2}))?')
# Lightweight immutable record types shared across the module.

# Headline record; field names only, no behaviour attached.
Headline = collections.namedtuple(
    'Headline',
    [
        'start_line',
        'depth',
        'keyword_start',
        'keyword',
        'priority_start',
        'priority',
        'title_start',
        'title',
        'tags_start',
        'tags',
        'content',
        'children',
    ],
)

# A named value with optional options.
Property = collections.namedtuple('Property', ['name', 'value', 'options'])

# A pair of time points.
TimeRange = collections.namedtuple('TimeRange', ['start_time', 'end_time'])

# A calendar date (with day-of-week token) plus hour and minute.
Timestamp = collections.namedtuple(
    'Timestamp',
    ['year', 'month', 'day', 'dow', 'hour', 'minute'],
)
def parse_org_time(value):
    """Parse an org timestamp found at the start of ``value``.

    Both active (``<...>``) and inactive (``[...]``) timestamps are accepted.

    Returns:
        * ``None`` if ``value`` does not start with a timestamp.
        * A ``TimeRange`` of two ``Timestamp`` objects on the same date when
          the timestamp carries a ``H:MM--H:MM`` range.
        * A single ``Timestamp`` otherwise.  For date-only timestamps such as
          ``<2020-01-01 Wed>`` the ``hour`` and ``minute`` fields are ``None``
          (previously this case raised ``TypeError`` from ``int(None)``).
    """
    m = ACTIVE_TIME_STAMP_RE.match(value) or INACTIVE_TIME_STAMP_RE.match(value)
    if m is None:
        return None

    year = int(m.group('year'))
    month = int(m.group('month'))
    day = int(m.group('day'))
    dow = m.group('dow')

    def to_int_or_none(group_name):
        # The time-of-day groups are optional in the pattern.
        text = m.group(group_name)
        return int(text) if text is not None else None

    start = Timestamp(year, month, day, dow,
                      to_int_or_none('start_hour'),
                      to_int_or_none('start_minute'))

    if m.group('end_hour'):
        # Same-day range: both ends share the date, only the time differs.
        end = Timestamp(year, month, day, dow,
                        int(m.group('end_hour')),
                        int(m.group('end_minute')))
        return TimeRange(start, end)

    return start
class OrgDom:
    """Result of parsing an org document.

    Holds the top-level headlines and the document-level keyword matches
    exactly as they were handed to the constructor.
    """

    def __init__(self, headlines, keywords):
        self.headlines: List[Headline] = headlines
        self.keywords: List[Property] = keywords

    def serialize(self):
        """Not available yet; always raises ``NotImplementedError``."""
        raise NotImplementedError()

    ## Querying
    def getProperties(self):
        """Return one ``Property`` per stored keyword, in document order.

        Each stored keyword must expose ``group('key')``, ``group('value')``
        and ``group('options')`` (i.e. behave like a regex match object).
        """
        properties = []
        for keyword in self.keywords:
            properties.append(
                Property(
                    name=keyword.group('key'),
                    value=keyword.group('value'),
                    options=keyword.group('options'),
                )
            )
        return properties

    def getTopHeadlines(self):
        """Return the list of top-level headlines."""
        return self.headlines
class OrgDomReader:
    """Line-oriented builder that turns org text into an ``OrgDom``.

    Headlines are represented as plain dicts with the keys ``linenum``,
    ``orig``, ``title``, ``contents``, ``children``, ``keywords`` and
    ``properties``.  Feed text through ``read()`` and obtain the result with
    ``finalize()``.
    """

    def __init__(self):
        # Top-level headline dicts, in document order.
        self.headlines: List[dict] = []
        # Regex matches of keyword lines found before the first headline.
        self.keywords: List[re.Match] = []
        # Currently open headlines, indexed by depth (0 == one star).  Levels
        # the document skipped are filled with None placeholders.
        self.headline_hierarchy: List[dict] = []
        # Property list of the drawer being read, or None outside a drawer.
        self.current_drawer = None

    def finalize(self):
        """Return an ``OrgDom`` with everything read so far."""
        return OrgDom(self.headlines, self.keywords)

    ## Construction
    def add_headline(self, linenum: int, match: re.Match) -> None:
        """Register a headline and make it the current one.

        ``match`` must provide the groups ``stars`` and ``line``.  The
        headline is attached to the closest open headline of lower depth, or
        to the top level when there is none.
        """
        depth = len(match.group('stars')) - 1
        headline = {
            'linenum': linenum,
            'orig': match,
            'title': match.group('line'),
            'contents': [],
            'children': [],
            'keywords': [],
            'properties': [],
        }

        # Close every open headline at this depth or deeper.
        del self.headline_hierarchy[depth:]

        # Skipped levels (e.g. '*' directly followed by '***') get structural
        # placeholders so that list index == depth always holds.  Without
        # them a later sibling would be attached as a child instead.
        while len(self.headline_hierarchy) < depth:
            self.headline_hierarchy.append(None)

        # Nearest real ancestor, ignoring placeholders.
        parent = next(
            (hl for hl in reversed(self.headline_hierarchy) if hl is not None),
            None,
        )
        if parent is None:
            self.headlines.append(headline)
        else:
            parent['children'].append(headline)

        # The last entry of the hierarchy is therefore always a real headline.
        self.headline_hierarchy.append(headline)

    def add_keyword_line(self, linenum: int, match: re.Match) -> None:
        """Store a keyword match on the document or on the current headline."""
        if not self.headline_hierarchy:
            self.keywords.append(match)
        else:
            # Store the match itself (this used to append the string 'match').
            self.headline_hierarchy[-1]['keywords'].append(match)

    def add_raw_line(self, linenum: int, line: str) -> None:
        """Append a plain text line to the current headline's ``contents``.

        Lines appearing before the first headline have nowhere to go yet and
        are dropped.
        """
        if self.headline_hierarchy:
            self.headline_hierarchy[-1]['contents'].append(line)

    def add_property_drawer_line(self, linenum: int, match: re.Match) -> None:
        """Open the property drawer of the current headline.

        Raises:
            NotImplementedError: the drawer appears before any headline.
        """
        if not self.headline_hierarchy:
            raise NotImplementedError(
                '{}: {}'.format(linenum, match.group(0)))
        self.current_drawer = self.headline_hierarchy[-1]['properties']

    def add_drawer_end_line(self, linenum: int, match: re.Match) -> None:
        """Close the drawer that is currently open, if any."""
        self.current_drawer = None

    def add_node_properties_line(self, linenum: int, match: re.Match) -> None:
        """Add a ``:KEY: value`` line to the open drawer as a ``Property``.

        Values that parse as timestamps are stored as ``Timestamp`` /
        ``TimeRange``; everything else is stored as the stripped string.

        Raises:
            NotImplementedError: no drawer is open.
        """
        if self.current_drawer is None:
            raise NotImplementedError(
                '{}: {}'.format(linenum, match.group(0)))

        key = match.group('key')
        value = match.group('value').strip()

        if (value.count('>--<') == 1) or (value.count(']--[') == 1):
            # Time range made of two separate timestamps.
            # @TODO properly consider "=> DURATION" section (ignored for now)
            stamps = value.partition('=>')[0].strip()
            boundary = '>--<' if '>--<' in stamps else ']--['
            start_text, _, end_text = stamps.partition(boundary)
            # Give each half back the bracket that the split took away.
            start = parse_org_time(start_text + boundary[0])
            end = parse_org_time(boundary[-1] + end_text)
            if (start is not None) and (end is not None):
                value = TimeRange(start, end)
        elif as_time := parse_org_time(value):
            value = as_time

        self.current_drawer.append(Property(key, value, None))

    def read(self, s, environment):
        """Parse the org text ``s`` line by line.

        ``environment`` is accepted for API compatibility; it is not
        consulted yet.

        Raises:
            NotImplementedError: a line matches none of the known patterns.
        """
        for linenum, line in enumerate(s.split('\n')):
            # Order matters: the first matching pattern wins.
            if RAW_LINE_RE.match(line):
                self.add_raw_line(linenum, line)
            elif m := HEADLINE_RE.match(line):
                self.add_headline(linenum, m)
            elif m := KEYWORDS_RE.match(line):
                self.add_keyword_line(linenum, m)
            elif m := PROPERTY_DRAWER_RE.match(line):
                self.add_property_drawer_line(linenum, m)
            elif m := DRAWER_END_RE.match(line):
                self.add_drawer_end_line(linenum, m)
            elif m := NODE_PROPERTIES_RE.match(line):
                self.add_node_properties_line(linenum, m)
            else:
                raise NotImplementedError('{}: {}'.format(linenum, line))
def loads(s, environment=BASE_ENVIRONMENT):
    """Parse the org text ``s`` and return the resulting ``OrgDom``.

    ``environment`` is forwarded unchanged to ``OrgDomReader.read``.
    """
    reader = OrgDomReader()
    reader.read(s, environment)
    dom = reader.finalize()
    return dom
def load(f, environment=BASE_ENVIRONMENT):
    """Read all of the file-like object ``f`` and parse it with ``loads``."""
    text = f.read()
    return loads(text, environment)

1
requirements.txt Normal file
View File

@ -0,0 +1 @@
# No external requirements at this point

15
setup.py Normal file
View File

@ -0,0 +1,15 @@
from setuptools import setup
# Package metadata for the `org-dom` distribution.
setup(
    name='org-dom',
    version='0.0.1',
    description=(
        'Library to de/serialize org-files and manipulate them '
        'in a DOM-like manner.'
    ),
    author='kenkeiras',
    author_email='kenkeiras@codigoparallevar.com',
    license='Apache License 2.0',
    packages=['org_dom'],
    scripts=[],
    include_package_data=False,
    # org_dom uses assignment expressions (`:=`), which require Python 3.8+;
    # declaring it lets installers refuse older interpreters up front.
    python_requires='>=3.8',
    install_requires=[],
    zip_safe=True,
)

25
tests/01-simple.org Normal file
View File

@ -0,0 +1,25 @@
#+TITLE: 01-Simple
#+DESCRIPTION: Simple org file
#+TODO: TODO(t) PAUSED(p) | DONE(d)
* First level
:PROPERTIES:
:ID: 01-simple-first-level-id
:CREATED: [2020-01-01 Wed 01:01]
:END:
First level content
** Second level
:PROPERTIES:
:ID: 01-simple-second-level-id
:END:
Second level content
*** Third level
:PROPERTIES:
:ID: 01-simple-third-level-id
:END:
Third level content

39
tests/test_dom.py Normal file
View File

@ -0,0 +1,39 @@
import logging
import os
import sys
import unittest
from datetime import datetime as DT
from org_dom import load, loads
from utils.dom_assertions import HL, Dom
DIR = os.path.dirname(os.path.abspath(__file__))
class TestSerde(unittest.TestCase):
    """Parsing tests that compare loaded documents with an expected tree."""

    def test_simple_file_01(self):
        """Load ``01-simple.org`` and check keywords and nested headlines."""
        with open(os.path.join(DIR, '01-simple.org')) as f:
            doc = load(f)

        # Expected tree, assembled from the innermost headline outwards.
        third_level = HL(
            'Third level',
            props=[('ID', '01-simple-third-level-id')],
            content='Third level content',
        )
        second_level = HL(
            'Second level',
            props=[('ID', '01-simple-second-level-id')],
            content='Second level content',
            children=[third_level],
        )
        first_level = HL(
            'First level',
            props=[
                ('ID', '01-simple-first-level-id'),
                ('CREATED', DT(2020, 1, 1, 1, 1)),
            ],
            content='First level content',
            children=[second_level],
        )

        # A single HL is passed as `children`; Dom wraps it in a list.
        ex = Dom(
            props=[
                ('TITLE', '01-Simple'),
                ('DESCRIPTION', 'Simple org file'),
                ('TODO', 'TODO(t) PAUSED(p) | DONE(d)'),
            ],
            children=first_level,
        )

        ex.assert_matches(self, doc)

View File

@ -0,0 +1,77 @@
import collections
import unittest
from datetime import datetime
def timestamp_to_datetime(ts):
    """Build a ``datetime`` from ``ts``'s year/month/day/hour/minute fields."""
    fields = (ts.year, ts.month, ts.day, ts.hour, ts.minute)
    return datetime(*fields)
class Dom:
    """Expected shape of a whole document, used as a test assertion helper."""

    def __init__(self, *, props=None, children=None):
        self.props = props
        # A lone HL is accepted as shorthand for a one-element list.
        self.children = [children] if isinstance(children, HL) else children

    def assert_matches(self, test_case: unittest.TestCase, doc):
        """Assert that ``doc`` has the expected properties and top headlines.

        ``props=None`` / ``children=None`` mean "expect none at all".
        """
        # Check properties
        actual_props = doc.getProperties()
        if self.props is None:
            test_case.assertEqual(len(actual_props), 0)
        else:
            test_case.assertEqual(len(actual_props), len(self.props))
            for actual, expected in zip(actual_props, self.props):
                test_case.assertEqual(actual.name, expected[0])
                test_case.assertEqual(actual.value, expected[1])

        # @TODO: Check properties

        # Check children
        actual_headlines = doc.getTopHeadlines()
        if self.children is None:
            test_case.assertEqual(len(actual_headlines), 0, "Top")
            return

        test_case.assertEqual(len(actual_headlines), len(self.children), "Top")
        for expected_child, actual_child in zip(self.children, actual_headlines):
            expected_child.assert_matches(test_case, actual_child)
class HL:
    """Expected shape of a single headline, used as a test assertion helper."""

    def __init__(self, title, *, props=None, content=None, children=None):
        self.title = title
        self.props = props
        # NOTE: `content` is stored but not compared by assert_matches yet.
        self.content = content
        self.children = children

    def assert_matches(self, test_case: unittest.TestCase, doc):
        """Assert that the headline dict ``doc`` matches this expectation.

        ``doc`` must provide the keys ``title``, ``properties`` and
        ``children``.  ``props=None`` / ``children=None`` mean "expect none".
        """
        test_case.assertEqual(self.title, doc['title'])

        # Check properties
        doc_props = doc['properties']
        if self.props is None:
            test_case.assertEqual(len(doc_props), 0)
        else:
            test_case.assertEqual(len(doc_props), len(self.props))
            for doc_prop, prop in zip(doc_props, self.props):
                test_case.assertEqual(doc_prop.name, prop[0])
                if isinstance(prop[1], datetime):
                    # Parsed timestamps are compared through a datetime.
                    test_case.assertEqual(
                        timestamp_to_datetime(doc_prop.value), prop[1])
                else:
                    # Non-datetime values used to go unchecked, so a wrong
                    # property value could never fail the test.
                    test_case.assertEqual(doc_prop.value, prop[1])

        # Check children
        doc_headlines = doc['children']
        if self.children is None:
            test_case.assertEqual(len(doc_headlines), 0)
        else:
            test_case.assertEqual(len(doc_headlines), len(self.children),
                                  self.title)
            for child, doc_child in zip(self.children, doc_headlines):
                child.assert_matches(test_case, doc_child)

18
tox.ini Normal file
View File

@ -0,0 +1,18 @@
# Tox (http://tox.testrun.org/) is a tool for running tests
# in multiple virtualenvs. This configuration file will run the
# test suite on all supported python versions. To use it, "pip install tox"
# and then run "tox" from this directory.
[tox]
# envlist = py27,py34,py35,py36,py37
envlist = py38
[testenv]
commands =
python -m pytest --cov-report term-missing --cov org_dom tests
deps =
-r requirements.txt
pytest
pytest-cov
setenv =
PYTHONPATH = {toxinidir}