Hyper-basic Wiktionary reader.

This commit is contained in:
kenkeiras 2017-05-09 23:59:01 +02:00
parent 5bf75cf03e
commit a1925f5383
5 changed files with 95306 additions and 0 deletions

1
wikitionary/.gitignore vendored Normal file
View File

@ -0,0 +1 @@
data.xml

10
wikitionary/parse.py Normal file
View File

@ -0,0 +1,10 @@
# Demo script: scan the dump in 'data.xml' and print the wikitext of the 'avatar' entry.
import wiktionary_reader
# The reader is iterable; each item is used below as a mapping with 'title' and 'wikitext' keys.
reader = wiktionary_reader.WiktionaryReader('data.xml')
for i, entry in enumerate(reader):
# Echo every title as it is visited.
print(entry['title'])
if (entry['title'] == 'avatar'):
# Target entry found: print its wikitext and stop scanning.
print(entry['wikitext'])
break
# NOTE(review): indentation was lost in this view; presumably this runs after the
# loop and reports the index of the last entry visited -- confirm against the file.
print(i)

View File

@ -0,0 +1 @@
libarchive

95232
wikitionary/sample Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,62 @@
from xml.dom.minidom import parseString
def read_headers(fname):
    """Read the leading header lines of a MediaWiki XML dump.

    The first line of the file is kept as the top header; every following
    line, up to and including the one that reads ``</siteinfo>``, is
    collected as site information.

    Args:
        fname: Path of the XML dump to inspect.

    Returns:
        A ``(offset, headers)`` tuple.  ``offset`` is the number of bytes
        consumed, i.e. the file position of the first line after
        ``</siteinfo>``.  ``headers`` is a dict with the keys ``'siteinfo'``
        (list of header lines) and ``'top_header'`` (the first line, or
        ``None`` for an empty file).

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If a header line is not valid UTF-8.
    """
    top_header = None
    siteinfo = []
    offset = 0
    # Binary mode: the offset is later handed to seek(), so it has to count
    # bytes.  Counting decoded characters drifts as soon as the header holds
    # non-ASCII text or CRLF line endings.
    with open(fname, 'rb') as f:
        for raw in f:
            offset += len(raw)
            line = raw.decode('utf-8')
            if top_header is None:
                top_header = line
                continue
            siteinfo.append(line)
            if line.strip() == '</siteinfo>':
                break
    return offset, {
        'siteinfo': siteinfo,
        'top_header': top_header,
    }
def _first_text(element):
    """Return the text at the start of *element*, or '' if it has no children."""
    children = element.childNodes
    return children[0].wholeText if children else ''


def parse_block(xmlblock):
    """Extract the title and wikitext from one ``<page>`` XML fragment.

    Args:
        xmlblock: A string holding a single well-formed XML element that
            contains a ``<title>`` and a ``<revision>`` with a ``<text>``.

    Returns:
        A dict with the keys ``'title'`` and ``'wikitext'``.  An element
        without content (e.g. ``<text />``) yields an empty string.

    Raises:
        xml.parsers.expat.ExpatError: If *xmlblock* is not well-formed XML.
        IndexError: If a ``<title>``, ``<revision>`` or ``<text>`` element
            is missing.
    """
    xmlentry = parseString(xmlblock)
    try:
        title_node = xmlentry.getElementsByTagName('title')[0]
        text_node = (xmlentry
                     .getElementsByTagName('revision')[0]
                     .getElementsByTagName('text')[0])
        return {
            'title': _first_text(title_node),
            'wikitext': _first_text(text_node),
        }
    finally:
        # minidom trees contain reference cycles; unlink so each page is
        # released promptly instead of waiting for the cyclic collector.
        xmlentry.unlink()
class WiktionaryReaderIter:
    """Iterator yielding one parsed entry per ``<page>`` element of a dump.

    The file named by ``reader.fname`` is opened and positioned at
    ``reader.data_start``; each call to ``__next__`` consumes lines up to
    and including the next ``</page>`` line and passes them to
    ``parse_block``.
    """

    def __init__(self, reader):
        """Open the dump and skip to the first page.

        Args:
            reader: Object exposing ``fname`` (path of the dump) and
                ``data_start`` (offset at which the pages begin).
        """
        # Binary mode makes seek() with a computed offset well defined and
        # keeps decoding independent of the locale's default encoding.
        self.f = open(reader.fname, 'rb')
        self.f.seek(reader.data_start)

    def __iter__(self):
        """Return self, completing the iterator protocol."""
        return self

    def __next__(self):
        """Return the next parsed page.

        Raises:
            StopIteration: When no further complete page is available; the
                underlying file is closed at that point.
        """
        if self.f.closed:
            raise StopIteration
        block = []
        for raw in self.f:
            block.append(raw)
            if raw.strip() == b'</page>':
                return parse_block(b''.join(block).decode('utf-8'))
        # End of file reached without a closing </page>.
        self.f.close()
        if any(raw.strip() == b'<page>' for raw in block):
            # A page was started but never finished (truncated dump): hand
            # it to the parser so the malformed input is still reported.
            return parse_block(b''.join(block).decode('utf-8'))
        # Only trailer lines such as </mediawiki> were left: normal end.
        raise StopIteration
class WiktionaryReader:
    """Iterable view over the entries of a dump file.

    The header section is read once at construction time; every
    iteration starts a fresh pass over the entries that follow it.
    """

    def __init__(self, fname):
        """Remember the dump path and read its header section.

        Args:
            fname: Path of the dump file.
        """
        self.fname = fname
        header_end, header_info = read_headers(fname)
        self.data_start = header_end
        self.headers = header_info

    def __iter__(self):
        """Return a new iterator positioned at the first entry."""
        entries = WiktionaryReaderIter(self)
        return entries