63 lines
1.5 KiB
Python
63 lines
1.5 KiB
Python
from xml.dom.minidom import parseString
|
|
|
|
def read_headers(fname):
    """Read the leading header of a dump file, up to the ``</siteinfo>`` line.

    The first line of the file is kept as ``top_header``.  Every following
    line, up to and including the one whose stripped content equals
    ``</siteinfo>``, is collected in ``siteinfo``.  If no such line exists,
    the whole file is consumed.

    Args:
        fname: Path of the file to read.

    Returns:
        A tuple ``(offset, headers)``.  ``offset`` is the text-stream
        position just past the header, as reported by ``tell()``, so it can
        be passed to ``seek()`` on a text-mode handle of the same file.
        ``headers`` is a dict with the keys ``'siteinfo'`` (list of lines,
        newlines included) and ``'top_header'`` (the first line, or ``None``
        for an empty file).
    """
    top_header = None
    siteinfo = []
    # NOTE(review): assumes the dump is UTF-8 encoded, as MediaWiki XML
    # exports normally are -- confirm if other encodings must be supported.
    with open(fname, encoding='utf-8') as f:
        # Use readline() instead of `for line in f`: tell() is disabled while
        # a text file is being iterated with next().
        for line in iter(f.readline, ''):
            if top_header is None:
                top_header = line
                continue

            siteinfo.append(line)
            if line.strip() == '</siteinfo>':
                break
        # Summing len(line) counts decoded characters, which differs from the
        # file position as soon as the header holds a multi-byte character or
        # translated line endings; tell() gives a position seek() accepts.
        offset = f.tell()

    return offset, {
        'siteinfo': siteinfo,
        'top_header': top_header,
    }
|
|
|
|
|
|
def _first_child_text(element):
    """Return the text held by *element*'s first child, or ``''`` if empty."""
    children = element.childNodes
    # An empty element (``<text />`` or ``<text></text>``) has no child nodes.
    return children[0].wholeText if children else ''


def parse_block(xmlblock):
    """Parse one XML fragment into its title and wikitext.

    The title is taken from the first ``<title>`` element, and the wikitext
    from the first ``<text>`` element inside the first ``<revision>``
    element.  Empty ``<title>`` or ``<text>`` elements yield an empty string.

    Args:
        xmlblock: String holding a single well-formed XML document.

    Returns:
        A dict with the keys ``'title'`` and ``'wikitext'``.

    Raises:
        xml.parsers.expat.ExpatError: If *xmlblock* is not well-formed XML.
        IndexError: If a ``<title>``, ``<revision>`` or ``<text>`` element
            is missing.
    """
    xmlentry = parseString(xmlblock)
    title_node = xmlentry.getElementsByTagName('title')[0]
    revision_node = xmlentry.getElementsByTagName('revision')[0]
    text_node = revision_node.getElementsByTagName('text')[0]
    return {
        'title': _first_child_text(title_node),
        'wikitext': _first_child_text(text_node),
    }
|
|
|
|
|
|
class WiktionaryReaderIter:
    """Iterator that yields one parsed page per ``</page>``-terminated block.

    The iterator opens its own handle on ``reader.fname`` and starts reading
    at ``reader.data_start``.  The handle is closed once the end of the file
    is reached, or earlier through ``close()``.
    """

    def __init__(self, reader):
        """Open the reader's file and position it at the start of the data.

        Args:
            reader: Object exposing ``fname`` (path of the file) and
                ``data_start`` (position to seek to before reading).
        """
        # NOTE(review): assumes the dump is UTF-8 encoded, as MediaWiki XML
        # exports normally are -- confirm if other encodings must be supported.
        self.f = open(reader.fname, encoding='utf-8')
        try:
            self.f.seek(reader.data_start)
        except BaseException:
            # Do not leak the handle when the seek position is rejected.
            self.f.close()
            raise

    def __iter__(self):
        """Return ``self`` so the object satisfies the iterator protocol."""
        return self

    def __next__(self):
        """Return the next parsed page.

        Lines are accumulated up to and including the next line whose
        stripped content equals ``</page>``; the joined block is then handed
        to ``parse_block``.

        Raises:
            StopIteration: When no further page is available.
        """
        if self.f.closed:
            raise StopIteration

        block = []
        for line in self.f:
            block.append(line)
            if line.strip() == '</page>':
                return parse_block(''.join(block))

        # End of file reached without a closing </page>.
        self.close()
        if any(line.lstrip().startswith('<page') for line in block):
            # A page was started but never closed: hand it to the parser so
            # the truncation is reported instead of being dropped silently.
            return parse_block(''.join(block))
        # Only trailing non-page lines (such as the closing root tag) were
        # left; they are not a page and must not reach the parser.
        raise StopIteration

    def close(self):
        """Close the underlying file handle; calling it again is harmless."""
        self.f.close()
|
|
|
|
|
|
class WiktionaryReader:
    """Iterable over the pages of a dump file.

    Construction reads the file header through ``read_headers`` and keeps
    the resulting data position and header dict; each iteration starts a
    new ``WiktionaryReaderIter`` over the same file.
    """

    def __init__(self, fname):
        """Remember *fname* and read its header.

        Args:
            fname: Path of the file to read.
        """
        start, headers = read_headers(fname)
        self.fname = fname
        self.data_start = start
        self.headers = headers

    def __iter__(self):
        """Return a fresh iterator over this reader's pages."""
        page_iter = WiktionaryReaderIter(self)
        return page_iter
|