lang-model/wikitionary/wiktionary_reader.py

63 lines
1.5 KiB
Python

from xml.dom.minidom import parseString
def read_headers(fname):
    """Read the preamble of a dump file, up to the ``</siteinfo>`` line.

    The first line of the file is stored as ``top_header``.  Every following
    line, up to and including the first one whose stripped content equals
    ``</siteinfo>``, is collected in ``siteinfo``.  If no such line exists
    the whole file is consumed.

    The file is read in binary mode so the returned offset is an exact byte
    position.  Counting the characters of text-mode lines instead would
    under-count whenever the preamble contains multi-byte characters or CRLF
    line endings, and a later ``seek()`` would land inside the preamble.

    Args:
        fname: Path of the dump file.  Its content is assumed to be UTF-8.

    Returns:
        A tuple ``(offset, headers)``.  ``offset`` is the number of bytes
        consumed, i.e. the position of the first byte after the
        ``</siteinfo>`` line.  ``headers`` is a dict with the keys
        ``'siteinfo'`` (list of lines, line endings kept) and
        ``'top_header'`` (first line of the file, or ``None`` when the file
        is empty).

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If a preamble line is not valid UTF-8.
    """
    top_header = None
    siteinfo = []
    offset = 0
    with open(fname, 'rb') as f:
        for raw_line in f:
            # Count raw bytes, not decoded characters.
            offset += len(raw_line)
            line = raw_line.decode('utf-8')
            # Keep the strings identical to what universal-newline text mode
            # would have produced for CRLF-terminated files.
            if line.endswith('\r\n'):
                line = line[:-2] + '\n'
            if top_header is None:
                top_header = line
                continue
            siteinfo.append(line)
            if line.strip() == '</siteinfo>':
                break
    return offset, {
        'siteinfo': siteinfo,
        'top_header': top_header,
    }
def _first_child_text(element):
    """Return the text of *element*'s first child, or ``''`` if it has none."""
    first = element.firstChild
    # An empty element such as <text /> has no child node at all.
    return first.wholeText if first is not None else ''


def parse_block(xmlblock):
    """Parse one XML block and extract its title and revision text.

    Args:
        xmlblock: A string holding one well-formed XML fragment that
            contains a ``<title>`` element and a ``<revision>`` element
            with a ``<text>`` element inside it.

    Returns:
        A dict with the keys ``'title'`` and ``'wikitext'``.  Each value is
        the text of the first matching element, or an empty string when
        that element is empty.

    Raises:
        xml.parsers.expat.ExpatError: If *xmlblock* is not well-formed XML.
        IndexError: If the ``<title>``, ``<revision>`` or ``<text>`` element
            is missing.
    """
    document = parseString(xmlblock)
    title_element = document.getElementsByTagName('title')[0]
    revision_element = document.getElementsByTagName('revision')[0]
    text_element = revision_element.getElementsByTagName('text')[0]
    return {
        'title': _first_child_text(title_element),
        'wikitext': _first_child_text(text_element),
    }
class WiktionaryReaderIter:
    """Iterator that yields one parsed entry per ``</page>``-terminated block.

    The iterator opens its own handle on ``reader.fname``, positions it at
    ``reader.data_start`` and closes it once the input is exhausted (or when
    ``close()`` is called explicitly).
    """

    def __init__(self, reader):
        """Open the file named by *reader* and seek to its data start.

        Args:
            reader: Object exposing ``fname`` (path of the file) and
                ``data_start`` (position to seek to before reading).
        """
        # Explicit UTF-8 rather than the locale default; the dump is assumed
        # to be UTF-8 encoded.
        self.f = open(reader.fname, encoding='utf-8')
        self.f.seek(reader.data_start)

    def __iter__(self):
        """Return self, as required by the iterator protocol."""
        return self

    def __next__(self):
        """Return the next parsed block.

        Lines are accumulated until one strips to ``</page>``; the joined
        text is handed to ``parse_block``.

        Raises:
            StopIteration: When no further block is available, i.e. the rest
                of the file is empty, whitespace only, or just the closing
                ``</mediawiki>`` tag.
        """
        if self.f is None:
            # Already exhausted or closed: keep signalling the end.
            raise StopIteration
        block = []
        terminated = False
        for line in self.f:
            block.append(line)
            if line.strip() == '</page>':
                terminated = True
                break
        if not terminated:
            # End of file reached; nothing more can be read from the handle.
            self.close()
            # The closing root tag that follows the last page is not an
            # entry; parsing it would fail, so treat it as the end.
            if ''.join(block).strip() in ('', '</mediawiki>'):
                raise StopIteration
        # Any other unterminated remainder is still parsed so that truncated
        # input is reported by the parser instead of being dropped silently.
        return parse_block(''.join(block))

    def close(self):
        """Close the underlying file; safe to call more than once."""
        if self.f is not None:
            self.f.close()
            self.f = None
class WiktionaryReader:
    """Iterable view over the entries of a dump file.

    Construction reads the file preamble once through ``read_headers``; each
    call to ``iter()`` then hands out a fresh ``WiktionaryReaderIter`` that
    starts reading at ``data_start``.
    """

    def __init__(self, fname):
        """Remember the path and load the preamble.

        Args:
            fname: Path of the dump file.
        """
        self.fname = fname
        # read_headers returns (position after the preamble, header dict).
        start, header_info = read_headers(fname)
        self.data_start = start
        self.headers = header_info

    def __iter__(self):
        """Return a new, independent iterator over the entries."""
        entries = WiktionaryReaderIter(self)
        return entries