from xml.dom.minidom import parseString


def read_headers(fname, encoding=None):
    """Read the header section at the top of a dump file.

    The first line of the file is kept as the top header.  Every following
    line, up to and including the first blank (whitespace-only) line, is
    collected as site information.

    Args:
        fname: Path of the file to read.
        encoding: Text encoding passed to ``open``; ``None`` keeps the
            platform default.

    Returns:
        A ``(data_start, headers)`` tuple.  ``data_start`` is the position,
        as reported by ``tell()``, of the first line after the header
        section; it is meant to be handed to ``seek()`` on a file opened
        the same way.  ``headers`` is a dict with the keys ``'siteinfo'``
        (list of lines) and ``'top_header'`` (first line, or ``None`` for
        an empty file).
    """
    top_header = None
    siteinfo = []
    with open(fname, encoding=encoding) as f:
        # readline() is used instead of ``for line in f`` because iterating
        # a text file disables tell().  A character count is not a valid
        # seek target for text files (multi-byte characters and newline
        # translation make it differ from the real position).
        for line in iter(f.readline, ''):
            if top_header is None:
                top_header = line
                continue
            siteinfo.append(line)
            if not line.strip():
                break
        offset = f.tell()
    return offset, {
        'siteinfo': siteinfo,
        'top_header': top_header,
    }


def _leading_text(element):
    """Return the text at the start of *element*, or '' if it has no children."""
    first = element.firstChild
    if first is None:
        # An empty element such as <text></text> has no child nodes at all.
        return ''
    return first.wholeText


def parse_block(xmlblock):
    """Parse one XML entry and extract its title and wikitext.

    Args:
        xmlblock: String holding a single well-formed XML element that
            contains a ``title`` element and a ``revision`` element with a
            ``text`` element inside it.

    Returns:
        A dict with the keys ``'title'`` and ``'wikitext'``.  An empty
        element yields an empty string.

    Raises:
        xml.parsers.expat.ExpatError: If *xmlblock* is not well-formed XML.
        IndexError: If a required element is missing.
    """
    # NOTE(review): minidom offers no protection against maliciously
    # constructed XML; only feed it input from a trusted source.
    xmlentry = parseString(xmlblock)
    title_element = xmlentry.getElementsByTagName('title')[0]
    revision_element = xmlentry.getElementsByTagName('revision')[0]
    text_element = revision_element.getElementsByTagName('text')[0]
    return {
        'title': _leading_text(title_element),
        'wikitext': _leading_text(text_element),
    }


class WiktionaryReaderIter:
    """Iterator yielding one parsed entry per blank-line-separated block."""

    def __init__(self, reader):
        """Open the reader's file and position it at the first data block.

        Args:
            reader: Object exposing ``fname`` and ``data_start`` and,
                optionally, ``encoding``.
        """
        self.f = open(reader.fname, encoding=getattr(reader, 'encoding', None))
        try:
            self.f.seek(reader.data_start)
        except Exception:
            # Do not leak the handle when the stored position is unusable.
            self.f.close()
            raise

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next parsed entry, closing the file once exhausted."""
        if self.f.closed:
            raise StopIteration
        block = []
        for line in self.f:
            if line.strip():
                block.append(line)
            elif block:
                # A blank line terminates the block collected so far.
                break
            # Otherwise this is a stray blank line before any content;
            # skip it instead of handing an empty document to the parser.
        if not block:
            self.close()
            raise StopIteration
        return parse_block(''.join(block))

    def close(self):
        """Close the underlying file; safe to call more than once."""
        self.f.close()


class WiktionaryReader:
    """Iterable over the entries of a dump file that starts with a header.

    Attributes set by the constructor: ``fname``, ``encoding``,
    ``data_start`` and ``headers`` (the last two come from
    ``read_headers``).
    """

    def __init__(self, fname, encoding=None):
        """Read the headers of *fname* and remember where the data starts.

        Args:
            fname: Path of the file to read.
            encoding: Text encoding used for every read of the file;
                ``None`` keeps the platform default.
        """
        self.fname = fname
        self.encoding = encoding
        self.data_start, self.headers = read_headers(fname, encoding=encoding)

    def __iter__(self):
        # Each iteration gets its own file handle, so the reader can be
        # iterated several times.
        return WiktionaryReaderIter(self)