Hyper-basic Wiktionary reader.
This commit is contained in:
parent
5bf75cf03e
commit
a1925f5383
1
wikitionary/.gitignore
vendored
Normal file
1
wikitionary/.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
|||||||
|
data.xml
|
10
wikitionary/parse.py
Normal file
10
wikitionary/parse.py
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
import wiktionary_reader
|
||||||
|
|
||||||
|
# Walk the dump in order, echoing each title, and stop at the first entry
# titled 'avatar' after printing its wikitext.
reader = wiktionary_reader.WiktionaryReader('data.xml')

for i, entry in enumerate(reader):
    title = entry['title']
    print(title)
    if title != 'avatar':
        continue
    print(entry['wikitext'])
    break

# Zero-based index of the last entry that was read.
print(i)
|
1
wikitionary/requirements.txt
Normal file
1
wikitionary/requirements.txt
Normal file
@ -0,0 +1 @@
|
|||||||
|
libarchive
|
95232
wikitionary/sample
Normal file
95232
wikitionary/sample
Normal file
File diff suppressed because it is too large
Load Diff
62
wikitionary/wiktionary_reader.py
Normal file
62
wikitionary/wiktionary_reader.py
Normal file
@ -0,0 +1,62 @@
|
|||||||
|
from xml.dom.minidom import parseString
|
||||||
|
|
||||||
|
def read_headers(fname):
    """Read the preamble of a dump file, up to and including ``</siteinfo>``.

    The first line of the file is kept as the top header; every following
    line, through the one whose stripped content is ``</siteinfo>``, is
    collected as the site-info section.

    Args:
        fname: Path of the XML dump file.

    Returns:
        A ``(offset, headers)`` tuple. ``offset`` is the number of bytes
        consumed, i.e. the file position right after the ``</siteinfo>``
        line (or the file size if that line never appears). ``headers`` is
        a dict with the keys ``'siteinfo'`` (list of decoded lines) and
        ``'top_header'`` (the first line, or ``None`` for an empty file).

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the preamble is not valid UTF-8.
    """
    top_header = None
    siteinfo = []
    offset = 0

    # Read raw bytes so the running total is a real byte position that can
    # be handed to ``seek()``. Counting decoded characters undercounts as
    # soon as the preamble holds a multi-byte character.
    with open(fname, 'rb') as f:
        for raw in f:
            offset += len(raw)
            # NOTE(review): the dump is assumed to be UTF-8 encoded.
            line = raw.decode('utf-8')

            if top_header is None:
                top_header = line
                continue

            siteinfo.append(line)
            if line.strip() == '</siteinfo>':
                break

    return offset, {
        'siteinfo': siteinfo,
        'top_header': top_header,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def _node_text(node):
    """Return the text at the start of *node*, or '' if it has no children."""
    children = node.childNodes
    return children[0].wholeText if children else ''


def parse_block(xmlblock):
    """Extract the title and wikitext from one ``<page>`` XML fragment.

    Args:
        xmlblock: String holding a single well-formed XML element that
            contains a ``<title>`` element and a ``<revision>`` element
            with a ``<text>`` child.

    Returns:
        A dict with the keys ``'title'`` and ``'wikitext'``. An element
        that is present but empty (for example ``<text />``) yields ``''``
        instead of raising.

    Raises:
        xml.parsers.expat.ExpatError: If *xmlblock* is not well-formed XML.
        IndexError: If the ``<title>``, ``<revision>`` or ``<text>``
            element is missing altogether.
    """
    xmlentry = parseString(xmlblock)
    try:
        title_node = xmlentry.getElementsByTagName('title')[0]
        text_node = (xmlentry
                     .getElementsByTagName('revision')[0]
                     .getElementsByTagName('text')[0])
        return {
            'title': _node_text(title_node),
            'wikitext': _node_text(text_node),
        }
    finally:
        # Only plain strings leave this function, so the DOM tree can be
        # released right away instead of waiting for the cycle collector.
        xmlentry.unlink()
|
||||||
|
|
||||||
|
|
||||||
|
class WiktionaryReaderIter:
    """Iterator producing one parsed entry per ``<page>`` block of a dump.

    The file named by ``reader.fname`` is opened and positioned at
    ``reader.data_start``. Lines are then accumulated until one whose
    stripped content is ``</page>``, and the accumulated block is handed
    to ``parse_block``.
    """

    def __init__(self, reader):
        """Open the dump and skip to the first page.

        Args:
            reader: Object exposing ``fname`` (path of the dump) and
                ``data_start`` (position at which the page data begins).
        """
        # Binary mode: seeking to an arbitrary integer position is only
        # well defined on a binary file object.
        self.f = open(reader.fname, 'rb')
        self.f.seek(reader.data_start)

    def __iter__(self):
        """Return the iterator itself, as the iterator protocol requires."""
        return self

    def __next__(self):
        """Return the next parsed page.

        Raises:
            StopIteration: When no further complete ``<page>`` block is
                left. Trailing lines that are not terminated by
                ``</page>`` (such as the closing root tag) end the
                iteration instead of being parsed.
        """
        if self.f.closed:
            raise StopIteration

        block = []
        for raw in self.f:
            block.append(raw)
            if raw.strip() == b'</page>':
                # NOTE(review): the dump is assumed to be UTF-8 encoded.
                return parse_block(b''.join(block).decode('utf-8'))

        # End of file reached without completing another page.
        self.close()
        raise StopIteration

    def close(self):
        """Close the underlying file; safe to call more than once."""
        self.f.close()
|
||||||
|
|
||||||
|
|
||||||
|
class WiktionaryReader:
    """Iterable over the entries of a Wiktionary XML dump file.

    Attributes set at construction time:
        fname: Path of the dump, as given by the caller.
        data_start: Position returned by ``read_headers``; iteration
            starts reading from there.
        headers: Header dict returned by ``read_headers``.
    """

    def __init__(self, fname):
        """Remember the file path and read the dump preamble once."""
        self.fname = fname
        start, header_info = read_headers(fname)
        self.data_start = start
        self.headers = header_info

    def __iter__(self):
        """Return a new iterator over the dump for every call."""
        page_iter = WiktionaryReaderIter(self)
        return page_iter
|
Loading…
Reference in New Issue
Block a user