Hyper-basic Wiktionary reader.
This commit is contained in:
parent
5bf75cf03e
commit
a1925f5383
1
wikitionary/.gitignore
vendored
Normal file
1
wikitionary/.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
||||
data.xml
|
10
wikitionary/parse.py
Normal file
10
wikitionary/parse.py
Normal file
@ -0,0 +1,10 @@
|
||||
import wiktionary_reader
|
||||
|
||||
# Walk the dump in 'data.xml', echoing every title, until the entry titled
# 'avatar' is found; its wikitext is printed and the scan stops there.
reader = wiktionary_reader.WiktionaryReader('data.xml')
for i, entry in enumerate(reader):
    title = entry['title']
    print(title)
    if title != 'avatar':
        continue
    print(entry['wikitext'])
    break

# Zero-based index of the last entry that was read.
print(i)
|
1
wikitionary/requirements.txt
Normal file
1
wikitionary/requirements.txt
Normal file
@ -0,0 +1 @@
|
||||
libarchive
|
95232
wikitionary/sample
Normal file
95232
wikitionary/sample
Normal file
File diff suppressed because it is too large
Load Diff
62
wikitionary/wiktionary_reader.py
Normal file
62
wikitionary/wiktionary_reader.py
Normal file
@ -0,0 +1,62 @@
|
||||
from xml.dom.minidom import parseString
|
||||
|
||||
def read_headers(fname):
    """Read the leading header lines of a dump file.

    The first line of the file is kept as the top header; every following
    line up to and including the one whose stripped content is
    ``</siteinfo>`` is collected as the site info. If no such line exists,
    the whole file is consumed.

    Args:
        fname: Path of the file to read. Its content is decoded as UTF-8.

    Returns:
        A tuple ``(offset, headers)``. ``offset`` is the number of bytes
        consumed, i.e. the byte position right after the last header line,
        suitable for passing to ``seek``. ``headers`` is a dict with the keys
        ``'siteinfo'`` (list of decoded lines, line endings included) and
        ``'top_header'`` (the first line, or ``None`` for an empty file).
    """
    top_header = None
    siteinfo = []
    offset = 0
    # Binary mode: len() of each raw line is a byte count. Counting decoded
    # characters would give a wrong seek position as soon as a header line
    # contains a multi-byte character.
    with open(fname, 'rb') as f:
        for raw in f:
            offset += len(raw)
            line = raw.decode('utf-8')
            if top_header is None:
                top_header = line
                continue

            siteinfo.append(line)
            if line.strip() == '</siteinfo>':
                break
    return offset, {
        'siteinfo': siteinfo,
        'top_header': top_header,
    }
|
||||
|
||||
|
||||
def _node_text(node):
    """Return the text of *node*'s first child, or '' if it has no children."""
    if not node.childNodes:
        # An empty element such as <text /> has no child text node at all.
        return ''
    return node.childNodes[0].wholeText


def parse_block(xmlblock):
    """Parse one XML block and extract its title and wikitext.

    Args:
        xmlblock: String holding a well-formed XML fragment that contains a
            ``title`` element and a ``revision`` element with a ``text``
            element inside it.

    Returns:
        A dict with the keys ``'title'`` and ``'wikitext'``. Either value is
        the empty string when the corresponding element is empty.

    Raises:
        xml.parsers.expat.ExpatError: If *xmlblock* is not well-formed XML.
        IndexError: If a ``title``, ``revision`` or ``text`` element is
            missing.
    """
    xmlentry = parseString(xmlblock)
    title_node = xmlentry.getElementsByTagName('title')[0]
    # Only the first revision is considered, and only its first text element.
    revision_node = xmlentry.getElementsByTagName('revision')[0]
    text_node = revision_node.getElementsByTagName('text')[0]
    return {
        'title': _node_text(title_node),
        'wikitext': _node_text(text_node),
    }
|
||||
|
||||
|
||||
class WiktionaryReaderIter:
    """Iterator that yields one parsed entry per ``</page>``-terminated block.

    The file named by ``reader.fname`` is opened and positioned at
    ``reader.data_start``; each step collects lines up to and including the
    next line whose stripped content is ``</page>`` and passes the joined
    text to ``parse_block``.
    """

    def __init__(self, reader):
        # Explicit UTF-8 so decoding does not depend on the locale default.
        self.f = open(reader.fname, encoding='utf-8')
        self.f.seek(reader.data_start)

    def __iter__(self):
        # An iterator is its own iterable.
        return self

    def __next__(self):
        """Return the next parsed entry.

        Raises:
            StopIteration: When no further page remains in the file. The
                file is closed at that point, and later calls keep raising
                StopIteration.
        """
        if self.f.closed:
            raise StopIteration

        block = []
        complete = False
        for line in self.f:
            block.append(line)
            if line.strip() == '</page>':
                complete = True
                break

        text = ''.join(block)
        # What follows the last page (e.g. the closing root tag) is not an
        # entry: stop instead of handing it to the parser. A leftover that
        # does start a page is still parsed, so a truncated page surfaces as
        # a parse error rather than being dropped silently.
        if not complete and '<page' not in text:
            self.f.close()
            raise StopIteration

        return parse_block(text)
|
||||
|
||||
|
||||
class WiktionaryReader:
    """Iterable handle on a dump file.

    Construction stores the path and the result of ``read_headers`` for it;
    every ``iter()`` call produces a new ``WiktionaryReaderIter`` bound to
    this reader.
    """

    def __init__(self, fname):
        # The path is kept because each iterator reopens the file by name.
        self.fname = fname
        start, headers = read_headers(fname)
        self.data_start = start
        self.headers = headers

    def __iter__(self):
        entry_iterator = WiktionaryReaderIter(self)
        return entry_iterator
|
Loading…
Reference in New Issue
Block a user