OK, here it goes. If copy-pasting into your editor does not work, try this temporary link:
http://plok.in/tmp/PlokParser.py
Just running it ("/path/to/python2.2 PlokParser.py") should show some results. If this does not work and you need more instructions on how to test PlokParser, please reply to this thread. NOTE: sgmllib.py and markupbase.py are needed, but Python usually finds them automatically if it was installed correctly.
Suggestions and improvements are appreciated.
Code:
# -*- coding: iso-8859-15 -*-
# $Id: PlokParser.py 213 2006-08-23 19:26:18Z arista $
# Copyright Aapo Rista and others 2006
# License GPL
import sgmllib
import urllib
import re
class PlokParser(sgmllib.SGMLParser):
    """A simple parser class. See exampledata.xml for data details.

    Every start tag that carries at least one attribute is recorded in
    ``self.lists`` as ``lists[tag] = [[sortfield, attr_dict], ...]``.
    create_tag() does the reverse: it serialises a tag name plus an
    attribute dictionary into an XML-like string.
    """

    # Characters which will be encoded to entities.
    # The single quote is deliberately left unencoded: create_tag() always
    # wraps attribute values in double quotes, so it needs no escaping.
    chartable = {
        u'&': u'&amp;',
        u'"': u'&quot;',
        u'<': u'&lt;',
        u'>': u'&gt;',
    }

    # Entities which will be decoded to characters.  This class-level dict is
    # only a placeholder so the attribute always exists; __init__ gives every
    # instance its own table so instances and subclasses never share state.
    entitytable = {}

    def __init__(self, verbose=0):
        """Initialise an object, passing 'verbose' to the superclass."""
        sgmllib.SGMLParser.__init__(self, verbose)
        # This will hold parsed tags until the parser is clear()'ed
        self.lists = {}
        # Construct a per-instance entitytable by swapping keys and values
        # of chartable (which a subclass may have overridden).
        self.entitytable = {}
        for (char, entity) in self.chartable.items():
            self.entitytable[entity] = char
        # re which matches all chars which must be entities in XML.  Each
        # char is escaped so that ']', '^', '-' or '\' in a custom chartable
        # cannot corrupt the character class.
        escaped = [re.escape(char) for char in self.chartable.keys()]
        self.char_re = re.compile(u"[%s]" % u"".join(escaped))
        # re which matches all possible entities you'd like to decode.
        # '&' is excluded from the entity body so that a stray ampersand
        # earlier in the text cannot swallow a real entity that follows it
        # (e.g. "a & b &amp; c").
        self.ent_re = re.compile(r'&[^&;]*;')

    def parse(self, s):
        """Parse the given string 's'.

        The string is fed to the parser and the parser is then closed, which
        forces all buffered data to be processed.  Results accumulate in
        self.lists until clear() is called.
        """
        self.feed(s)
        self.close()

    def clear(self):
        """Empty all lists."""
        self.lists = {}

    def decode_entity(self, match):
        """Callback function for re.sub in unknown_starttag().

        Returns the character that entitytable maps the matched entity to,
        or the matched text unchanged if the entity is not in the table.
        """
        entity = match.group()
        return self.entitytable.get(entity, entity)

    def encode_entity(self, match):
        """Callback function for re.sub in create_tag().

        Returns the entity that chartable maps the matched character to,
        or the matched character unchanged if it is not in the table.
        """
        char = match.group()
        return self.chartable.get(char, char)

    def unknown_starttag(self, tag, attrs):
        """Handle any start tag and store its attributes in self.lists[tag].

        'attrs' is a sequence of (name, value) pairs.  Entities listed in
        entitytable are decoded in every value.  Tags without attributes are
        ignored.  Each stored entry is a list [sortfield, attr_dict].
        """
        # NOTE(review): newer sgmllib versions reportedly resolve entity
        # references in quoted attribute values before this handler runs;
        # confirm values are not decoded twice on the target Python.
        attr_dict = {}
        for (name, value) in attrs:
            attr_dict[name] = self.ent_re.sub(self.decode_entity, value)
        if not attr_dict:
            return
        # sortfield is used to keep lists in order (see python's sort()):
        # it is the integer value of the "time" attribute, or 0 if the
        # attribute is absent.  A non-numeric "time" also falls back to 0
        # instead of aborting the whole parse with a ValueError.
        try:
            sortfield = int(attr_dict.get("time", 0))
        except ValueError:
            sortfield = 0
        self.lists.setdefault(tag, []).append([sortfield, attr_dict])

    def create_tag(self, tag, attr_dict):
        """Create XML-like tag including attributes found in attr_dict.

        Characters listed in chartable are replaced by their entities in
        every attribute value.  Returns a unicode string of the form
        <tag name="value" ...></tag>.
        """
        attr_list = []
        for (name, value) in attr_dict.items():
            encoded = self.char_re.sub(self.encode_entity, value)
            attr_list.append(u'%s="%s"' % (name, encoded))
        return u"<%s %s></%s>" % (tag, u" ".join(attr_list), tag)
if __name__ == "__main__":
    import time

    # Demonstration: build tags, parse them, rebuild tags from the parsed
    # data, parse again, and print every intermediate result.
    parser = PlokParser()

    # Build the first list of tag strings
    make = parser.create_tag
    first_tags = [
        make(u"img", {u"id":u"1234", u"name":u"foo.jpg", u"title":u"Mom & Dad"}),
        make(u"img", {u"id":u"42", u"name":u"bar.jpg", u"title":u"Mom's shoe"}),
        make(u"msg", {u"id":u"3187", u"sender":u"Aapo", u"text":u"Ugh"}),
        make(u"msg", {u"id":u"3188", u"sender":u"Teemu", u"text":u"<:->"}),
    ]
    # Join the tag strings into one xml string
    xml1 = u"\r\n".join(first_tags)

    # Parse it; the parsed data ends up in the dictionary parser.lists
    parser.parse(xml1)
    list1 = parser.lists.copy()

    # Re-create tags from the parsed attribute dictionaries
    second_tags = []
    for (tag, entries) in list1.items():
        for entry in entries:
            # entry is [sortfield, attr_dict]; only the dict is serialised
            second_tags.append(parser.create_tag(tag, entry[1]))
    xml2 = u"\r\n".join(second_tags)

    # Start from empty lists and parse the regenerated xml
    parser.clear()
    parser.parse(xml2)
    list2 = parser.lists.copy()

    # Print each result followed by an empty line (none after the last one)
    print(xml1)
    print("")
    print(list1)
    print("")
    print(xml2)
    print("")
    print(list2)