Python Programming/XML Tools
Appearance
From Wikibooks, open books for an open world
Introduction
Python includes several modules for manipulating XML.
xml.sax.handler
# Minimal SAX example: parse a small XML document and print a line when the
# document starts and for every element that is opened.
import xml.sax.handler as saxhandler
import xml.sax as saxparser


class MyReport:
    """Placeholder object handed to the handler; it only stores ``Y = 1``."""

    def __init__(self):
        self.Y = 1


class MyCH(saxhandler.ContentHandler):
    """Content handler that prints document-start and element-start events."""

    def __init__(self, report):
        """Store *report* on the handler.

        Args:
            report: Object kept as ``self.report``; this handler does not
                read it anywhere else.
        """
        # Initialise the ContentHandler base class so its own state is set
        # up before the parser starts driving this handler.
        super().__init__()
        self.X = 1
        self.report = report

    def startDocument(self):
        """Print a marker when the parser begins the document."""
        print('startDocument')

    def startElement(self, name, attrs):
        """Print the name of each opened element; *attrs* is ignored."""
        print('Element:', name)


report = MyReport()  #for future use
ch = MyCH(report)

# The leading backslash keeps the document from starting with a blank line.
xml = """\
<collection>
  <comic title=\"Sandman\" number='62'>
    <writer>Neil Gaiman</writer>
    <penciller pages='1-9,18-24'>Glyn Dillon</penciller>
    <penciller pages="10-17">Charles Vess</penciller>
  </comic>
</collection>
"""
print(xml)
saxparser.parseString(xml, ch)
xml.dom.minidom
An example of parsing an RSS feed with the DOM:
# Fetch an RSS feed and print the title, link and description of each item.
from urllib.request import urlopen
from xml.dom import minidom as dom


def fetchPage(url):
    """Download *url* and return the raw response body.

    Args:
        url: Address of the document to fetch.

    Returns:
        The response body as bytes, which ``extract`` accepts directly.
    """
    # The context manager closes the connection even if reading fails.
    with urlopen(url) as response:
        return response.read()


def _child_text(node, tag):
    """Return the text of the first *tag* element under *node*, or ''."""
    matches = node.getElementsByTagName(tag)
    if not matches:
        return ''
    first = matches[0].firstChild
    # An empty element has no child; a non-text child has no wholeText.
    return getattr(first, 'wholeText', '') if first is not None else ''


def extract(page):
    """Print ``title link description`` for every non-empty <item> in *page*.

    Args:
        page: The feed document as a string or bytes.

    A missing or empty title, link or description is printed as an empty
    string instead of raising.
    """
    document = dom.parseString(page)
    for item in document.getElementsByTagName('item'):
        if not item.hasChildNodes():
            continue
        t = _child_text(item, 'title')
        l = _child_text(item, 'link')
        d = _child_text(item, 'description')
        print(t, l, d)


if __name__ == '__main__':
    page = fetchPage("http://rss.slashdot.org/Slashdot/slashdot")
    extract(page)
The sample XML document in the SAX example is taken from the PyXML documentation.