#!/usr/bin/python
"""
glean.py -- GRDDL client implementation

USAGE:
  python glean.py --help
  python glean.py [opts] --output output.rdf http://.../input.html
    opts:
      --copy=input-copy
      --debug

to glean RDF statements as per transformations in input
and write them to output.rdf. input-copy is an optional
local copy of the input document.

REQUIREMENTS:
  /usr/bin/xsltproc

REFERENCES:

  GRDDL Data Views: Getting Started, Learning More
  http://www.w3.org/2003/g/data-view

  and
  Gleaning Resource Descriptions from Dialects of Languages (GRDDL)
  http://www.w3.org/2004/01/rdxh/spec
"""

#TODO: peeking-into-profile-docs (working; need to update reference docs)
#
#ISSUE: I'd rather use pipes than temp files,
#       but spawn() is more secure than popen()
#       since it allows args to be passed individually
#       rather than as one command string to be parsed
#       again by a shell.
#
#
#LICENSE: Open Source: Share and Enjoy.
#
#GRDDL Workspace: http://www.w3.org/2003/g/
#
#Copyright 2002-2003 World Wide Web Consortium, (Massachusetts
#Institute of Technology, European Research Consortium for
#Informatics and Mathematics, Keio University). All Rights
#Reserved. This work is distributed under the W3C(R) Software License
#  http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231
#in the hope that it will be useful, but WITHOUT ANY WARRANTY;
#without even the implied warranty of MERCHANTABILITY or FITNESS FOR
#A PARTICULAR PURPOSE.

# See change log at end
__version__ = '$Id: glean.py,v 1.11 2005/03/23 19:07:58 connolly Exp $'

import sys
import getopt
import os
import tempfile
import urllib2

import RDF
# Redland RDF Application Framework - Python API Reference:
# http://www.redland.opensource.ac.uk/docs/pydoc/RDF.html#Model
# tested with: http://packages.debian.org/python2.3-librdf 0.9.16-1_i386

XSLTPROC = "/usr/bin/xsltproc"
GET_TRANSFORMS = "getTransforms.xsl"

HTMLns = 'http://www.w3.org/1999/xhtml'
DataViewDoc = "http://www.w3.org/2003/g/data-view"
DataView = RDF.NS(DataViewDoc + "#")
RDF_mediaType = "application/rdf+xml"


def grddl(addr, inf=None, already=None):
    """Glean the formal meaning of the doc at addr; return it as an RDF Model.

    addr    -- address (URI) of the source document; also used as the base
               URI when parsing the RDF that the transformations produce.
    inf     -- optional name of a local file holding the contents of the
               doc at addr.  When None, the document is fetched into a
               temporary file which is removed before returning.
    already -- list of profile/namespace addresses that must not be
               followed again (loop prevention).  Defaults to a fresh
               [DataViewDoc, HTMLns] list for every call.  A list passed in
               by the caller is extended in place by getTransforms().

    Raises IOError when fetching fails or xsltproc exits non-zero.
    """
    if already is None:
        # Build the default per call: getTransforms() appends to this list,
        # so a shared default would leak state from one call into the next.
        already = [DataViewDoc, HTMLns]
    progress("grddl(", (addr, inf, already), ")")

    if inf is None:
        (tmp, meta) = localCopy(addr, (RDF_mediaType,
                                       "text/html", "text/xml", "*"))
        try:
            # A response without a Content-Type header is treated as
            # "not RDF" instead of failing with KeyError.
            ctype = (meta.get("content-type") or "").lower()
            if ctype.startswith(RDF_mediaType):
                return parseRDF(tmp, addr)
            return grddl(addr, tmp, already)
        finally:
            os.remove(tmp)

    data = RDF.Model()
    rdfp = RDF.Parser(name="rdfxml")
    (rdffd, rdffn) = tempfile.mkstemp(".rdf", "grddl-statements")
    os.close(rdffd)
    try:
        # Every transformation writes to the same scratch file, which is
        # parsed into the model before the next transformation runs.
        for xform in getTransforms(inf, addr, already):
            doXSLT(xform, inf, rdffn, {"xmlfile": addr})
            progress("parsing...", rdffn)
            rdfp.parse_into_model(data, "file:" + rdffn, RDF.Uri(addr))
            progress("data size:", data.size())
        return data
    finally:
        # Keep the scratch file only in debug mode, where its name has been
        # reported via progress() so it can be inspected; otherwise it
        # would accumulate in the temp directory on every call.
        if not DEBUG:
            os.remove(rdffn)


def parseRDF(inf, addr):
    """Parse the RDF/XML file named inf, with base URI addr, into a new Model."""
    graph = RDF.Model()
    rdfp = RDF.Parser(name="rdfxml")
    rdfp.parse_into_model(graph, "file:" + inf, RDF.Uri(addr))
    return graph


def localCopy(addr, acceptTypes):
    """Fetch addr into a temporary file.

    acceptTypes -- sequence of media types, joined with "," into the
                   Accept request header.

    Returns (filename, headers) where headers is the object returned by the
    response's info() method.  The caller is responsible for removing the
    file.  If writing the copy fails, the temporary file is removed here
    and the error propagates.
    """
    req = urllib2.Request(url=addr)
    req.add_header("Accept", ",".join(acceptTypes))
    f = urllib2.urlopen(req)
    try:
        meta = f.info()
        (tfd, tnam) = tempfile.mkstemp(".txt", "grddl-xforms")
        written = 0
        try:
            # Binary mode: store the response bytes untouched.
            tf = os.fdopen(tfd, "wb")
            try:
                tf.write(f.read())
            finally:
                tf.close()
            written = 1
        finally:
            if not written:
                os.remove(tnam)
        return tnam, meta
    finally:
        f.close()


def getTransforms(inf, base, already):
    """Return the list of transformation addresses that apply to a document.

    inf     -- name of the local file holding the document.
    base    -- address of the document, passed to GET_TRANSFORMS as the
               "xmlfile" stylesheet parameter.
    already -- list of profile/namespace addresses that are not followed;
               each profile successfully followed here is appended to it.

    GET_TRANSFORMS is run over inf and its text output is read one line at
    a time as "<key> <uri>".  Lines keyed 'P' or 'R' name profile/namespace
    documents, which are themselves gleaned (via grddl) for
    profileTransformation and namespaceTransformation statements; any other
    key names a transformation directly.  Lines without both fields are
    skipped.
    """
    (outfd, outfname) = tempfile.mkstemp(".txt", "grddl-xforms")
    os.close(outfd)

    ret = []
    profiles = []
    try:
        doXSLT(GET_TRANSFORMS, inf, outfname, {"xmlfile": base})
        outfp = open(outfname)
        try:
            lines = outfp.readlines()
        finally:
            outfp.close()
    finally:
        os.remove(outfname)

    for l in lines:
        line = l.rstrip("\r\n")
        progress("getTransforms line:", line)
        fields = line.split(None, 1)
        if len(fields) != 2:
            # Blank or incomplete line: nothing to record.
            continue
        k, u = fields
        u = u.strip()
        if k in ('P', 'R'):
            profiles.append(u)
        else:
            ret.append(u)

    # @@add profileTransformation to spec, namespace doc
    lookups = (("PROFILE XFORM: ", DataView.profileTransformation),
               ("NAMESPACE XFORM: ", DataView.namespaceTransformation))
    for profile in profiles:
        if profile in already:
            continue
        progress("PROFILE: ", profile)
        try:
            profileData = grddl(profile, None, already + [profile])
        except IOError:  # @@ also catch parsing errors
            # Unreachable profile/namespace documents (e.g. 404) are skipped.
            continue

        for label, predicate in lookups:
            q = RDF.Statement(subject=RDF.Uri(profile),
                              predicate=predicate,
                              object=None)
            for s in profileData.find_statements(q):
                progress(label, s.object.uri)
                ret.append(str(s.object.uri))

        already.append(profile)

    return ret


def doXSLT(xform, inf, outf, params=None):
    """Run xsltproc, applying stylesheet xform to inf and writing to outf.

    params -- optional mapping of stylesheet parameter names to string
              values, each passed with --stringparam.

    Raises IOError (from spawn) when xsltproc exits with non-zero status.
    """
    if params is None:
        params = {}
    # args[0] is the program name as seen by the child process.
    args = ["xsltproc", "--novalid", "-o", outf]
    for k, v in params.items():
        args.extend(("--stringparam", k, v))
    spawn(XSLTPROC, args + [xform, inf])


def spawn(path, args):
    """Run the program at path with argument vector args and wait for it.

    Arguments are handed over individually (no shell is involved).
    Raises IOError(status, path, args) when the exit status is non-zero.
    """
    progress("SPAWN: ", " ".join(args))
    status = os.spawnv(os.P_WAIT, path, args)
    if status == 0:
        return
    raise IOError(status, path, args)


DEBUG = 0


def progress(*args):
    """When DEBUG is set, write the str() of each argument to stderr,
    with no separator between them, followed by a newline."""
    if not DEBUG:
        return
    for a in args:
        sys.stderr.write(str(a))
    sys.stderr.write("\n")


# All Things Pythonic
# Python main() functions
# by Guido van Rossum
# May 15, 2003
# http://www.artima.com/weblogs/viewpost.jsp?thread=4829

def main(argv=None):
    """Command-line entry point; returns the process exit status.

    Returns 0 on success or after --help, 2 on a usage error.
    """
    global DEBUG

    if argv is None:
        argv = sys.argv
    try:
        # "content=" is accepted as an alias of "copy=" because the usage
        # text has advertised --content.  Note: getopt prefix abbreviations
        # shorter than --cop / --con are ambiguous between the two.
        opts, args = getopt.getopt(argv[1:], "ho:c:d",
                                   ["help", "output=",
                                    "copy=", "content=", "debug"])
    except getopt.GetoptError:
        usage()
        return 2

    output = None
    inf = None
    for o, a in opts:
        if o in ("-d", "--debug"):
            DEBUG = 1
        elif o in ("-h", "--help"):
            usage()
            return 0
        elif o in ("-o", "--output"):
            output = a
        elif o in ("-c", "--copy", "--content"):
            inf = a

    # Exactly one input address and an output file are required.
    if not (output and len(args) == 1):
        usage()
        return 2

    data = grddl(args[0], inf)
    progress("GRDDL size:", data.size())
    RDF.Serializer().serialize_model_to_file(output, data)
    return 0


def usage():
    """Print the module docstring and version string to stdout."""
    print(__doc__)
    print(__version__)


if __name__ == '__main__':
    sys.exit(main())


# $Log: glean.py,v $
# Revision 1.11  2005/03/23 19:07:58  connolly
# added grddl txform pointers
#
# Revision 1.10  2004/06/04 20:53:40  connolly
# recover from 404s when following namespace and profile links
#
# Revision 1.9  2004/06/04 20:16:32  connolly
# - general XML support
# - peeking-into-namespace docs
#
# Revision 1.8  2004/06/04 18:39:53  connolly
# - fix recursion loop
# - getopt, usage()
#
# Revision 1.7  2004/05/28 19:35:21  connolly
# fixed usage doc
#
# Revision 1.6  2004/05/28 19:16:25  connolly
# point to debian package for python redland
#
# Revision 1.5  2004/05/28 19:07:23  connolly
# - peeking-into-profile-docs works!
# - grddl is called recursively, so there's a new arg,
#   already, to prevent loops
# - input file is now optional; command-line args reordered
# - likewise grddl() args changed
#
# Revision 1.4  2004/05/28 18:01:55  connolly
# in preparation for peeking-into-profile docs,
# use redland RDF API to merge transform
# outputs, rather than reroot.xsl hack
#
# Revision 1.3  2004/05/28 16:22:44  connolly
# - use mkstemp() to get rid of hard-coded temp filenames
# - code cleanup:
#   - full module docstring
#   - moved main closer to article citation
#

# [Web-proxy footer, not part of glean.py] Page converted by AltStyle (-> original)