/* -*- Mode: c; c-basic-offset: 2 -*-
*
* raptor_xslt_parse.c - Raptor GRDDL XSLT Parser implementation
*
* $Id$
*
* Copyright (C) 2005, David Beckett http://purl.org/net/dajobe/
* Institute for Learning and Research Technology http://www.ilrt.bristol.ac.uk/
* University of Bristol, UK http://www.bristol.ac.uk/
*
* This package is Free Software and part of Redland http://librdf.org/
*
* It is licensed under the following three licenses as alternatives:
* 1. GNU Lesser General Public License (LGPL) V2.1 or any newer version
* 2. GNU General Public License (GPL) V2 or any newer version
* 3. Apache License, V2.0 or any newer version
*
* You may not use this file except in compliance with at least one of
* the above three licenses.
*
* See LICENSE.html or LICENSE.txt at the top of this package for the
* complete terms and further detail along with the license texts for
* the licenses in COPYING.LIB, COPYING and LICENSE-2.0.txt respectively.
*
*/
/*
* W3C Gleaning Resource Descriptions from Dialects of Languages (GRDDL)
* http://www.w3.org/2004/01/rdxh/spec
*
* See also
* http://www.w3.org/2003/g/data-view
*
*
* Looks for indication of GRDDL meaning intended in the XML (XHTML)
* document source.
*
* 1. /html/head[@profile="http://www.w3.org/2003/g/data-view"]
* 2. /html/head/link[@rel="transformation"] (may be repeated)
*
* Indicating that the sheet in the value of @href of #2 transforms
* the document into RDF/XML and hence RDF triples.
*
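* For example:
*
*   <html xmlns="http://www.w3.org/1999/xhtml">
*     <head profile="http://www.w3.org/2003/g/data-view">
*       <link rel="transformation" href="URI-of-XSLT-sheet" />
*       ...
*     </head>
*     ...
*   </html>
*
* The <link rel="transformation"> element may be repeated.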
*/
#ifdef HAVE_CONFIG_H
#include
#endif
#ifdef WIN32
#include
#endif
#include
#include
#include
#include
#ifdef HAVE_ERRNO_H
#include
#endif
#ifdef HAVE_STDLIB_H
#include
#endif
/* Raptor includes */
#include "raptor.h"
#include "raptor_internal.h"
#include
#include
#include
#include
#include
/*
* libxslt API notes
*
* Inputs to an XSLT transformation process with libxslt are:
* 1. A set of (key:value) parameters.
* 2. An xsltStylesheetPtr for the XSLT sheet
* Which could be made from a file or from an xmlDoc; that xmlDoc
* could in turn be made from a file or a memory buffer.
* 3. An xmlDoc for the XML source
* Which could be made from a file or a memory buffer.
*
*/
/*
* XSLT parser object
*/
struct raptor_xslt_parser_context_s {
/* SAX handler structure; raptor_xslt_parse_chunk() passes it to
 * raptor_libxml_init_sax_error_handlers()
 */
xmlSAXHandler sax;
/* libxml2 push-parser context for the source XML document; the DOM
 * built from the pushed chunks is read from ctxt->myDoc
 */
xmlParserCtxtPtr ctxt;
/* XPath evaluation context created over the source document DOM */
xmlXPathContextPtr xpathCtx;
/* Result object of the most recently evaluated XPath expression */
xmlXPathObjectPtr xpathObj;
/* (RDF/XML) parser for dealing with the result of the XSLT
 * transformation; created in raptor_xslt_parse_init()
 */
raptor_parser* rdfxml;
};
typedef struct raptor_xslt_parser_context_s raptor_xslt_parser_context;
/*
 * raptor_xslt_parse_init - set up the GRDDL parser context
 * @rdf_parser: parser whose context is being initialised
 * @name: parser name (not used here)
 *
 * Creates the internal "rdfxml" parser that later consumes the output
 * of the XSLT transformations and stores it in the context.
 *
 * Return value: 0 on success, 1 if the RDF/XML parser could not be created
 */
static int
raptor_xslt_parse_init(raptor_parser* rdf_parser, const char *name)
{
  raptor_xslt_parser_context *context;
  raptor_parser *inner;

  context=(raptor_xslt_parser_context*)rdf_parser->context;

  inner=raptor_new_parser("rdfxml");
  context->rdfxml=inner;
  if(inner)
    return 0;

  raptor_parser_error(rdf_parser, "Failed to create RDF/XML parser");
  return 1;
}
/*
 * raptor_xslt_parse_terminate - release everything held by the context
 * @rdf_parser: parser whose context is being destroyed
 *
 * The XPath result object and XPath context refer to nodes of the
 * parsed document, and libxml2 looks at the nodes of a node-set result
 * while freeing it.  They are therefore released BEFORE the document
 * itself; freeing the document first would leave the XPath object
 * pointing at freed nodes.
 *
 * Each pointer is cleared after it is freed so that a repeated call
 * cannot free it a second time.
 */
static void
raptor_xslt_parse_terminate(raptor_parser *rdf_parser)
{
  raptor_xslt_parser_context *xslt_parser=(raptor_xslt_parser_context*)rdf_parser->context;

  if(xslt_parser->xpathObj) {
    xmlXPathFreeObject(xslt_parser->xpathObj);
    xslt_parser->xpathObj=NULL;
  }

  if(xslt_parser->xpathCtx) {
    xmlXPathFreeContext(xslt_parser->xpathCtx);
    xslt_parser->xpathCtx=NULL;
  }

  if(xslt_parser->ctxt) {
    /* xmlFreeParserCtxt() does not free the parsed document */
    if(xslt_parser->ctxt->myDoc) {
      xmlFreeDoc(xslt_parser->ctxt->myDoc);
      xslt_parser->ctxt->myDoc=NULL;
    }
    xmlFreeParserCtxt(xslt_parser->ctxt);
    xslt_parser->ctxt=NULL;
  }

  if(xslt_parser->rdfxml) {
    raptor_free_parser(xslt_parser->rdfxml);
    xslt_parser->rdfxml=NULL;
  }
}
/*
 * raptor_xslt_parse_start - prepare for parsing a new document
 * @rdf_parser: GRDDL parser object
 *
 * Resets the locator line number to 1 and copies the user state of
 * @rdf_parser into the internal RDF/XML parser.
 *
 * Return value: always 0
 */
static int
raptor_xslt_parse_start(raptor_parser *rdf_parser)
{
  raptor_xslt_parser_context* context;

  context=(raptor_xslt_parser_context*)rdf_parser->context;

  rdf_parser->locator.line=1;

  /* copy any user data to the internal parser */
  raptor_parser_copy_user_state(context->rdfxml, rdf_parser);

  return 0;
}
/* NULL-terminated list of XPath expressions, each selecting the
 * attribute nodes whose values are URIs of GRDDL transformations.
 * The "html" and "dataview" prefixes must be registered on the XPath
 * context before evaluation.
 */
static const xmlChar* xpathExpressions[4]={
/* XHTML document where the <head> carries the GRDDL profile and the
 * transformation is named by a <link rel="transformation"> inside
 * that <head>
 */
(const xmlChar*)"/html:html/html:head[contains(@profile,\"http://www.w3.org/2003/g/data-view\")]/html:link[@rel=\"transformation\"]/@href",
/* XHTML document where the <head> carries the GRDDL profile and the
 * transformation is named by an <a rel="transformation"> anywhere in
 * the document (the "/../.." steps climb from <head> back to the
 * document root before the descendant search)
 */
(const xmlChar*)"/html:html/html:head[contains(@profile,\"http://www.w3.org/2003/g/data-view\")]/../..//html:a[@rel=\"transformation\"]/@href",
/* XML document linking to transform via attribute dataview:transformation */
(const xmlChar*)"//@dataview:transformation",
NULL
};
/*
 * raptor_xslt_uri_parse_bytes - WWW write-bytes handler feeding an XML parser
 * @www: WWW object doing the retrieval
 * @userdata: pointer to an xmlParserCtxtPtr slot owned by the caller
 * @ptr: received bytes
 * @size: size of one item
 * @nmemb: number of items
 *
 * The first delivery creates a libxml2 push parser (seeded with those
 * bytes and using the retrieved URI as the document name) and stores it
 * in the caller's slot; later deliveries are pushed into that parser.
 * The retrieval is aborted if the parser cannot be created or a chunk
 * fails to parse.
 */
static void
raptor_xslt_uri_parse_bytes(raptor_www* www,
                            void *userdata,
                            const void *ptr, size_t size, size_t nmemb)
{
  xmlParserCtxtPtr* slot=(xmlParserCtxtPtr*)userdata;
  const char* bytes=(const char*)ptr;
  int nbytes=size*nmemb;
  int failed;

  if(*slot) {
    /* parser already exists: push the next chunk */
    failed=xmlParseChunk(*slot, bytes, nbytes, 0);
  } else {
    xmlParserCtxtPtr created;

    created = xmlCreatePushParserCtxt(NULL, NULL,
                                      bytes, nbytes,
                                      (const char*)raptor_uri_as_string(www->uri));
    if(created) {
      /* NOTE(review): entity substitution and DTD loading are enabled
       * for content retrieved from the network - confirm this is wanted
       */
      created->replaceEntities = 1;
      created->loadsubset = 1;
    }
    *slot=created;
    failed=(created == NULL);
  }

  if(failed)
    raptor_www_abort(www, "Parsing failed");
}
static int
raptor_xslt_parse_chunk(raptor_parser* rdf_parser,
const unsigned char *s, size_t len,
int is_end)
{
raptor_xslt_parser_context* xslt_parser=(raptor_xslt_parser_context*)rdf_parser->context;
int i;
int ret=0;
const unsigned char* uri_string;
raptor_uri* uri;
/* XML document DOM */
xmlDocPtr doc;
xmlNodeSetPtr nodes;
int expri;
if(!xslt_parser->ctxt) {
uri_string=raptor_uri_as_string(rdf_parser->base_uri);
/* first time, so init context with first read bytes */
xslt_parser->ctxt = xmlCreatePushParserCtxt(NULL, NULL,
(const char*)s, len,
(const char*)uri_string);
if(!xslt_parser->ctxt) {
raptor_parser_error(rdf_parser, "Failed to create XML parser");
return 1;
}
raptor_libxml_init_sax_error_handlers(&xslt_parser->sax);
raptor_libxml_init_generic_error_handlers(rdf_parser);
xslt_parser->ctxt->replaceEntities = 1;
xslt_parser->ctxt->loadsubset = 1;
if(is_end)
xmlParseChunk(xslt_parser->ctxt, (const char*)s, 0, is_end);
} else if((s && len) || is_end)
xmlParseChunk(xslt_parser->ctxt, (const char*)s, len, is_end);
if(!is_end)
return 0;
doc=xslt_parser->ctxt->myDoc;
if(!doc) {
raptor_parser_error(rdf_parser, "Failed to create XML DOM for document");
return 1;
}
/* Create the XPath evaluation context */
xslt_parser->xpathCtx=NULL;
xslt_parser->xpathCtx = xmlXPathNewContext(doc);
if(!xslt_parser->xpathCtx) {
raptor_parser_error(rdf_parser, "Failed to create XPath context for document");
return 1;
}
xmlXPathRegisterNs(xslt_parser->xpathCtx,
(const xmlChar*)"html",
(const xmlChar*)"http://www.w3.org/1999/xhtml");
xmlXPathRegisterNs(xslt_parser->xpathCtx,
(const xmlChar*)"dataview",
(const xmlChar*)"http://www.w3.org/2003/g/data-view#");
/* Try all XPaths */
for(expri=0; xpathExpressions[expri]; expri++) {
const xmlChar* xpathExpr=xpathExpressions[expri];
/* Evaluate xpath expression */
xslt_parser->xpathObj = xmlXPathEvalExpression(xpathExpr,
xslt_parser->xpathCtx);
if(!xslt_parser->xpathObj) {
raptor_parser_error(rdf_parser,
"Unable to evaluate XPath expression \"%s\"",
xpathExpr);
return 1;
}
nodes=xslt_parser->xpathObj->nodesetval;
if(!nodes || xmlXPathNodeSetIsEmpty(nodes)) {
RAPTOR_DEBUG3("No GRDDL found with XPath expression \"%s\" over '%s'\n",
xpathExpr, raptor_uri_as_string(rdf_parser->base_uri));
continue;
}
for(i=0; i < xmlXPathNodeSetGetLength(nodes); i++) {
xmlNodePtr node=nodes->nodeTab[i];
xsltStylesheetPtr sheet=NULL;
xmlDocPtr res=NULL;
xmlParserCtxtPtr xslt_ctxt;
raptor_www *www;
xmlChar *doc_txt=NULL;
int doc_txt_len=0;
xmlChar *base_uri_string;
raptor_uri* base_uri=NULL;
if(node->type != XML_ATTRIBUTE_NODE) {
raptor_parser_error(rdf_parser, "Got unexpected node type %d",
node->type);
continue;
}
/* returns base URI or NULL - must be freed with xmlFree() */
base_uri_string=xmlNodeGetBase(doc, node);
if(base_uri_string) {
base_uri=raptor_new_uri(base_uri_string);
xmlFree(base_uri_string);
RAPTOR_DEBUG2("Got XML base URI '%s'\n", raptor_uri_as_string(base_uri));
} else if(rdf_parser->base_uri)
base_uri=raptor_uri_copy(rdf_parser->base_uri);
uri_string=(const unsigned char*)node->children->content;
uri=raptor_new_uri_relative_to_base(base_uri, uri_string);
if(base_uri)
raptor_free_uri(base_uri);
RAPTOR_DEBUG2("Running GRDDL transform with URI '%s'\n",
raptor_uri_as_string(uri));
/* make an xsltStylesheetPtr via the raptor_xslt_uri_parse_bytes
* callback as bytes are returned
*/
xslt_ctxt=NULL;
www=raptor_www_new();
raptor_www_set_write_bytes_handler(www,
raptor_xslt_uri_parse_bytes,
&xslt_ctxt);
if(raptor_www_fetch(www, uri)) {
ret=1;
goto cleanup_xslt;
}
xmlParseChunk(xslt_ctxt, NULL, 0, 1);
sheet = xsltParseStylesheetDoc(xslt_ctxt->myDoc);
if(!sheet) {
raptor_parser_error(rdf_parser, "Failed to parse stylesheet in '%s'",
raptor_uri_as_string(uri));
ret=1;
goto cleanup_xslt;
}
res = xsltApplyStylesheet(sheet, doc, NULL); /* no params */
if(!res) {
raptor_parser_error(rdf_parser, "Failed to apply stylesheet in '%s'",
raptor_uri_as_string(uri));
ret=1;
goto cleanup_xslt;
}
/* write the resulting XML to a string */
xsltSaveResultToString(&doc_txt, &doc_txt_len, res, sheet);
if(!doc_txt || !doc_txt_len) {
/* empty document - continue? FIXME */
raptor_parser_warning(rdf_parser,
"Stylesheet returned an empty document");
} else {
RAPTOR_DEBUG2("XSLT gave %d bytes XML result\n", doc_txt_len);
/* generate the triples */
raptor_start_parse(xslt_parser->rdfxml, rdf_parser->base_uri);
raptor_parse_chunk(xslt_parser->rdfxml, doc_txt, doc_txt_len, 1);
}
cleanup_xslt:
if(doc_txt)
xmlFree(doc_txt);
if(res)
xmlFreeDoc(res);
if(sheet)
xsltFreeStylesheet(sheet);
if(xslt_ctxt)
xmlFreeParserCtxt(xslt_ctxt);
if(uri)
raptor_free_uri(uri);
if(www)
raptor_www_free(www);
} /* end node loop */
if(rdf_parser->failed || ret != 0)
break;
} /* end XPath expression loop */
if(rdf_parser->failed)
return 1;
return (ret != 0);
}
/*
 * raptor_xslt_parse_recognise_syntax - score how likely content is GRDDL
 * @factory: parser factory (not used)
 * @buffer: content bytes (not used)
 * @len: length of @buffer (not used)
 * @identifier: content identifier such as a URI, or NULL
 * @suffix: file name suffix without the dot, or NULL
 * @mime_type: MIME type (not used)
 *
 * A suffix of exactly "xhtml" scores 7 and exactly "html" scores 2;
 * an identifier containing "xhtml" anywhere adds 5.
 *
 * Return value: the accumulated score, 0 if nothing matched
 */
static int
raptor_xslt_parse_recognise_syntax(raptor_parser_factory* factory,
                                   const unsigned char *buffer, size_t len,
                                   const unsigned char *identifier,
                                   const unsigned char *suffix,
                                   const char *mime_type)
{
  int suffix_score=0;
  int identifier_score=0;

  if(suffix) {
    const char *ext=(const char*)suffix;

    if(strcmp(ext, "html") == 0)
      suffix_score=2;
    else if(strcmp(ext, "xhtml") == 0)
      suffix_score=7;
  }

  if(identifier && strstr((const char*)identifier, "xhtml") != NULL)
    identifier_score=5;

  return suffix_score + identifier_score;
}
/*
 * raptor_xslt_parser_register_factory - fill in the GRDDL parser factory
 * @factory: factory structure to populate
 *
 * Records the size of the per-parser context and installs the
 * callbacks implemented in this file.
 */
static void
raptor_xslt_parser_register_factory(raptor_parser_factory *factory)
{
  /* syntax detection */
  factory->recognise_syntax = raptor_xslt_parse_recognise_syntax;

  /* parsing callbacks */
  factory->chunk            = raptor_xslt_parse_chunk;
  factory->start            = raptor_xslt_parse_start;

  /* construction and destruction of the context */
  factory->terminate        = raptor_xslt_parse_terminate;
  factory->init             = raptor_xslt_parse_init;
  factory->context_length   = sizeof(raptor_xslt_parser_context);
}
/*
 * raptor_init_parser_grddl - register the GRDDL parser
 *
 * Registers a parser named "grddl" whose factory is filled in by
 * raptor_xslt_parser_register_factory().  The third and fourth
 * arguments of the registration call are passed as NULL.
 */
void
raptor_init_parser_grddl(void)
{
  raptor_parser_register_factory(
    "grddl",
    "GRDDL over XHTML/XML using XSLT",
    NULL,
    NULL,
    &raptor_xslt_parser_register_factory
  );
}