/* -*- Mode: c; c-basic-offset: 2 -*- * * raptor_ntriples.c - Raptor N-Triples parsing utilities * * Copyright (C) 2013, David Beckett http://www.dajobe.org/ * * This package is Free Software and part of Redland http://librdf.org/ * * It is licensed under the following three licenses as alternatives: * 1. GNU Lesser General Public License (LGPL) V2.1 or any newer version * 2. GNU General Public License (GPL) V2 or any newer version * 3. Apache License, V2.0 or any newer version * * You may not use this file except in compliance with at least one of * the above three licenses. * * See LICENSE.html or LICENSE.txt at the top of this package for the * complete terms and further detail along with the license texts for * the licenses in COPYING.LIB, COPYING and LICENSE-2.0.txt respectively. * */ #ifdef HAVE_CONFIG_H #include #endif #include #include #include #include #ifdef HAVE_ERRNO_H #include #endif #ifdef HAVE_STDLIB_H #include #endif /* Raptor includes */ #include "raptor2.h" #include "raptor_internal.h" /* These are for 7-bit ASCII and not locale-specific */ #define IS_ASCII_ALPHA(c) (((c) > 0x40 && (c) < 0x5B) || ((c) > 0x60 && (c) < 0x7B)) #define IS_ASCII_UPPER(c) ((c) > 0x40 && (c) < 0x5B) #define IS_ASCII_DIGIT(c) ((c) > 0x2F && (c) < 0x3A) #define IS_ASCII_PRINT(c) ((c) > 0x1F && (c) < 0x7F) #define TO_ASCII_LOWER(c) ((c)+0x20) typedef enum { RAPTOR_TERM_CLASS_URI, /* ends on > */ RAPTOR_TERM_CLASS_BNODEID, /* ends on first non [A-Za-z][A-Za-z0-9]* */ RAPTOR_TERM_CLASS_STRING, /* ends on non-escaped " */ RAPTOR_TERM_CLASS_LANGUAGE /* ends on first non [a-z0-9]+ ('-' [a-z0-9]+ )? */ } raptor_ntriples_term_class; static int raptor_ntriples_term_valid(unsigned char c, int position, raptor_ntriples_term_class term_class) { int result = 0; switch(term_class) { case RAPTOR_TERM_CLASS_URI: /* ends on > */ result = (c != '>'); break; case RAPTOR_TERM_CLASS_BNODEID: /* ends on first non [A-Za-z0-9_:][-.A-Za-z0-9]* */ result = IS_ASCII_ALPHA(c) || IS_ASCII_DIGIT(c) || c == '_' || c == ':'; if(position) /* FIXME * This isn't correct; '.' is allowed in positions 1..N-1 but * this calling convention of character-by-character cannot * check this. */ result = (result || c == '-' || c == '.'); break; case RAPTOR_TERM_CLASS_STRING: /* ends on " */ result = (c != '"'); break; case RAPTOR_TERM_CLASS_LANGUAGE: /* ends on first non [a-zA-Z]+ ('-' [a-zA-Z0-9]+ )? * Accept _ as synonym / typo for -. */ result = IS_ASCII_ALPHA(c); if(position) result = (result || IS_ASCII_DIGIT(c) || c == '-' || c == '_'); break; default: RAPTOR_DEBUG2("Unknown N-Triples term class %u", term_class); } return result; } /* * raptor_ntriples_parse_term_internal: * @world: raptor world * @locator: locator object (in/out) (or NULL) * @start: pointer to starting character of string (in) * @dest: destination of string (in) * @lenp: pointer to length of string (in/out) * @dest_lenp: pointer to length of destination string (out) * @end_char: string ending character * @class: string class * * INTERNAL - Parse an N-Triples term with escapes. * * Relies that @dest is long enough; it need only be as large as the * input string @start since when UTF-8 encoding, the escapes are * removed and the result is always less than or equal to length of * input. * * N-Triples strings / URIs are written in ASCII at present; * characters outside the printable ASCII range are discarded with a * warning. See the grammar for full details of the allowed ranges. * * UTF-8 and the \u and \U esapes are both allowed. * * URIs may not have \t \b \n \r \f or raw ' ' or \u0020 or \u003C or \u003E * * Return value: Non 0 on failure **/ static int raptor_ntriples_parse_term_internal(raptor_world* world, raptor_locator* locator, const unsigned char **start, unsigned char *dest, size_t *lenp, size_t *dest_lenp, char end_char, raptor_ntriples_term_class term_class) { const unsigned char *p = *start; unsigned char c = '\0'; size_t ulen = 0; unsigned long unichar = 0; unsigned int position = 0; int end_char_seen = 0; /* find end of string, fixing backslashed characters on the way */ while(*lenp > 0) { int unichar_width; c = *p; p++; (*lenp)--; if(locator) { locator->column++; locator->byte++; } if(term_class == RAPTOR_TERM_CLASS_URI && c == ' ') { raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "URI error - illegal character %d (0x%02X) found.", c, RAPTOR_GOOD_CAST(unsigned int, c)); return 1; } if(c > 0x7f) { /* just copy the UTF-8 bytes through */ int unichar_len; unichar_len = raptor_unicode_utf8_string_get_char(p - 1, 1 + *lenp, NULL); if(unichar_len < 0 || RAPTOR_GOOD_CAST(size_t, unichar_len) > *lenp) { raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "UTF-8 encoding error at character %d (0x%02X) found.", c, RAPTOR_GOOD_CAST(unsigned int, c)); /* UTF-8 encoding had an error or ended in the middle of a string */ return 1; } memmove(dest, p-1, unichar_len); dest += unichar_len; unichar_len--; /* p, *lenp were moved on by 1 earlier */ p += unichar_len; (*lenp) -= unichar_len; if(locator) { locator->column += unichar_len; locator->byte += unichar_len; } continue; } if(c != '\\') { /* finish at non-backslashed end_char */ if(end_char && c == end_char) { end_char_seen = 1; break; } if(!raptor_ntriples_term_valid(c, position, term_class)) { if(end_char) { /* end char was expected, so finding an invalid thing is an error */ raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Missing terminating '%c' (found '%c')", end_char, c); return 0; } else { /* it's the end - so rewind 1 to save next char */ p--; (*lenp)++; if(locator) { locator->column--; locator->byte--; } if(term_class == RAPTOR_TERM_CLASS_BNODEID && dest[-1] == '.') { /* If bnode id ended on '.' move back one */ dest--; p--; (*lenp)++; if(locator) { locator->column--; locator->byte--; } } break; } } /* otherwise store and move on */ *dest++ = c; position++; continue; } if(!*lenp) { raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "\\ at end of input."); return 0; } c = *p; p++; (*lenp)--; if(locator) { locator->column++; locator->byte++; } switch(c) { case '"': case '\\': *dest++ = c; break; case 'b': case 'f': case 'n': case 'r': case 't': if(term_class == RAPTOR_TERM_CLASS_URI) { raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "URI error - illegal URI escape '\\%c'.", c); return 1; } if(c == 'b') *dest++ = '\b'; else if(c == 'f') *dest++ = '\f'; else if(c == 'n') *dest++ = '\n'; else if(c == 'r') *dest++ = '\r'; else /* 't' */ *dest++ = '\t'; break; case '<': case '>': case '{': case '}': case '|': case '^': case '`': /* Turtle 2013 allows these in URIs (as well as \" and \\) */ *dest++ = c; break; case 'u': case 'U': ulen = (c == 'u') ? 4 : 8; if(*lenp < ulen) { raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "%c over end of input.", c); return 0; } if(1) { unsigned int ii; int n = 0; for(ii = 0; ii < ulen; ii++) { char cc = p[ii]; if(!isxdigit(RAPTOR_GOOD_CAST(char, cc))) { raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "N-Triples string error - illegal hex digit %c in Unicode escape '%c%s...'", cc, c, p); n = 1; break; } } if(n) break; n = sscanf((const char*)p, ((ulen == 4) ? "%04lx" : "%08lx"), &unichar); if(n != 1) { raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Illegal Uncode escape '%c%s...'", c, p); break; } } p += ulen; (*lenp) -= ulen; if(locator) { locator->column += RAPTOR_GOOD_CAST(int, ulen); locator->byte += RAPTOR_GOOD_CAST(int, ulen); } if(term_class == RAPTOR_TERM_CLASS_URI && (unichar == 0x0020 || unichar == 0x003C || unichar == 0x003E)) { raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "URI error - illegal Unicode escape \\u%04lX in URI.", unichar); break; } if(unichar > raptor_unicode_max_codepoint) { raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Illegal Unicode character with code point #x%lX (max #x%lX).", unichar, raptor_unicode_max_codepoint); break; } unichar_width = raptor_unicode_utf8_string_put_char(unichar, dest, 4); if(unichar_width < 0) { raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Illegal Unicode character with code point #x%lX.", unichar); break; } /* The destination length is set here to 4 since we know that in * all cases, the UTF-8 encoded output sequence is always shorter * than the input sequence, and the buffer is edited in place. * \uXXXX: 6 bytes input - UTF-8 max 3 bytes output * \uXXXXXXXX: 10 bytes input - UTF-8 max 4 bytes output */ dest += (int)unichar_width; break; default: raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Illegal string escape \\%c in \"%s\"", c, (char*)start); return 0; } position++; } /* end while */ if(end_char && !end_char_seen) { raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Missing terminating '%c' before end of input.", end_char); return 1; } /* terminate dest, can be shorter than source */ *dest = '\0'; if(dest_lenp) *dest_lenp = p - *start; *start = p; return 0; } static int raptor_parse_turtle_term_internal(raptor_world* world, raptor_locator* locator, const unsigned char **start, unsigned char *dest, size_t *len_p, size_t *dest_lenp, raptor_uri** datatype_uri_p) { const unsigned char *p = *start; unsigned int position = 0; /* 0 = xsd:integer; 1= xsd:decimal; 2= xsd:double */ short dtype = 0; int after_e = 0; while(*len_p > 0) { unsigned char c = *p; if(after_e) { if(!((c >= '0' && c <'9') || c == '+' || c == '-')) break; after_e = 0; } else if((position > 0 && (c == '+' || c == '-')) || !((c >= '0' && c <'9') || c == '.' || c == 'e' || c == 'E')) break; if(c == '.') dtype = 1; else if(c == 'e' || c == 'E') { dtype = 2; after_e = 1; } p++; (*len_p)--; if(locator) { locator->column++; locator->byte++; } *dest++ = c; position++; } *dest = '\0'; if(dest_lenp) *dest_lenp = p - *start; *start = p; if(dtype == 0) *datatype_uri_p = raptor_uri_copy(world->xsd_integer_uri); else if (dtype == 1) *datatype_uri_p = raptor_uri_copy(world->xsd_decimal_uri); else *datatype_uri_p = raptor_uri_copy(world->xsd_double_uri); return 0; } /* * raptor_ntriples_parse_term: * @world: raptor world * @locator: raptor locator (in/out) (or NULL) * @string: string input (in) * @len_p: pointer to length of @string (in/out) * @term_p: pointer to store term (out) * @allow_turtle: non-0 to allow Turtle forms such as integers, boolean * * INTERNAL - Parse an N-Triples string into a #raptor_term * * The @len_p destination and @locator fields are modified as parsing * proceeds to be used in error messages. The final value is written * into the #raptor_term pointed at by @term_p * * Return value: number of bytes processed or 0 on failure */ size_t raptor_ntriples_parse_term(raptor_world* world, raptor_locator* locator, unsigned char *string, size_t *len_p, raptor_term** term_p, int allow_turtle) { unsigned char *p = string; unsigned char *dest; size_t term_length = 0; switch(*p) { case '<': dest = p; p++; (*len_p)--; if(locator) { locator->column++; locator->byte++; } if(raptor_ntriples_parse_term_internal(world, locator, (const unsigned char**)&p, dest, len_p, &term_length, '>', RAPTOR_TERM_CLASS_URI)) { goto fail; } if(1) { raptor_uri *uri; /* Check for bad ordinal predicate */ if(!strncmp((const char*)dest, "http://www.w3.org/1999/02/22-rdf-syntax-ns#_", 44)) { int ordinal = raptor_check_ordinal(dest + 44); if(ordinal <= 0) raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Illegal ordinal value %d in property '%s'.", ordinal, dest); } if(raptor_uri_uri_string_is_absolute(dest) <= 0) { raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "URI '%s' is not absolute.", dest); goto fail; } uri = raptor_new_uri(world, dest); if(!uri) { raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Could not create URI for '%s'", (const char *)dest); goto fail; } *term_p = raptor_new_term_from_uri(world, uri); raptor_free_uri(uri); } break; case '-': case '+': case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': if(allow_turtle) { raptor_uri* datatype_uri = NULL; dest = p; if(raptor_parse_turtle_term_internal(world, locator, (const unsigned char**)&p, dest, len_p, &term_length, &datatype_uri)) { goto fail; } *term_p = raptor_new_term_from_literal(world, dest, datatype_uri, NULL /* language */); } else goto fail; break; case '"': dest = p; p++; (*len_p)--; if(locator) { locator->column++; locator->byte++; } if(raptor_ntriples_parse_term_internal(world, locator, (const unsigned char**)&p, dest, len_p, &term_length, '"', RAPTOR_TERM_CLASS_STRING)) { goto fail; } if(1) { unsigned char *object_literal_language = NULL; unsigned char *object_literal_datatype = NULL; raptor_uri* datatype_uri = NULL; if(*len_p && *p == '@') { unsigned char *q; size_t lang_len; object_literal_language = p; /* Skip - */ p++; (*len_p)--; if(locator) { locator->column++; locator->byte++; } if(!*len_p) { raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Missing language after \"string\"-"); goto fail; } if(raptor_ntriples_parse_term_internal(world, locator, (const unsigned char**)&p, object_literal_language, len_p, &lang_len, '\0', RAPTOR_TERM_CLASS_LANGUAGE)) { goto fail; } if(!lang_len) { raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Invalid language tag at @%s", p); goto fail; } /* Normalize language to lowercase * http://www.w3.org/TR/rdf-concepts/#dfn-language-identifier * Also convert _ to - as synonym / typo. */ for(q = object_literal_language; *q; q++) { if(IS_ASCII_UPPER(*q)) *q = RAPTOR_GOOD_CAST(unsigned char, TO_ASCII_LOWER(*q)); if(*q == '_') *q = '-'; } } if(*len_p > 1 && *p == '^' && p[1] == '^') { object_literal_datatype = p; /* Skip ^^ */ p += 2; *len_p -= 2; if(locator) { locator->column += 2; locator->byte += 2; } if(!*len_p || (*len_p && *p != '<')) { raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Missing datatype URI-ref in\"string\"^^ after ^^"); goto fail; } p++; (*len_p)--; if(locator) { locator->column++; locator->byte++; } if(raptor_ntriples_parse_term_internal(world, locator, (const unsigned char**)&p, object_literal_datatype, len_p, NULL, '>', RAPTOR_TERM_CLASS_URI)) { goto fail; } if(raptor_uri_uri_string_is_absolute(object_literal_datatype) <= 0) { raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Datatype URI '%s' is not absolute.", object_literal_datatype); goto fail; } } if(object_literal_datatype && object_literal_language) { raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Typed literal used with a language - ignoring the language"); object_literal_language = NULL; } if(object_literal_datatype) { datatype_uri = raptor_new_uri(world, object_literal_datatype); if(!datatype_uri) { raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Could not create literal datatype uri '%s'", object_literal_datatype); goto fail; } object_literal_language = NULL; } *term_p = raptor_new_term_from_literal(world, dest, datatype_uri, object_literal_language); if(datatype_uri) raptor_free_uri(datatype_uri); } break; case '_': /* store where _ was */ dest = p; p++; (*len_p)--; if(locator) { locator->column++; locator->byte++; } if(!*len_p || (*len_p > 0 && *p != ':')) { raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Illegal bNodeID - _ not followed by :"); goto fail; } /* Found ':' - move on */ p++; (*len_p)--; if(locator) { locator->column++; locator->byte++; } if(raptor_ntriples_parse_term_internal(world, locator, (const unsigned char**)&p, dest, len_p, &term_length, '\0', RAPTOR_TERM_CLASS_BNODEID)) { goto fail; } if(!term_length) { raptor_log_error_formatted(world, RAPTOR_LOG_LEVEL_ERROR, locator, "Bad or missing bNodeID after _:"); goto fail; } *term_p = raptor_new_term_from_blank(world, dest); break; default: RAPTOR_DEBUG2("Unknown term type '%c'", *p); goto fail; } fail: return p - string; }