diff options
author | Nick Wellnhofer <wellnhofer@aevum.de> | 2023-03-21 19:07:12 +0100 |
---|---|---|
committer | Nick Wellnhofer <wellnhofer@aevum.de> | 2023-03-21 21:35:15 +0100 |
commit | 98840d40da6d6f4a6aa62b9a624d7ef13636de38 (patch) | |
tree | f2fe7e9d50bb626bca763e1ba81d0c5f496cf036 /encoding.c | |
parent | 3eb9f5ca4e6b0933ac1dc7fbcce38669ac002b7f (diff) | |
download | libxml2-98840d40da6d6f4a6aa62b9a624d7ef13636de38.tar.gz |
parser: Rework EBCDIC code page detection
To detect EBCDIC code pages, we used to switch the encoding twice and
had to be very careful not to decode data after the XML declaration
before the second switch. This relied on a hard-coded expected size of
the XML declaration and was complicated and unreliable.
Now we convert the first 200 bytes to EBCDIC-US and parse the encoding
declaration manually.
Diffstat (limited to 'encoding.c')
-rw-r--r-- | encoding.c | 183 |
1 file changed, 3 insertions, 180 deletions
@@ -2037,7 +2037,7 @@ xmlUconvWrapper(uconv_t *cd, int toUnicode, unsigned char *out, int *outlen, * as the return value is 0, else unpredictable. * The value of @outlen after return is the number of octets produced. */ -static int +int xmlEncInputChunk(xmlCharEncodingHandler *handler, unsigned char *out, int *outlen, const unsigned char *in, int *inlen, int flush) { int ret; @@ -2123,189 +2123,12 @@ xmlEncOutputChunk(xmlCharEncodingHandler *handler, unsigned char *out, * @out: an xmlBuffer for the output. * @in: an xmlBuffer for the input * - * Front-end for the encoding handler input function, but handle only - * the very first line, i.e. limit itself to 45 chars. - * - * Returns the number of byte written if success, or - * -1 general error - * -2 if the transcoding fails (for *in is not valid utf8 string or - * the result of transformation can't fit into the encoding we want), or + * DEPERECATED: Don't use. */ int xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out, xmlBufferPtr in) { - int ret; - int written; - int toconv; - - if (handler == NULL) return(-1); - if (out == NULL) return(-1); - if (in == NULL) return(-1); - - /* calculate space available */ - written = out->size - out->use - 1; /* count '\0' */ - toconv = in->use; - /* - * echo '<?xml version="1.0" encoding="UCS4"?>' | wc -c => 38 - * 45 chars should be sufficient to reach the end of the encoding - * declaration without going too far inside the document content. 
- * on UTF-16 this means 90bytes, on UCS4 this means 180 - * The actual value depending on guessed encoding is passed as @len - * if provided - */ - if (toconv > 180) - toconv = 180; - if (toconv * 2 >= written) { - xmlBufferGrow(out, toconv * 2); - written = out->size - out->use - 1; - } - - ret = xmlEncInputChunk(handler, &out->content[out->use], &written, - in->content, &toconv, 0); - xmlBufferShrink(in, toconv); - out->use += written; - out->content[out->use] = 0; - if (ret == -1) ret = -3; - -#ifdef DEBUG_ENCODING - switch (ret) { - case 0: - xmlGenericError(xmlGenericErrorContext, - "converted %d bytes to %d bytes of input\n", - toconv, written); - break; - case -1: - xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n", - toconv, written, in->use); - break; - case -2: - xmlGenericError(xmlGenericErrorContext, - "input conversion failed due to input error\n"); - break; - case -3: - xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of input, %d left\n", - toconv, written, in->use); - break; - default: - xmlGenericError(xmlGenericErrorContext,"Unknown input conversion failed %d\n", ret); - } -#endif /* DEBUG_ENCODING */ - /* - * Ignore when input buffer is not on a boundary - */ - if (ret == -3) ret = 0; - if (ret == -1) ret = 0; - return(written ? written : ret); -} - -/** - * xmlCharEncFirstLineInput: - * @input: a parser input buffer - * @len: number of bytes to convert for the first line, or -1 - * - * Front-end for the encoding handler input function, but handle only - * the very first line. Point is that this is based on autodetection - * of the encoding and once that first line is converted we may find - * out that a different decoder is needed to process the input. 
- * - * Returns the number of byte written if success, or - * -1 general error - * -2 if the transcoding fails (for *in is not valid utf8 string or - * the result of transformation can't fit into the encoding we want), or - */ -int -xmlCharEncFirstLineInput(xmlParserInputBufferPtr input, int len) -{ - int ret; - size_t written; - size_t toconv; - int c_in; - int c_out; - xmlBufPtr in; - xmlBufPtr out; - - if ((input == NULL) || (input->encoder == NULL) || - (input->buffer == NULL) || (input->raw == NULL)) - return (-1); - out = input->buffer; - in = input->raw; - - toconv = xmlBufUse(in); - if (toconv == 0) - return (0); - written = xmlBufAvail(out); - /* - * echo '<?xml version="1.0" encoding="UCS4"?>' | wc -c => 38 - * 45 chars should be sufficient to reach the end of the encoding - * declaration without going too far inside the document content. - * on UTF-16 this means 90bytes, on UCS4 this means 180 - * The actual value depending on guessed encoding is passed as @len - * if provided - */ - if (len >= 0) { - if (toconv > (unsigned int) len) - toconv = len; - } else { - if (toconv > 180) - toconv = 180; - } - if (toconv * 2 >= written) { - xmlBufGrow(out, toconv * 2); - written = xmlBufAvail(out); - } - if (written > 360) - written = 360; - - c_in = toconv; - c_out = written; - ret = xmlEncInputChunk(input->encoder, xmlBufEnd(out), &c_out, - xmlBufContent(in), &c_in, 0); - xmlBufShrink(in, c_in); - xmlBufAddLen(out, c_out); - if (ret == -1) - ret = -3; - - switch (ret) { - case 0: -#ifdef DEBUG_ENCODING - xmlGenericError(xmlGenericErrorContext, - "converted %d bytes to %d bytes of input\n", - c_in, c_out); -#endif - break; - case -1: -#ifdef DEBUG_ENCODING - xmlGenericError(xmlGenericErrorContext, - "converted %d bytes to %d bytes of input, %d left\n", - c_in, c_out, (int)xmlBufUse(in)); -#endif - break; - case -3: -#ifdef DEBUG_ENCODING - xmlGenericError(xmlGenericErrorContext, - "converted %d bytes to %d bytes of input, %d left\n", - c_in, c_out, 
(int)xmlBufUse(in)); -#endif - break; - case -2: { - char buf[50]; - const xmlChar *content = xmlBufContent(in); - - snprintf(&buf[0], 49, "0x%02X 0x%02X 0x%02X 0x%02X", - content[0], content[1], - content[2], content[3]); - buf[49] = 0; - xmlEncodingErr(XML_I18N_CONV_FAILED, - "input conversion failed due to input error, bytes %s\n", - buf); - } - } - /* - * Ignore when input buffer is not on a boundary - */ - if (ret == -3) ret = 0; - if (ret == -1) ret = 0; - return(c_out ? c_out : ret); + return(xmlCharEncInFunc(handler, out, in)); } /** |