New parser (with exactly the same API as gmime-parser.c) that is meant to

2001-09-29 Jeffrey Stedfast <fejj@ximian.com> * pan-mime-parser.c: New parser (with exactly the same API as gmime-parser.c) that is meant to parse MIME objects without needing them to be memory mapped. Quite a bit slower on average, but is vastly sped up by using a GMimeStreamBuffer in BLOCK_READ mode.
author: Jeffrey Stedfast <fejj@ximian.com> 2001-09-29 21:35:32 +0000
committer: Jeffrey Stedfast <fejj@src.gnome.org> 2001-09-29 21:35:32 +0000
commit: b775ca73f16a217f152e32523db855a16e70d0f7 (patch)
tree: 4c480bd86bd601394847e9e24404ea89c6243053
parent: 47a0415345eb7f10bb4914870613de0d8ef96331 (diff)
download: gmime-b775ca73f16a217f152e32523db855a16e70d0f7.tar.gz
2 files changed, 625 insertions, 0 deletions
diff --git a/ChangeLog b/ChangeLog
index b3b39cfb..1cdebe08 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,11 @@
 2001-09-29  Jeffrey Stedfast  <fejj@ximian.com>
 
+	* pan-mime-parser.c: New parser (with exactly the same API as
+	gmime-parser.c) that is meant to parse MIME objects without
+	needing them to be memory mapped. Quite a bit slower on average,
+	but is vastly sped up by using a GMimeStreamBuffer in BLOCK_READ
+	mode.
+
 	* gmime-stream-buffer.c (stream_write): Incremement the stream
 	position by the number of bytes we wrote.
 	(stream_tell): Return stream->position.
diff --git a/pan-mime-parser.c b/pan-mime-parser.c
new file mode 100644
index 00000000..d8be10bc
--- /dev/null
+++ b/pan-mime-parser.c
@@ -0,0 +1,619 @@
+/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
+/*
+ *  Authors: Jeffrey Stedfast <fejj@helixcode.com>
+ *
+ *  Copyright 2000, 2001 Helix Code, Inc. (www.helixcode.com)
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Street #330, Boston, MA 02111-1307, USA.
+ *
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include "gmime-parser.h"
+#include "gmime-utils.h"
+#include "gmime-header.h"
+#include "gmime-stream-mem.h"
+#include <string.h>
+#include <ctype.h>
+
+#define d(x) x
+
+#ifndef HAVE_ISBLANK
+#define isblank(c) ((c) == ' ' || (c) == '\t')
+#endif /* HAVE_ISBLANK */
+
+static void
+header_unfold (char *header)
+{
+	/* strip all \n's and replace tabs with spaces - this should
+           undo any header folding */
+	char *src, *dst;
+	
+	for (src = dst = header; *src; src++) {
+		if (*src != '\n')
+			*dst++ = *src != '\t' ? *src : ' ';
+	}
+	*dst = '\0';
+}
+
+enum {
+	CONTENT_TYPE = 0,
+	CONTENT_TRANSFER_ENCODING,
+	CONTENT_DISPOSITION,
+	CONTENT_DESCRIPTION,
+	CONTENT_LOCATION,
+	CONTENT_MD5,
+	CONTENT_ID
+};
+
+static char *content_headers[] = {
+	"Content-Type:",
+	"Content-Transfer-Encoding:",
+	"Content-Disposition:",
+	"Content-Description:",
+	"Content-Location:",
+	"Content-Md5:",
+	"Content-Id:",
+	NULL
+};
+
+static int
+content_header (const char *field)
+{
+	int i;
+	
+	for (i = 0; content_headers[i]; i++)
+		if (!g_strncasecmp (field, content_headers[i], strlen (content_headers[i])))
+			return i;
+	
+	return -1;
+}
+
+enum {
+	HEADER_FROM = 0,
+	HEADER_REPLY_TO,
+	HEADER_TO,
+	HEADER_CC,
+	HEADER_BCC,
+	HEADER_SUBJECT,
+	HEADER_DATE,
+	HEADER_MESSAGE_ID,
+	HEADER_UNKNOWN
+};
+
+static char *fields[] = {
+	"From:",
+	"Reply-To:",
+	"To:",
+	"Cc:",
+	"Bcc:",
+	"Subject:",
+	"Date:",
+	"Message-Id:",
+	NULL
+};
+
+static gboolean
+special_header (const char *field)
+{
+	return (!g_strcasecmp (field, "MIME-Version:") || content_header (field) != -1);
+}
+
+static const char *
+g_strstrbound (const char *haystack, const char *needle, const char *end)
+{
+	gboolean matches = FALSE;
+	const char *ptr;
+	guint nlen;
+	
+	nlen = strlen (needle);
+	ptr = haystack;
+	
+	while (ptr + nlen <= end) {
+		if (!strncmp (ptr, needle, nlen)) {
+			matches = TRUE;
+			break;
+		}
+		ptr++;
+	}
+	
+	if (matches)
+		return ptr;
+	else
+		return NULL;
+}
+
+static void
+parser_readln (GMimeStream *stream, GByteArray *buffer)
+{
+	char linebuf[1024];
+	ssize_t len;
+	
+	while (!g_mime_stream_eos (stream)) {
+		len = g_mime_stream_buffer_gets (stream, linebuf, sizeof (linebuf));
+		if (len <= 0)
+			break;
+		
+		g_byte_array_append (buffer, linebuf, len);
+		if (linebuf[len - 1] == '\n')
+			break;
+	}
+}
+
+static void
+parser_read_headers (GMimeStream *stream, GByteArray *buffer)
+{
+	guint offset;
+	
+	do {
+		offset = buffer->len;
+		parser_readln (stream, buffer);
+	} while (!g_mime_stream_eos (stream) && *(buffer->data + offset) != '\n');
+	
+	/* strip off the empty line */
+	g_byte_array_set_size (buffer, MAX (buffer->len - 1, 0));
+}
+
+enum {
+	FOUND_BOUNDARY,
+	FOUND_END_BOUNDARY,
+	FOUND_EOS,
+};
+
+static guint
+parser_read_until_boundary (GMimeStream *stream, GByteArray *buffer,
+			    const char *boundary, const char *end_boundary, int *found)
+{
+	guint offset, len, total = 0;
+	gboolean internal;
+	
+	*found = FOUND_EOS;
+	
+	if (!buffer) {
+		buffer = g_byte_array_new ();
+		internal = TRUE;
+	} else
+		internal = FALSE;
+	
+	do {
+		offset = buffer->len;
+		parser_readln (stream, buffer);
+		len = buffer->len - offset;
+		
+		if (boundary && len == strlen (boundary) &&
+		    !memcmp (buffer->data + offset, boundary, len)) {
+			g_byte_array_set_size (buffer, offset);
+			*found = FOUND_BOUNDARY;
+			break;
+		}
+		
+		if (end_boundary && len == strlen (end_boundary) &&
+		    !memcmp (buffer->data + offset, end_boundary, len)) {
+			g_byte_array_set_size (buffer, offset);
+			*found = FOUND_END_BOUNDARY;
+			break;
+		}
+		
+		total += len;
+		
+		if (internal)
+			g_byte_array_set_size (buffer, 0);
+		
+	} while (!g_mime_stream_eos (stream));
+	
+	if (internal)
+		g_byte_array_free (buffer, TRUE);
+	
+	return total;
+}
+
+
+/**
+ * parse_content_heaaders:
+ * @headers: content header string
+ * @inlen: length of the header block.
+ * @mime_part: mime part to populate with the information we get from the Content-* headers.
+ * @is_multipart: set to TRUE if the part is a multipart, FALSE otherwise (to be set by function)
+ * @boundary: multipart boundary string (to be set by function)
+ * @end_boundary: multipart end boundary string (to be set by function)
+ *
+ * Parse a header block for content information.
+ */
+static void
+construct_content_headers (GMimePart *mime_part, GByteArray *headers, gboolean *is_multipart,
+			   char **boundary, char **end_boundary)
+{
+	const char *inptr = headers->data;
+	const char *inend = inptr + headers->len;
+	
+	*boundary = NULL;
+	*end_boundary = NULL;
+	
+	while (inptr < inend) {
+		const gint type = content_header (inptr);
+		const char *hvalptr;
+		const char *hvalend;
+		char *value;
+		
+		if (type == -1) {
+			if (!(hvalptr = memchr (inptr, ':', inend - inptr)))
+				break;
+			hvalptr++;
+		} else {
+			hvalptr = inptr + strlen (content_headers[type]);
+		}
+		
+		for (hvalend = hvalptr; hvalend < inend; hvalend++)
+			if (*hvalend == '\n' && !isblank (*(hvalend + 1)))
+				break;
+		
+		value = g_strndup (hvalptr, (gint) (hvalend - hvalptr));
+		
+		header_unfold (value);
+		g_strstrip (value);
+		
+		switch (type) {
+		case CONTENT_DESCRIPTION: {
+			char *description = g_mime_utils_8bit_header_decode (value);
+			
+			g_strstrip (description);
+			g_mime_part_set_content_description (mime_part, description);
+			g_free (description);
+			
+			break;
+		}
+		case CONTENT_LOCATION:
+			g_mime_part_set_content_location (mime_part, value);
+			break;
+		case CONTENT_MD5:
+			g_mime_part_set_content_md5 (mime_part, value);
+			break;
+		case CONTENT_ID:
+			g_mime_part_set_content_id (mime_part, value);
+			break;
+		case CONTENT_TRANSFER_ENCODING:
+			g_mime_part_set_encoding (mime_part, g_mime_part_encoding_from_string (value));
+			break;
+		case CONTENT_TYPE: {
+			GMimeContentType *mime_type;
+			
+			mime_type = g_mime_content_type_new_from_string (value);
+			
+			*is_multipart = g_mime_content_type_is_type (mime_type, "multipart", "*");
+			if (*is_multipart) {
+				const char *b;
+				
+				b = g_mime_content_type_get_parameter (mime_type, "boundary");
+				if (b != NULL) {
+					/* create our temp boundary vars */
+					*boundary = g_strdup_printf ("--%s\n", b);
+					*end_boundary = g_strdup_printf ("--%s--\n", b);
+				} else {
+					g_warning ("Invalid MIME structure: boundary not found for multipart"
+						   " - defaulting to text/plain.");
+					
+					/* let's continue onward as if this was not a multipart */
+					g_mime_content_type_destroy (mime_type);
+					mime_type = g_mime_content_type_new ("text", "plain");
+					is_multipart = FALSE;
+				}
+			}
+			g_mime_part_set_content_type (mime_part, mime_type);
+			
+			break;
+		}
+		case CONTENT_DISPOSITION: {
+			char *disposition, *ptr;
+			
+			/* get content disposition part */
+			for (ptr = value; *ptr && *ptr != ';'; ptr++); /* find ; or \0 */
+			disposition = g_strndup (value, (gint)(ptr - value));
+			g_strstrip (disposition);
+			g_mime_part_set_content_disposition (mime_part, disposition);
+			g_free (disposition);
+			
+			/* parse the parameters, if any */
+			while (*ptr == ';') {
+				char *pname, *pval;
+				
+				/* get the param name */
+				for (pname = ptr + 1; *pname && !isspace ((int)*pname); pname++);
+				for (ptr = pname; *ptr && *ptr != '='; ptr++);
+				pname = g_strndup (pname, (gint) (ptr - pname));
+				g_strstrip (pname);
+				
+				/* convert param name to lowercase */
+				g_strdown (pname);
+				
+				/* skip any whitespace */
+				for (pval = ptr + 1; *pval && isspace ((int) *pval); pval++);
+				
+				if (*pval == '"') {
+					/* value is in quotes */
+					pval++;
+					for (ptr = pval; *ptr; ptr++)
+						if (*ptr == '"' && *(ptr - 1) != '\\')
+							break;
+					pval = g_strndup (pval, (gint) (ptr - pval));
+					g_strstrip (pval);
+					g_mime_utils_unquote_string (pval);
+					
+					for ( ; *ptr && *ptr != ';'; ptr++);
+				} else {
+					/* value is not in quotes */
+					for (ptr = pval; *ptr && *ptr != ';'; ptr++);
+					pval = g_strndup (pval, (gint) (ptr - pval));
+					g_strstrip (pval);
+				}
+				
+				g_mime_part_add_content_disposition_parameter (mime_part, pname, pval);
+				
+				g_free (pname);
+				g_free (pval);
+			}
+			
+			break;
+		}
+		default:
+			/* ignore this header */
+			break;
+		}
+		
+		g_free (value);
+		inptr = hvalend + 1;
+	}
+}
+
+static GMimePart *
+g_mime_parser_construct_part_internal (GMimeStream *stream, GByteArray *headers,
+				       const char *parent_boundary,
+				       const char *parent_end_boundary,
+				       int *parent_boundary_found)
+{
+	GMimePart *mime_part;
+	char *boundary;
+	char *end_boundary;
+	gboolean is_multipart;
+	
+	mime_part = g_mime_part_new ();
+	is_multipart = FALSE;
+	construct_content_headers (mime_part, headers, &is_multipart,
+				   &boundary, &end_boundary);
+	
+	/* Content */
+	if (is_multipart && boundary && end_boundary) {
+		/* get all the subparts */
+		GMimePart *subpart;
+		GByteArray *preface;
+		off_t start, end, pos;
+		int found;
+		
+		pos = g_mime_stream_tell (stream);
+		start = stream->bound_start;
+		end = stream->bound_end;
+		
+		preface = g_byte_array_new ();
+		parser_read_until_boundary (stream, preface, boundary, end_boundary, &found);
+		/* FIXME: save the preface? */
+		g_byte_array_free (preface, TRUE);
+		
+		while (found == FOUND_BOUNDARY) {
+			GByteArray *content_headers;
+			
+			content_headers = g_byte_array_new ();
+			parser_read_headers (stream, content_headers);
+			
+			g_mime_stream_set_bounds (stream, g_mime_stream_tell (stream), end);
+			subpart = g_mime_parser_construct_part_internal (stream, content_headers,
+									 boundary,
+									 end_boundary,
+									 &found);
+			g_mime_part_add_subpart (mime_part, subpart);
+			g_byte_array_free (content_headers, TRUE);
+		}
+		
+		g_mime_stream_set_bounds (stream, start, end);
+		
+		if (parent_boundary) {
+			parser_read_until_boundary (stream, NULL, parent_boundary,
+						    parent_end_boundary, parent_boundary_found);
+		}
+		
+		/* free our temp boundary strings */
+		g_free (boundary);
+		g_free (end_boundary);
+	} else {
+		GMimePartEncodingType encoding;
+		GMimeDataWrapper *wrapper;
+		GMimeStream *substream;
+		off_t start, end;
+		guint len;
+		
+		start = g_mime_stream_tell (stream);		
+		
+		len = parser_read_until_boundary (stream, NULL, parent_boundary,
+						  parent_end_boundary,
+						  parent_boundary_found);
+		
+		if (*parent_boundary_found != FOUND_EOS)
+			end = start + len;
+		else
+			end = g_mime_stream_tell (stream);
+		
+		encoding = g_mime_part_get_encoding (mime_part);
+		
+		substream = g_mime_stream_substream (stream, start, end);
+		wrapper = g_mime_data_wrapper_new_with_stream (substream, encoding);
+		g_mime_part_set_content_object (mime_part, wrapper);
+		g_mime_stream_unref (substream);
+	}
+	
+	return mime_part;
+}
+
+
+/**
+ * g_mime_parser_construct_part: Construct a GMimePart object
+ * @stream: raw MIME Part stream
+ *
+ * Returns a GMimePart object based on the data.
+ **/
+GMimePart *
+g_mime_parser_construct_part (GMimeStream *stream)
+{
+	GMimePart *part = NULL;
+	GByteArray *headers;
+	int found;
+	
+	g_return_val_if_fail (stream != NULL, NULL);
+	
+	headers = g_byte_array_new ();
+	parser_read_headers (stream, headers);
+	
+	if (headers->len)
+		part = g_mime_parser_construct_part_internal (stream, headers, NULL, NULL, &found);
+	
+	g_byte_array_free (headers, TRUE);
+	
+	return part;
+}
+
+static void
+construct_message_headers (GMimeMessage *message, GByteArray *headers, gboolean save_extra_headers)
+{
+	char *field, *value, *raw, *q;
+	char *inptr, *inend;
+	time_t date;
+	int offset = 0;
+	int i;
+	
+	inptr = (char *) headers->data;
+	inend = inptr + headers->len;
+	
+	for ( ; inptr < inend; inptr++) {
+		for (i = 0; fields[i]; i++)
+			if (!g_strncasecmp (fields[i], inptr, strlen (fields[i])))
+				break;
+		
+		if (!fields[i]) {
+			field = inptr;
+			for (q = field; q < inend && *q != ':'; q++);
+			field = g_strndup (field, (gint) (q - field + 1));
+			g_strstrip (field);
+		} else {
+			field = g_strdup (fields[i]);
+		}
+		
+		value = inptr + strlen (field);
+		for (q = value; q < inend; q++)
+			if (*q == '\n' && !isblank (*(q + 1)))
+				break;
+		
+		value = g_strndup (value, (gint) (q - value));
+		g_strstrip (value);
+		header_unfold (value);
+		
+		switch (i) {
+		case HEADER_FROM:
+			raw = g_mime_utils_8bit_header_decode (value);
+			g_mime_message_set_sender (message, raw);
+			g_free (raw);
+			break;
+		case HEADER_REPLY_TO:
+			raw = g_mime_utils_8bit_header_decode (value);
+			g_mime_message_set_reply_to (message, raw);
+			g_free (raw);
+			break;
+		case HEADER_TO:
+			g_mime_message_add_recipients_from_string (message, GMIME_RECIPIENT_TYPE_TO, value);
+			break;
+		case HEADER_CC:
+			g_mime_message_add_recipients_from_string (message, GMIME_RECIPIENT_TYPE_CC, value);
+			break;
+		case HEADER_BCC:
+			g_mime_message_add_recipients_from_string (message, GMIME_RECIPIENT_TYPE_BCC, value);
+			break;
+		case HEADER_SUBJECT:
+			raw = g_mime_utils_8bit_header_decode (value);
+			g_mime_message_set_subject (message, raw);
+			g_free (raw);
+			break;
+		case HEADER_DATE:
+			date = g_mime_utils_header_decode_date (value, &offset);
+			g_mime_message_set_date (message, date, offset);
+			break;
+		case HEADER_MESSAGE_ID:
+			raw = g_mime_utils_8bit_header_decode (value);
+			g_mime_message_set_message_id (message, raw);
+			g_free (raw);
+			break;
+		case HEADER_UNKNOWN:
+		default:
+			break;
+		}
+		
+		/* possibly save the raw header */
+		if ((save_extra_headers || fields[i]) && !special_header (field)) {
+			field[strlen (field) - 1] = '\0'; /* kill the ';' */
+			g_strstrip (field);
+			g_mime_header_set (message->header->headers, field, value);
+		}
+		
+		g_free (field);
+		g_free (value);
+		
+		if (q >= inend)
+			break;
+		else
+			inptr = q;
+	}
+}
+
+
+/**
+ * g_mime_parser_construct_message: Construct a GMimeMessage object
+ * @stream: an rfc0822 message stream
+ * @save_extra_headers: if TRUE, then store the arbitrary headers
+ *
+ * Returns a GMimeMessage object based on the rfc0822 data.
+ **/
+GMimeMessage *
+g_mime_parser_construct_message (GMimeStream *stream, gboolean save_extra_headers)
+{
+	GMimeMessage *message = NULL;
+	GByteArray *headers;
+	
+	g_return_val_if_fail (stream != NULL, NULL);
+	
+	headers = g_byte_array_new ();
+	parser_read_headers (stream, headers);
+	
+	if (headers->len) {
+		GMimePart *part;
+		int found;
+		
+		message = g_mime_message_new ();
+		construct_message_headers (message, headers, save_extra_headers);
+		part = g_mime_parser_construct_part_internal (stream, headers, NULL, NULL, &found);
+		g_mime_message_set_mime_part (message, part);
+	}
+	
+	g_byte_array_free (headers, TRUE);
+	
+	return message;
+}
author	Jeffrey Stedfast <fejj@ximian.com>	2001-09-29 21:35:32 +0000
committer	Jeffrey Stedfast <fejj@src.gnome.org>	2001-09-29 21:35:32 +0000
commit	b775ca73f16a217f152e32523db855a16e70d0f7 (patch)
tree	4c480bd86bd601394847e9e24404ea89c6243053
parent	47a0415345eb7f10bb4914870613de0d8ef96331 (diff)
download	gmime-b775ca73f16a217f152e32523db855a16e70d0f7.tar.gz