tracker-extract-pdf: Fix timeout situation with PDF extraction

The 10 second timeout was always being used for every PDF extraction because writing to the pipe was blocking and only killing the process would result in reading from the parent of the fork(). This meant all extractions were much slower than they needed to be. This approach uses select() instead of signal handling to make sure the FDs from pipe() are read at the right times too. This fixes GB#680897
author: Martyn Russell <martyn@lanedo.com> 2012-08-16 16:21:35 +0100
committer: Martyn Russell <martyn@lanedo.com> 2012-10-24 16:44:54 +0100
commit: 12a410356bf0f0a594df36db371310d763a7ec00 (patch)
tree: 9fa41a64ec0a92003e409b1a0135467ae9fda9bd
parent: 3b8aa71d90b6b5d0edb3fdcd1d0dfa9e855e13b9 (diff)
download: tracker-12a410356bf0f0a594df36db371310d763a7ec00.tar.gz
1 files changed, 245 insertions, 92 deletions
diff --git a/src/tracker-extract/tracker-extract-pdf.c b/src/tracker-extract/tracker-extract-pdf.c
index 85066b323..ffff4d937 100644
--- a/src/tracker-extract/tracker-extract-pdf.c
+++ b/src/tracker-extract/tracker-extract-pdf.c
@@ -25,15 +25,16 @@
 #define _GNU_SOURCE
 #endif
 
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/select.h>
+#include <sys/wait.h>
 #include <errno.h>
 #include <fcntl.h>
 #include <string.h>
-#include <sys/types.h>
-#include <sys/stat.h>
 #include <unistd.h>
-#include <sys/mman.h>
 #include <stdio.h>
-#include <sys/wait.h>
 #include <stdlib.h>
 
 #include <glib.h>
@@ -51,6 +52,13 @@
 
 #include "tracker-main.h"
 
+/* Time in seconds before we kill the forked child process used for
+ * content extraction */
+#define EXTRACTION_PROCESS_TIMEOUT 10
+
+/* Size of the buffer to use when reading, in bytes */
+#define BUFFER_SIZE 65535
+
 typedef struct {
 	gchar *title;
 	gchar *subject;
@@ -193,7 +201,7 @@ read_outline (PopplerDocument      *document,
 }
 
 static GString *
-extract_content_util (PopplerDocument *document,
+extract_content_text (PopplerDocument *document,
                       gsize            n_bytes)
 {
 	gint n_pages, i = 0;
@@ -206,8 +214,7 @@ extract_content_util (PopplerDocument *document,
 	timer = g_timer_new ();
 
 	while (i < n_pages &&
-	       remaining_bytes > 0 &&
-	       g_timer_elapsed (timer, NULL) < 5) {
+	       remaining_bytes > 0) {
 		PopplerPage *page;
 		gsize written_bytes = 0;
 		gchar *text;
@@ -231,7 +238,7 @@ extract_content_util (PopplerDocument *document,
 
 		remaining_bytes -= written_bytes;
 
-		g_debug ("Extracted %" G_GSIZE_FORMAT " bytes from page %d, "
+		g_debug ("Child: Extracted %" G_GSIZE_FORMAT " bytes from page %d, "
 		         "%" G_GSIZE_FORMAT " bytes remaining",
 		         written_bytes, i, remaining_bytes);
 
@@ -239,123 +246,269 @@ extract_content_util (PopplerDocument *document,
 		g_object_unref (page);
 	}
 
-	g_debug ("Content extraction finished: %d/%d pages indexed in %lf seconds, "
+	g_debug ("Child: Content extraction finished: %d/%d pages indexed in %2.2f seconds, "
 	         "%" G_GSIZE_FORMAT " bytes extracted",
-	         i, n_pages, g_timer_elapsed (timer, NULL), (n_bytes - remaining_bytes));
+	         i,
+	         n_pages,
+	         g_timer_elapsed (timer, NULL),
+	         (n_bytes - remaining_bytes));
 
 	g_timer_destroy (timer);
 
 	return string;
 }
 
-
-static gchar *
-extract_content (PopplerDocument *document,
-                 gsize  n_bytes)
+static void
+extract_content_child_process (PopplerDocument *document,
+                               gsize            n_bytes,
+                               int              fd[2])
 {
-	pid_t childpid;
-	gchar *retval = NULL;
-	int fd[2];
+	GString *str;
+	gint64 size;
+	GOutputStream *output_stream;
+	GDataOutputStream *dataout_stream;
+
+	/* This is the child extracting the content, hopefully in time */
+
+	output_stream = g_unix_output_stream_new (fd[1], FALSE);
+	dataout_stream = g_data_output_stream_new (output_stream);
+	str = extract_content_text (document, n_bytes);
+	size = (gint64) str->len;
+
+	/* Write the results to the pipe */
+	if (g_data_output_stream_put_int64 (dataout_stream, size, NULL, NULL)) {
+		g_output_stream_write_all (output_stream,
+		                           str->str,
+		                           str->len,
+		                           NULL,
+		                           NULL,
+		                           NULL);
+	}
+
+	g_debug ("Child: Content extraction now finished in child process, "
+	         "written %" G_GSIZE_FORMAT " bytes to parent process",
+	         size);
 
-	pipe (fd);
-	childpid = fork();
+	g_string_free (str, TRUE);
+	g_object_unref (dataout_stream);
+	g_object_unref (output_stream);
 
-	if (childpid >= 0) {
-		if (childpid == 0) {
-			GString *str;
-			gint64 size;
-			GOutputStream *output_stream;
-			GDataOutputStream *dataout_stream;
+	close (fd[1]);
 
-			/* This is the child extracting the content, hopefully in time */
+	exit (0);
+}
 
-			output_stream = g_unix_output_stream_new (fd[1], FALSE);
-			dataout_stream = g_data_output_stream_new (output_stream);
-			str = extract_content_util (document, n_bytes);
-			size = (gint64) str->len;
+static gchar *
+extract_content_parent_process (PopplerDocument *document,
+                                int              fd[2],
+                                pid_t            child_pid)
+{
+	GInputStream *input_stream;
+	GDataInputStream *datain_stream;
+	GString *content = NULL;
+	GError *error = NULL;
+	GTimer *timer = NULL;
+	gsize bytes_expected = -1;
+	gboolean timed_out = FALSE;
+	gboolean finished = FALSE;
+	struct timeval timeout;
+	fd_set rfds;
+
+	/* This is the parent process waiting for the content extractor to
+	 * finish in time. */
+
+	g_debug ("Parent: Content extraction now starting in child process (pid = %d)", child_pid);
+
+	/* Set up gio streams */
+	input_stream = g_unix_input_stream_new (fd[0], FALSE);
+	datain_stream = g_data_input_stream_new (input_stream);
+
+	/* Watch FD to see when it has input. */
+	FD_ZERO(&rfds);
+	FD_SET(fd[0], &rfds);
+
+	/* We give the content extractor 10 seconds to do its job */
+	timeout.tv_sec = EXTRACTION_PROCESS_TIMEOUT;
+	timeout.tv_usec = 0;
+
+	/* We also use our own timer because timeouts in select()
+	 * can be inconsistent across UNIX platforms. Some update the
+	 * timeout and some don't.
+	 */
+	timer = g_timer_new ();
 
-			/* Write the results to the pipe */
-			if (g_data_output_stream_put_int64 (dataout_stream, size, NULL, NULL)) {
-				g_output_stream_write_all (output_stream, str->str, str->len, NULL,
-				                           NULL, NULL);
+	/* So, this is fairly simple, what we're doing here is using
+	 * select() to know when the child process has written some or
+	 * all the data and we then avoid the child blocking by
+	 * reading from that stream. We couple with this with a
+	 * timeout of 10 seconds so if we receive nothing then we know
+	 * we can kill the process because it is taking too long.
+	 *
+	 * We use waitpid() to know if the process quit because it has
+	 * finished or if it is still processing data and needs to be
+	 * killed.
+	 */
+	while (!finished) {
+		int retval;
+
+		/* 1a. Wait for data on the FD and limit by timeout */
+		retval = select (fd[0] + 1,  &rfds, NULL, NULL, &timeout);
+
+		/* 2. Did we error? Have data? or just timeout? */
+		if (retval == -1) {
+			perror ("select()");
+			finished = TRUE;
+		} else if (retval == 1) {
+			gsize bytes_remaining;
+			gboolean read_finished = FALSE;
+			int state = 0;
+
+			g_debug ("Parent: fd is set is %d", FD_ISSET(fd[0], &rfds));
+
+			if (g_timer_elapsed (timer, NULL) >= EXTRACTION_PROCESS_TIMEOUT) {
+				finished = TRUE;
+				timed_out = TRUE;
+				continue;
 			}
 
-			g_string_free (str, TRUE);
-			g_object_unref (dataout_stream);
-			g_object_unref (output_stream);
+			/* 3. Start reading data */
+			if (bytes_expected == -1) {
+				/* We only need to read the size once before the data! */
+				bytes_expected = (gsize) g_data_input_stream_read_int64 (datain_stream,
+				                                                         NULL,
+				                                                         &error);
+				if (error) {
+					g_warning ("Call to g_data_input_stream_read_int64() failed, %s",
+					           error->message);
+					g_error_free (error);
+					finished = TRUE;
+					continue;
+				}
 
-			close (fd[1]);
-			exit (0);
-		} else {
-			gboolean failed = FALSE;
-			sigset_t mask;
-			sigset_t orig_mask;
-			struct timespec timeout;
+				g_debug ("Parent: Expected bytes to read is %" G_GSSIZE_FORMAT "", bytes_expected);
+				bytes_remaining = bytes_expected;
+				content = g_string_new ("");
+			}
 
-			/* This is the parent process waiting for the content extractor to
-			 * finish in time. */
+			/* 4. Read until done from stream and concatenate data */
+			while (!read_finished) {
+				gchar buf[BUFFER_SIZE];
+				gsize bytes_read;
+
+				memset (buf, 0, BUFFER_SIZE);
+				bytes_read = g_input_stream_read (G_INPUT_STREAM (datain_stream),
+				                                  buf,
+				                                  MIN (BUFFER_SIZE, bytes_remaining),
+				                                  NULL,
+				                                  &error);
+
+				g_debug ("Parent:   Bytes read is %" G_GSSIZE_FORMAT ","
+				         "bytes remaining is %" G_GSSIZE_FORMAT "",
+				         bytes_read,
+				         MAX (bytes_remaining - bytes_read, 0));
+
+				if (bytes_read == -1 || error) {
+					g_warning ("Call to g_input_stream_read() failed, %s",
+					           error ? error->message : "no error given");
+					g_clear_error (&error);
+					read_finished = TRUE;
+					finished = TRUE;
+				} else {
+					content = g_string_append (content, buf);
 
-			sigemptyset (&mask);
-			sigaddset (&mask, SIGCHLD);
+					bytes_remaining -= bytes_read;
+					bytes_remaining  = MAX (bytes_remaining, 0);
 
-			if (sigprocmask (SIG_BLOCK, &mask, &orig_mask) < 0) {
-				return NULL;
-			}
+					if (bytes_read == 0) {
+						/* We finished reading */
+						g_debug ("Parent:   Finished reading all bytes");
+						read_finished = TRUE;
+					}
 
-			/* We give the content extractor 10 seconds to do its job */
-			timeout.tv_sec = 10;
-			timeout.tv_nsec = 0;
-
-			do {
-				if (sigtimedwait (&mask, NULL, &timeout) < 0) {
-					if (errno == EINTR) {
-						continue;
-					} else if (errno == EAGAIN) {
-						g_warning ("Content extraction of PDF Timed out\n");
-						kill (childpid, SIGKILL);
-						failed = TRUE;
+					/* Are we finished reading everything */
+					if (bytes_remaining < 1) {
+						finished = TRUE;
 					}
 				}
-				break;
-			} while (1);
+			}
 
-			if (!failed) {
-				GInputStream *input_stream;
-				GDataInputStream *datain_stream;
-				gsize to_read, nbytes;
-				GError *error = NULL;
+			/* 5. Check if the process exited yet or not */
+			retval = waitpid (child_pid, &state, WNOHANG);
+			if (retval == -1) {
+				perror ("waitpid()");
+			} else if (retval == child_pid) {
+				g_debug ("Parent: Child process exited");
+				finished = TRUE;
+				continue;
+			} else {
+				/* If retval is 0 then nothing changed
+				 * and we go round again */
+			}
+		} else {
+			/* 3. We must have timed out with no data in select() */
+			finished = TRUE;
+			timed_out = TRUE;
+		}
+	}
 
-				/* If the child was in time, we read its results from the pipe */
+	if (timed_out) {
+		g_debug ("Parent: Child process took too long. We waited %d seconds, so we're going to kill it!",
+		         EXTRACTION_PROCESS_TIMEOUT);
+		kill (child_pid, SIGKILL);
+	} else {
+		g_debug ("Parent: Data received in %2.2f seconds (timeout is %d seconds)",
+		         g_timer_elapsed (timer, NULL),
+		         EXTRACTION_PROCESS_TIMEOUT);
+	}
 
-				input_stream = g_unix_input_stream_new (fd[0], FALSE);
-				datain_stream = g_data_input_stream_new (input_stream);
+	g_timer_destroy (timer);
 
-				to_read = (gsize) g_data_input_stream_read_int64 (datain_stream,
-				                                                  NULL, &error);
+	g_object_unref (datain_stream);
+	g_object_unref (input_stream);
 
-				if (!error) {
-					retval = g_malloc0 (to_read + 1);
-					if (!g_input_stream_read_all (input_stream, retval, to_read,
-					                              &nbytes, NULL, NULL)) {
-						g_free (retval);
-						retval = NULL;
-					}
-				} else {
-					g_error_free (error);
-				}
+	close (fd[0]);
 
-				g_object_unref (datain_stream);
-				g_object_unref (input_stream);
-			}
+	return content ? g_string_free (content, FALSE) : NULL;
+}
+
+static gchar *
+extract_content (PopplerDocument *document,
+                 gsize            n_bytes)
+{
+	pid_t child_pid;
+	int fd[2];
+	sigset_t mask;
+	sigset_t orig_mask;
+
+	if (pipe (fd) == -1) {
+		g_warning ("Content extraction failed, call to pipe() failed");
+		return NULL;
+	}
+
+	/* Set sig mask before fork() to avoid race conditions */
+	sigemptyset (&mask);
+	sigaddset (&mask, SIGCHLD);
+
+	if (sigprocmask (SIG_SETMASK, &mask, &orig_mask) == -1) {
+		g_warning ("Content extraction failed, call to sigprocmask() failed");
+		return NULL;
+	}
+
+	child_pid = fork ();
+
+	if (child_pid == -1) {
+		g_warning ("Content extraction failed, call to fork() failed");
 
-			close (fd[0]);
-		}
-	} else {
 		close (fd[0]);
 		close (fd[1]);
 	}
 
-	return retval;
+	if (child_pid == 0) {
+		extract_content_child_process (document, n_bytes, fd);
+		return NULL;
+	}
+
+	return extract_content_parent_process (document, fd, child_pid);
 }
 
 static void
author	Martyn Russell <martyn@lanedo.com>	2012-08-16 16:21:35 +0100
committer	Martyn Russell <martyn@lanedo.com>	2012-10-24 16:44:54 +0100
commit	12a410356bf0f0a594df36db371310d763a7ec00 (patch)
tree	9fa41a64ec0a92003e409b1a0135467ae9fda9bd
parent	3b8aa71d90b6b5d0edb3fdcd1d0dfa9e855e13b9 (diff)
download	tracker-12a410356bf0f0a594df36db371310d763a7ec00.tar.gz