src/tracker-extract/tracker-read.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384

/*
 * Copyright (C) 2010, Nokia <ivan.frade@nokia.com>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
 * Boston, MA  02110-1301, USA.
 */

#include "config.h"

#include <string.h>
#include <unistd.h>
#include <fcntl.h>

#include <glib.h>
#include <gio/gio.h>

#include <libtracker-extract/tracker-extract.h>

#include "tracker-read.h"

/* Size, in bytes, of the chunk buffer used when reading.
 *
 * Both tracker_read_text_from_stream() and tracker_read_text_from_fd()
 * declare a local buffer of this size, read at most this many bytes per
 * iteration, and pass this value to process_chunk() as the size a first
 * chunk must reach before the "no '\n' found" rejection is applied. */
#define BUFFER_SIZE 65535

/* Tries to convert @str (@str_len bytes, not necessarily NUL-terminated)
 * to UTF-8 by guessing its source encoding.
 *
 * Guessing order:
 *  - if the data holds an embedded NUL byte, only "UTF-16" is tried;
 *  - otherwise, if the locale charset is UTF-8, only "windows-1252" is tried;
 *  - otherwise the locale charset is tried first, then "windows-1252".
 *
 * A conversion is accepted only if it consumed all @str_len input bytes.
 *
 * Returns: a newly-allocated UTF-8 string (free with g_free()) with its
 * length in bytes stored in @utf8_len, or %NULL if no candidate encoding
 * worked (in which case @utf8_len is left untouched).
 */
static gchar *
get_string_from_guessed_encoding (const gchar *str,
                                  gsize        str_len,
                                  gsize       *utf8_len)
{
	const gchar *codeset = NULL;

	if (memchr (str, '\0', str_len) != NULL) {
		/* Embedded NULs: go straight for UTF-16 */
		codeset = "UTF-16";
	} else if (g_get_charset (&codeset)) {
		/* g_get_charset() returns TRUE when the locale charset is
		 * UTF-8; the caller already knows the text is not valid UTF-8,
		 * so go for windows-1252 instead. When it returns FALSE,
		 * codeset is left pointing to the locale charset name. */
		codeset = "windows-1252";
	}

	while (codeset != NULL) {
		gsize n_read = 0;
		gsize n_written = 0;
		gboolean is_last_candidate;
		gchar *converted;

		converted = g_convert (str,
		                       str_len,
		                       "UTF-8",
		                       codeset,
		                       &n_read,
		                       &n_written,
		                       NULL);

		/* Only a conversion which consumed the whole input is valid */
		if (converted != NULL && n_read == str_len) {
			g_debug ("Converted %" G_GSIZE_FORMAT " bytes in '%s' codeset "
			         "to %" G_GSIZE_FORMAT " bytes in UTF-8",
			         n_read,
			         codeset,
			         n_written);
			*utf8_len = n_written;
			return converted;
		}

		g_free (converted);
		g_debug ("Text not in '%s' encoding", codeset);

		/* windows-1252 and UTF-16 are final attempts; any other
		 * codeset (i.e. the locale one) falls back to windows-1252. */
		is_last_candidate = (strcmp (codeset, "windows-1252") == 0 ||
		                     strcmp (codeset, "UTF-16") == 0);
		codeset = is_last_candidate ? NULL : "windows-1252";
	}

	return NULL;
}

/* Processes one chunk of raw bytes just read from the input.
 *
 * @read_bytes:     the bytes read (not necessarily NUL-terminated)
 * @read_size:      number of bytes in @read_bytes
 * @buffer_size:    size of the read buffer; a first chunk of exactly this
 *                  size must contain a '\n' to be accepted
 * @remaining_size: in/out, bytes left until the caller's limit is reached;
 *                  decremented by @read_size when the chunk is accepted
 * @s:              in/out, accumulated contents; created on the first
 *                  accepted chunk, appended to afterwards
 *
 * Returns %TRUE if read operation should continue, %FALSE otherwise
 * (nothing read, or the first chunk was rejected; in both cases @s and
 * @remaining_size are left untouched).
 */
static gboolean
process_chunk (const gchar  *read_bytes,
               gsize         read_size,
               gsize         buffer_size,
               gsize        *remaining_size,
               GString     **s)
{
	/* If no more bytes to read, halt loop */
	if (read_size == 0) {
		return FALSE;
	}

	/* First of all, check if this is the first time we
	 * have tried to read the stream up to the BUFFER_SIZE
	 * limit. Then make sure that we read the maximum size
	 * of the buffer. If we don't do this, there is the
	 * case where we read 10 bytes in and it is just one
	 * line with no '\n'. Once we have confirmed this we
	 * check that the buffer has a '\n' to make sure the
	 * file is worth indexing. Similarly if the file has
	 * <= 3 bytes then we drop it.
	 *
	 * NOTE: We may have non-UTF8 content read (say,
	 * UTF-16LE), so we can't rely on methods which assume
	 * NUL-terminated strings, as g_strstr_len(). memchr()
	 * takes an explicit length and is therefore safe here.
	 */
	if (*s == NULL) {
		if (read_size <= 3) {
			g_debug ("  File has less than 3 characters in it, "
			         "not indexing file");
			return FALSE;
		}

		/* Scan the WHOLE chunk, including its last byte, for an
		 * end-of-line marker. */
		if (read_size == buffer_size &&
		    memchr (read_bytes, '\n', read_size) == NULL) {
			g_debug ("  No '\\n' in the first %" G_GSIZE_FORMAT " bytes, "
			         "not indexing file",
			         read_size);
			return FALSE;
		}
	}

	/* Update remaining bytes */
	*remaining_size -= read_size;

	g_debug ("  Read "
	         "%" G_GSIZE_FORMAT " bytes from file, %" G_GSIZE_FORMAT " "
	         "bytes remaining until configured threshold is reached",
	         read_size,
	         *remaining_size);

	/* Append non-NUL terminated bytes */
	if (*s == NULL) {
		*s = g_string_new_len (read_bytes, read_size);
	} else {
		g_string_append_len (*s, read_bytes, read_size);
	}

	return TRUE;
}

/* Turns the raw bytes accumulated in @s into a NUL-terminated UTF-8 string.
 *
 * Takes ownership of @s: it is always freed (or its buffer stolen) here.
 *
 * Steps:
 *  1. If the data starts with a UTF-16 BOM, convert it from UTF-16LE/BE.
 *  2. Otherwise validate the bytes as UTF-8.
 *  3. If more than 3 trailing bytes are not valid UTF-8, retry through
 *     get_string_from_guessed_encoding(); if 1 to 3 are not, truncate at
 *     the last valid character.
 *
 * Returns: newly-allocated UTF-8 string (free with g_free()), or %NULL if
 * the conversion failed or the resulting text is empty.
 */
static gchar *
process_whole_string (GString  *s)
{
	const gchar *utf16_codeset = NULL;
	gchar *utf8 = NULL;
	gsize  utf8_len = 0;
	gsize n_valid_utf8_bytes = 0;

	/* Support also UTF-16 encoded text files, as the ones generated in
	 * Windows OS. We will only accept text files in UTF-16 which come
	 * with a proper BOM. */
	if (s->len > 2) {
		if (memcmp (s->str, "\xFF\xFE", 2) == 0) {
			g_debug ("String comes in UTF-16LE, converting");
			utf16_codeset = "UTF-16LE";
		} else if (memcmp (s->str, "\xFE\xFF", 2) == 0) {
			g_debug ("String comes in UTF-16BE, converting");
			utf16_codeset = "UTF-16BE";
		}
	}

	if (utf16_codeset) {
		GError *error = NULL;

		/* Skip the 2-byte BOM when converting */
		utf8 = g_convert (&(s->str[2]),
		                  s->len - 2,
		                  "UTF-8",
		                  utf16_codeset,
		                  NULL,
		                  &utf8_len,
		                  &error);
		g_string_free (s, TRUE);

		if (!utf8) {
			g_warning ("Couldn't convert string from UTF-16 to UTF-8...: %s",
			           error ? error->message : "unknown error");
			g_clear_error (&error);
			return NULL;
		}

		/* Output of a successful conversion is taken as fully valid */
		n_valid_utf8_bytes = utf8_len;
	} else {
		/* Steal the buffer from the GString, no copy needed */
		utf8_len = s->len;
		utf8 = g_string_free (s, FALSE);

		/* Get number of valid UTF-8 bytes found */
		tracker_text_validate_utf8 (utf8,
					    utf8_len,
					    NULL,
					    &n_valid_utf8_bytes);
	}

	/* A valid UTF-8 file will be that where all read bytes are valid,
	 *  with a margin of 3 bytes for the last UTF-8 character which might
	 *  have been cut. */
	if (utf8_len - n_valid_utf8_bytes > 3) {
		gchar *from_guessed_str;
		gsize  from_guessed_str_len;

		/* If not UTF-8, try to get contents in guessed encoding
		 *  (returns valid UTF-8) */
		from_guessed_str = get_string_from_guessed_encoding (utf8,
		                                                     utf8_len,
		                                                     &from_guessed_str_len);
		g_free (utf8);
		if (!from_guessed_str)
			return NULL;
		utf8 = from_guessed_str;
		utf8_len = from_guessed_str_len;
	} else if (n_valid_utf8_bytes < utf8_len) {
		g_debug ("  Truncating to last valid UTF-8 character "
		         "(%" G_GSIZE_FORMAT "/%" G_GSIZE_FORMAT " bytes)",
		         n_valid_utf8_bytes,
		         utf8_len);
		utf8[n_valid_utf8_bytes] = '\0';
		utf8_len = n_valid_utf8_bytes;
	}

	/* Nothing useful left: report it as no text at all */
	if (utf8_len < 1) {
		g_free (utf8);
		return NULL;
	}

	return utf8;
}

/**
 * tracker_read_text_from_stream:
 * @stream: input stream to read from
 * @max_bytes: max number of bytes to read from @stream
 *
 * Reads at most @max_bytes bytes from @stream, in chunks of up to
 * BUFFER_SIZE bytes, and returns the text found as valid UTF-8.
 *
 * Contents which are not valid UTF-8 are additionally tried as UTF-16,
 * as the current locale encoding, or as windows-1252.
 *
 * Returns: newly-allocated NUL-terminated UTF-8 string with the read text,
 * or %NULL if nothing usable could be read.
 **/
gchar *
tracker_read_text_from_stream (GInputStream *stream,
                               gsize         max_bytes)
{
	gchar chunk[BUFFER_SIZE];
	GString *text = NULL;
	gsize bytes_left = max_bytes;
	gboolean keep_reading = TRUE;

	g_return_val_if_fail (stream, NULL);
	g_return_val_if_fail (max_bytes > 0, NULL);

	/* Chunked read. Reading stops as soon as any of these holds:
	 *   a) @max_bytes bytes were already read
	 *   b) the stream has no more bytes
	 *   c) a read error happened
	 *   d) the first chunk is too short (see process_chunk())
	 *   e) the first chunk is a full BUFFER_SIZE one with no EOL in it
	 */
	while (keep_reading && bytes_left > 0) {
		GError *read_error = NULL;
		gsize chunk_len;
		gsize wanted = MIN (BUFFER_SIZE, bytes_left);

		if (g_input_stream_read_all (stream,
		                             chunk,
		                             wanted,
		                             &chunk_len,
		                             NULL,
		                             &read_error)) {
			/* Accumulate the chunk; FALSE means we are done */
			keep_reading = process_chunk (chunk,
			                              chunk_len,
			                              BUFFER_SIZE,
			                              &bytes_left,
			                              &text);
		} else {
			g_message ("Error reading from stream: '%s'",
			           read_error->message);
			g_error_free (read_error);
			keep_reading = FALSE;
		}
	}

	/* No accepted chunk at all means no text */
	if (text == NULL) {
		return NULL;
	}

	/* Validate/convert to UTF-8 whatever was accumulated */
	return process_whole_string (text);
}


/**
 * tracker_read_text_from_fd:
 * @fd: input fd to read from
 * @max_bytes: max number of bytes to read from @fd
 *
 * Reads up to @max_bytes from @fd, and validates the read text as proper
 *  UTF-8. Will also properly close the FD when finishes, including when
 *  the FD cannot be opened as a stdio stream.
 *
 * Note that if @max_bytes is 0 the precondition check fails and the
 * function returns before touching @fd, which is then NOT closed.
 *
 * If the input text is not UTF-8 it will also try to decode it based on the
 * current locale, or windows-1252, or UTF-16.
 *
 * Returns: newly-allocated NUL-terminated UTF-8 string with the read text,
 * or %NULL if nothing usable could be read.
 **/
gchar *
tracker_read_text_from_fd (gint  fd,
                           gsize max_bytes)
{
	FILE *fz;
	GString *s = NULL;
	gsize n_bytes_remaining = max_bytes;

	g_return_val_if_fail (max_bytes > 0, NULL);

	if ((fz = fdopen (fd, "r")) == NULL) {
		g_warning ("Cannot read from FD... could not extract text");
		close (fd);
		return NULL;
	}

	/* Reading in chunks of BUFFER_SIZE
	 *   Loop is halted whenever one of this conditions is met:
	 *     a) Read bytes reached the maximum allowed (max_bytes)
	 *     b) No more bytes to read
	 *     c) Error reading
	 *     d) Stream has less than 3 bytes
	 *     e) Stream has a single line of BUFFER_SIZE bytes with no EOL
	 */
	while (n_bytes_remaining > 0) {
		gchar buf[BUFFER_SIZE];
		gsize n_bytes_read;

		/* Read bytes */
		n_bytes_read = fread (buf,
		                      1,
		                      MIN (BUFFER_SIZE, n_bytes_remaining),
		                      fz);

		/* Process read bytes, and halt loop if needed */
		if (!process_chunk (buf,
		                    n_bytes_read,
		                    BUFFER_SIZE,
		                    &n_bytes_remaining,
		                    &s)) {
			break;
		}
	}

	/* Close the file here */
#ifdef HAVE_POSIX_FADVISE
	{
		/* posix_fadvise() returns the error number directly and does
		 * not set errno, so "%m" can't be used to report it. */
		gint fadvise_error;

		fadvise_error = posix_fadvise (fd, 0, 0, POSIX_FADV_DONTNEED);
		if (fadvise_error != 0)
			g_warning ("posix_fadvise() call failed: %s",
			           g_strerror (fadvise_error));
	}
#endif /* HAVE_POSIX_FADVISE */
	/* fclose() also closes the underlying fd */
	fclose (fz);

	/* Validate UTF-8 if something was read, and return it */
	return s ? process_whole_string (s) : NULL;
}