/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
/*
* Copyright (C) 1999-2008 Novell, Inc. (www.novell.com)
*
* This library is free software: you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation.
*
* This library is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
* or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
* for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this library. If not, see <http://www.gnu.org/licenses/>.
*
* Authors: Jeffrey Stedfast
*/
#include "evolution-data-server-config.h"
#include <ctype.h>
#include <stdio.h>
#include <string.h>
#include "camel-trie.h"
#include "camel-url-scanner.h"
#include "camel-utf8.h"
/* Private state of a URL scanner, created by camel_url_scanner_new(). */
struct _CamelUrlScanner {
/* CamelUrlPattern pointers in the order they were passed to
 * camel_url_scanner_add(); the array index of a pattern is also the
 * id under which its pattern string is stored in the trie. */
GPtrArray *patterns;
/* Trie holding the pattern strings; searched by camel_url_scanner_scan()
 * to locate candidate positions in the input. */
CamelTrie *trie;
};
/**
 * camel_url_scanner_new: (skip)
 *
 * Allocates a scanner that has no patterns registered yet. Patterns are
 * added afterwards with camel_url_scanner_add().
 *
 * Returns: (transfer full): a newly created #CamelUrlScanner; release it
 *    with camel_url_scanner_free()
 **/
CamelUrlScanner *
camel_url_scanner_new (void)
{
	CamelUrlScanner *self = g_slice_new0 (CamelUrlScanner);

	self->trie = camel_trie_new (TRUE);
	self->patterns = g_ptr_array_new ();

	return self;
}
/**
 * camel_url_scanner_free: (skip)
 * @scanner: a #CamelUrlScanner
 *
 * Releases the trie, the pattern array and the @scanner structure itself.
 * The pattern array is created without an element free function, so the
 * #CamelUrlPattern structures that were added are left untouched.
 **/
void
camel_url_scanner_free (CamelUrlScanner *scanner)
{
	g_return_if_fail (scanner != NULL);

	camel_trie_free (scanner->trie);
	g_ptr_array_free (scanner->patterns, TRUE);

	g_slice_free (CamelUrlScanner, scanner);
}
/**
 * camel_url_scanner_add: (skip)
 * @scanner: a #CamelUrlScanner
 * @pattern: a #CamelUrlPattern to add
 *
 * Registers @pattern with the @scanner. The pattern string is inserted
 * into the scanner's trie under an id equal to the pattern's index in the
 * scanner's pattern array. Only the pointer is stored, not a copy, so
 * @pattern has to stay valid for as long as the @scanner is used.
 **/
void
camel_url_scanner_add (CamelUrlScanner *scanner,
                       CamelUrlPattern *pattern)
{
	g_return_if_fail (scanner != NULL);
	/* both are dereferenced below; reject them the same way as a NULL scanner */
	g_return_if_fail (pattern != NULL);
	g_return_if_fail (pattern->pattern != NULL);

	/* the id handed to the trie is the index the pattern is about to get */
	camel_trie_add (scanner->trie, pattern->pattern, scanner->patterns->len);
	g_ptr_array_add (scanner->patterns, pattern);
}
/**
 * camel_url_scanner_scan: (skip)
 * @scanner: a #CamelUrlScanner object.
 * @in: (array length=inlen) (type gchar): the text to scan.
 * @inlen: length of the @in array, in bytes.
 * @match: a #CamelUrlMatch structure to fill in.
 *
 * Searches @in for the first occurrence of any pattern registered with
 * camel_url_scanner_add() which is also accepted by both the start and
 * the end callback of that pattern. The pattern and prefix members of
 * @match are overwritten for every candidate that is tried, thus they
 * can be modified even when the function returns %FALSE. Input which is
 * not valid UTF-8 is never scanned.
 *
 * Returns: %TRUE if a match was found and stored in @match, %FALSE
 *    otherwise.
 **/
gboolean
camel_url_scanner_scan (CamelUrlScanner *scanner,
                        const gchar *in,
                        gsize inlen,
                        CamelUrlMatch *match)
{
	const gchar *pos;
	const guchar *inptr, *inend;
	CamelUrlPattern *pat;
	gint pattern;

	g_return_val_if_fail (scanner != NULL, FALSE);
	g_return_val_if_fail (in != NULL, FALSE);
	/* match is written through for every candidate, so it is required */
	g_return_val_if_fail (match != NULL, FALSE);

	inptr = (const guchar *) in;
	inend = inptr + inlen;

	/* check validity of a string first */
	if (!g_utf8_validate (in, inlen, NULL))
		return FALSE;

	do {
		/* find the next place where any of the pattern strings occurs */
		pos = camel_trie_search (scanner->trie, (const gchar *) inptr, inlen, &pattern);
		if (!pos)
			return FALSE;

		pat = g_ptr_array_index (scanner->patterns, pattern);

		match->pattern = pat->pattern;
		match->prefix = pat->prefix;

		/* the callbacks receive the beginning of the whole buffer, not
		 * of the remaining part, because a start callback may need to
		 * look backwards from pos */
		if (pat->start (in, pos, (const gchar *) inend, match) &&
		    pat->end (in, pos, (const gchar *) inend, match))
			return TRUE;

		/* candidate rejected: resume one UTF-8 character after it */
		inptr = (const guchar *) pos;
		/* 0xffff is taken as a failure of the decoder; give up then */
		if (camel_utf8_getc_limit (&inptr, inend) == 0xffff)
			break;

		inlen = inend - inptr;
	} while (inptr < inend);

	return FALSE;
}
/* stephenhay from https://mathiasbynens.be/demo/url-regex */
/* Alternation of scheme names, in regex syntax, spliced into URL_PATTERN. */
#define URL_PROTOCOLS "news|telnet|nntp|file|https?|s?ftp|webcal|localhost|ssh"
/* Regex passed to camel_url_pattern_end() by camel_url_web_end(): one of
 * the URL_PROTOCOLS followed by a colon and two slashes, or a literal
 * "www." or "ftp." prefix; then one character which is neither whitespace
 * nor one of / $ . ? #; then any single character; then a possessive run
 * of non-whitespace characters. */
#define URL_PATTERN "((?:(?:(?:" URL_PROTOCOLS ")\\:\\/\\/)|(?:www\\.|ftp\\.))[^\\s\\/\\$\\.\\?#].[^\\s]*+)"
/* Regex passed to camel_url_pattern_end() by camel_url_file_end(): same
 * shape as URL_PATTERN, but restricted to the "file" scheme followed by a
 * colon and either two or three slashes. */
#define FILE_PATTERN "((?:(?:(?:file)\\:\\/\\/)|(?:(?:file)\\:\\/\\/\\/))[^\\s\\/\\$\\.\\?#].[^\\s]*+)"
/* Character class table indexed by byte value (0-255). Each entry is a
 * bitwise OR of the IS_* flags declared right below the table and it is
 * read through the is_*() macros. The initializer is the output of the
 * generator program enclosed in "#ifdef BUILD_TABLE" at the end of this
 * file; regenerate it from there rather than editing the numbers by hand.
 * The array is not const, because that generator fills it at runtime. */
static guchar url_scanner_table[256] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 1, 1, 9, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
24,128,160,128,128,128,128,128,160,160,128,128,160,192,160,160,
68, 68, 68, 68, 68, 68, 68, 68, 68, 68,160,160, 32,128, 32,128,
160, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66,
66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66,160,160,160,128,128,
128, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66,
66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66,128,128,128,128, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
};
/* Bit flags stored in url_scanner_table[]. The descriptions below follow
 * the BUILD_TABLE generator at the end of this file. */
enum {
/* byte values below 32 and from 127 up */
IS_CTRL = (1 << 0),
/* ASCII letters a-z and A-Z */
IS_ALPHA = (1 << 1),
/* ASCII digits 0-9 */
IS_DIGIT = (1 << 2),
/* the characters listed in CHARS_LWSP (space, tab, LF, CR) */
IS_LWSP = (1 << 3),
/* the space character only */
IS_SPACE = (1 << 4),
/* the characters listed in CHARS_SPECIAL, plus '/' */
IS_SPECIAL = (1 << 5),
/* letters, digits and '-': characters accepted in a domain label */
IS_DOMAIN = (1 << 6),
/* the punctuation listed in CHARS_URLSAFE */
IS_URLSAFE = (1 << 7)
};
/* Character classification helpers. Each one casts its argument to guchar
 * before indexing url_scanner_table[], so they are safe to use with plain
 * (possibly signed) gchar values. */
/* TRUE when the IS_CTRL flag is set */
#define is_ctrl(x) ((url_scanner_table[(guchar)(x)] & IS_CTRL) != 0)
/* TRUE when the IS_LWSP flag is set */
#define is_lwsp(x) ((url_scanner_table[(guchar)(x)] & IS_LWSP) != 0)
/* TRUE when none of IS_SPECIAL, IS_SPACE and IS_CTRL is set */
#define is_atom(x) ((url_scanner_table[(guchar)(x)] & (IS_SPECIAL|IS_SPACE|IS_CTRL)) == 0)
/* TRUE when the IS_ALPHA flag is set */
#define is_alpha(x) ((url_scanner_table[(guchar)(x)] & IS_ALPHA) != 0)
/* TRUE when the IS_DIGIT flag is set */
#define is_digit(x) ((url_scanner_table[(guchar)(x)] & IS_DIGIT) != 0)
/* TRUE when the IS_DOMAIN flag is set */
#define is_domain(x) ((url_scanner_table[(guchar)(x)] & IS_DOMAIN) != 0)
/* TRUE when any of IS_ALPHA, IS_DIGIT and IS_URLSAFE is set */
#define is_urlsafe(x) ((url_scanner_table[(guchar)(x)] & (IS_ALPHA|IS_DIGIT|IS_URLSAFE)) != 0)
/* Pairs of opening and closing delimiters. In the code visible in this
 * file only the opening characters are consulted, by is_open_brace(). */
static const struct {
/* opening delimiter */
const gchar open;
/* closing delimiter paired with it */
const gchar close;
} url_braces[] = {
{ '(', ')' },
{ '{', '}' },
{ '[', ']' },
{ '<', '>' },
{ '|', '|' },
{ '\'', '\'' },
};
/* Returns TRUE when @c is the opening character of one of the delimiter
 * pairs listed in url_braces[], FALSE otherwise. */
static gboolean
is_open_brace (gchar c)
{
	gint idx = 0;

	while (idx < G_N_ELEMENTS (url_braces)) {
		if (url_braces[idx].open == c)
			return TRUE;
		idx++;
	}

	return FALSE;
}
/*
 * Finds where a URL that begins at @pos ends.
 *
 * @in: beginning of the whole scanned buffer; offsets are relative to it
 * @pos: where the URL candidate begins
 * @inend: end of the scanned buffer
 * @match: receives the end offset in its um_eo member
 * @pattern: regular expression the text at @pos has to match
 * @remove_trailing_bad_chars: whether to strip trailing punctuation
 *
 * The @pattern is compiled on every call. The match has to begin exactly
 * at @pos and be non-empty; it is then cut at the first '<' or '>', and,
 * when requested, trailing punctuation is removed. A trailing closing
 * bracket is kept when the URL itself contains an opening bracket which
 * it can pair with.
 *
 * Returns TRUE when a non-empty URL was recognized. Note that um_eo is
 * written whenever the regex matched at @pos, even if the result is FALSE.
 */
static gboolean
camel_url_pattern_end (const gchar *in,
                       const gchar *pos,
                       const gchar *inend,
                       CamelUrlMatch *match,
                       const gchar *pattern,
                       gboolean remove_trailing_bad_chars)
{
	GMatchInfo *info = NULL;
	GRegex *re;
	gboolean found = FALSE;
	gint from, to;

	re = g_regex_new (pattern, 0, 0, NULL);
	if (re == NULL)
		return FALSE;

	if (g_regex_match_all_full (re, pos, inend - pos, 0, G_REGEX_MATCH_NOTEMPTY, &info, NULL) &&
	    info != NULL &&
	    g_match_info_matches (info) &&
	    g_match_info_fetch_pos (info, 0, &from, &to) &&
	    from == 0 && to > 0 && to <= inend - pos) {
		const gchar *url_end = pos + to;
		const gchar *scan;

		/* Stop on the angle brackets, which cannot be part of the URL (see RFC 3986 Appendix C) */
		for (scan = pos; scan < url_end; scan++) {
			if (*scan == '<' || *scan == '>') {
				url_end = scan;
				break;
			}
		}

		found = url_end > pos;

		if (found && remove_trailing_bad_chars) {
			/* URLs hardly ever end with punctuation or with a closing
			 * double-quote, so peel such characters off the end, one
			 * at a time. */
			while (url_end > pos && strchr (",.:;?!-|}])\">", url_end[-1]) != NULL) {
				const gchar closer = url_end[-1];
				gchar opener;

				switch (closer) {
				case ')':
					opener = '(';
					break;
				case '}':
					opener = '{';
					break;
				case ']':
					opener = '[';
					break;
				case '>':
					opener = '<';
					break;
				default:
					opener = 0;
					break;
				}

				if (opener != 0) {
					gint opened = 0, closed = 0;

					for (scan = pos; scan < url_end; scan++) {
						if (*scan == opener)
							opened++;
						else if (*scan == closer)
							closed++;
					}

					/* The closing bracket can pair with an opening
					 * one inside the URL, in which case it belongs
					 * to the URL and the stripping ends here. */
					if (opened > 0 && opened >= closed)
						break;
				}

				url_end--;
			}
		}

		match->um_eo = (url_end - in);
	}

	g_match_info_free (info);
	g_regex_unref (re);

	return found;
}
/*
 * Start callback for e-mail addresses: given the position of the '@',
 * finds where the local part of the address begins.
 *
 * @in: beginning of the scanned buffer
 * @pos: position of the '@' character
 * @inend: end of the scanned buffer (not used here)
 * @match: receives the start offset, relative to @in, in its um_so member
 *
 * Returns TRUE when at least one character precedes the '@' as the local
 * part, FALSE otherwise; @match is left untouched in the latter case.
 */
gboolean
camel_url_addrspec_start (const gchar *in,
                          const gchar *pos,
                          const gchar *inend,
                          CamelUrlMatch *match)
{
	const gchar *inptr = pos;

	g_return_val_if_fail (*inptr == '@', FALSE);

	if (inptr > in)
		inptr--;

	/* walk backwards over dot-separated runs of atom characters */
	while (inptr > in) {
		if (is_atom (*inptr))
			inptr--;
		else
			break;

		while (inptr > in && is_atom (*inptr))
			inptr--;

		if (inptr > in && *inptr == '.')
			inptr--;
	}

	/* The backward walk can stop on a character which does not belong to
	 * the address (a delimiter, a leading dot, an opening brace), thus
	 * skip those. The walk must not go beyond the '@': it is not an atom
	 * character itself, so without the bound an input with nothing usable
	 * in front of the '@' would be read past pos and possibly past the
	 * end of the buffer. */
	while (inptr < pos && (!is_atom (*inptr) || is_open_brace (*inptr)))
		inptr++;

	if (inptr >= pos)
		return FALSE;

	match->um_so = (inptr - in);

	return TRUE;
}
/*
 * End callback for e-mail addresses: given the position of the '@',
 * finds where the domain part of the address ends.
 *
 * @in: beginning of the scanned buffer
 * @pos: position of the '@' character
 * @inend: end of the scanned buffer; nothing at or after it is read
 * @match: receives the end offset, relative to @in, in its um_eo member
 *
 * The domain is either a bracketed literal of four dot-separated groups
 * of up to three digits, or a sequence of dot-separated labels containing
 * at least one dot.
 *
 * Returns TRUE when such a domain follows the '@', FALSE otherwise;
 * @match is left untouched in the latter case.
 */
gboolean
camel_url_addrspec_end (const gchar *in,
                        const gchar *pos,
                        const gchar *inend,
                        CamelUrlMatch *match)
{
	const gchar *inptr = pos;
	gint parts = 0, digits;
	gboolean got_dot = FALSE;

	g_return_val_if_fail (*inptr == '@', FALSE);

	inptr++;

	if (inptr < inend && *inptr == '[') {
		/* domain literal */
		do {
			/* skips the '[' on the first pass, a '.' on the later ones */
			inptr++;

			digits = 0;
			while (inptr < inend && is_digit (*inptr) && digits < 3) {
				inptr++;
				digits++;
			}

			parts++;

			/* each but the last group has to be followed by a dot;
			 * running out of the input counts as a mismatch */
			if (parts != 4 && (inptr >= inend || *inptr != '.'))
				return FALSE;
		} while (parts < 4);

		if (inptr >= inend || *inptr != ']')
			return FALSE;

		inptr++;
		got_dot = TRUE;
	} else {
		while (inptr < inend) {
			if (!is_domain (*inptr))
				break;

			inptr++;
			while (inptr < inend && is_domain (*inptr))
				inptr++;

			/* consume a dot only when another label follows it within
			 * the buffer, so that a trailing dot is left out */
			if (inend - inptr > 1 && *inptr == '.' && is_domain (inptr[1])) {
				got_dot = TRUE;
				inptr++;
			}
		}
	}

	/* don't allow toplevel domains */
	if (inptr == pos + 1 || !got_dot)
		return FALSE;

	match->um_eo = (inptr - in);

	return TRUE;
}
/*
 * Start callback for file URLs. The URL begins right where the pattern
 * was found, so the offset of @pos within @in is stored in the um_so
 * member of @match. Always returns TRUE; @inend is not used.
 */
gboolean
camel_url_file_start (const gchar *in,
                      const gchar *pos,
                      const gchar *inend,
                      CamelUrlMatch *match)
{
	match->um_so = pos - in;

	return TRUE;
}
/*
 * End callback for file URLs. Delegates to camel_url_pattern_end() with
 * FILE_PATTERN and with the stripping of trailing punctuation turned off.
 * Returns what that function returns.
 */
gboolean
camel_url_file_end (const gchar *in,
                    const gchar *pos,
                    const gchar *inend,
                    CamelUrlMatch *match)
{
	const gboolean strip_trailing = FALSE;

	return camel_url_pattern_end (in, pos, inend, match, FILE_PATTERN, strip_trailing);
}
/*
 * Start callback for web URLs.
 *
 * @in: beginning of the scanned buffer
 * @pos: where the pattern was found
 * @inend: end of the scanned buffer (not used here)
 * @match: receives the offset of @pos, relative to @in, in its um_so member
 *
 * A candidate beginning with "www" which is not at the very beginning of
 * the buffer is accepted only when it is preceded by whitespace or by an
 * opening brace. Any other candidate is accepted as is.
 *
 * Returns TRUE when the candidate is accepted, FALSE otherwise; @match is
 * left untouched in the latter case.
 */
gboolean
camel_url_web_start (const gchar *in,
                     const gchar *pos,
                     const gchar *inend,
                     CamelUrlMatch *match)
{
	if (pos > in && !strncmp (pos, "www", 3)) {
		/* make sure we aren't actually part of another word; the cast
		 * is needed, because isspace() is undefined for negative
		 * values, which bytes of multi-byte UTF-8 characters are
		 * where gchar is signed */
		if (!is_open_brace (pos[-1]) && !isspace ((guchar) pos[-1]))
			return FALSE;
	}

	match->um_so = (pos - in);

	return TRUE;
}
/*
 * End callback for web URLs. Delegates to camel_url_pattern_end() with
 * URL_PATTERN and with the stripping of trailing punctuation turned on.
 * Returns what that function returns.
 */
gboolean
camel_url_web_end (const gchar *in,
                   const gchar *pos,
                   const gchar *inend,
                   CamelUrlMatch *match)
{
	const gboolean strip_trailing = TRUE;

	return camel_url_pattern_end (in, pos, inend, match, URL_PATTERN, strip_trailing);
}
#ifdef BUILD_TABLE
/* Character sets used by url_scanner_table_init() to set the IS_LWSP and
 * IS_SPECIAL flags. The original note here said "got these from rfc1738".
 * NOTE(review): CHARS_SPECIAL looks like the "specials" of rfc0822, which
 * the comment in url_scanner_table_init() refers to as well - confirm. */
#define CHARS_LWSP " \t\n\r" /* linear whitespace chars */
#define CHARS_SPECIAL "()<>@,;:\\\".[]"
/* Punctuation which gets the IS_URLSAFE flag; got these from rfc1738 */
#define CHARS_URLSAFE "$-_.+!*'(),{}|\\^~[]`#%\";/?:@&="
/* ORs @mask into the url_scanner_table[] entry of every byte of the
 * NUL-terminated string @vals. */
static void
table_init_bits (guint mask,
                 const guchar *vals)
{
	const guchar *cur;

	for (cur = vals; *cur != '\0'; cur++)
		url_scanner_table[*cur] |= mask;
}
static void
url_scanner_table_init (void)
{
gint i;
for (i = 0; i < 256; i++) {
url_scanner_table[i] = 0;
if (i < 32)
url_scanner_table[i] |= IS_CTRL;
if ((i >= '0' && i <= '9'))
url_scanner_table[i] |= IS_DIGIT | IS_DOMAIN;
if ((i >= 'a' && i <= 'z') || (i >= 'A' && i <= 'Z'))
url_scanner_table[i] |= IS_ALPHA | IS_DOMAIN;
if (i >= 127)
url_scanner_table[i] |= IS_CTRL;
}
url_scanner_table[' '] |= IS_SPACE;
url_scanner_table['-'] |= IS_DOMAIN;
/* not defined to be special in rfc0822, but when scanning
* backwards to find the beginning of the email address we do
* not want to include this gchar if we come accross it - so
* this is kind of a hack */
url_scanner_table['/'] |= IS_SPECIAL;
table_init_bits (IS_LWSP, CHARS_LWSP);
table_init_bits (IS_SPECIAL, CHARS_SPECIAL);
table_init_bits (IS_URLSAFE, CHARS_URLSAFE);
}
/* Generator entry point, compiled only with BUILD_TABLE defined: computes
 * the table and prints it to the standard output as the C initializer of
 * url_scanner_table[], sixteen values per line. */
gint main (gint argc, gchar **argv)
{
	gint idx;

	url_scanner_table_init ();

	printf ("static guchar url_scanner_table[256] = {");

	for (idx = 0; idx < 256; idx++) {
		/* begin a new indented line before every sixteenth value */
		const gchar *lead = (idx % 16) ? "" : "\n\t";
		/* the last value is followed by a newline, not by a comma */
		const gchar *trail = (idx != 255) ? "," : "\n";

		printf ("%s%3d%s", lead, url_scanner_table[idx], trail);
	}

	printf ("};\n\n");

	return 0;
}
#endif /* BUILD_TABLE */