diff options
author | mvglasow <michael -at- vonglasow.com> | 2017-12-02 17:08:05 +0100 |
---|---|---|
committer | mvglasow <michael -at- vonglasow.com> | 2017-12-02 17:08:05 +0100 |
commit | 6db567eae5e301314e2f612b285f51959eafabe4 (patch) | |
tree | d3ce1d787f8e57cc919439c4851bf78633ee2d95 /navit/util.c | |
parent | 3d9652a18f4477fcff6c4e408ff6cd808bc2227b (diff) | |
download | navit-6db567eae5e301314e2f612b285f51959eafabe4.tar.gz |
Add:util:compare_name_systematic()
Signed-off-by: mvglasow <michael -at- vonglasow.com>
Diffstat (limited to 'navit/util.c')
-rw-r--r-- | navit/util.c | 195 |
1 files changed, 195 insertions, 0 deletions
diff --git a/navit/util.c b/navit/util.c index 79c92379e..e5172435d 100644 --- a/navit/util.c +++ b/navit/util.c @@ -68,6 +68,201 @@ navit_utf8_strcasecmp(const char *s1, const char *s2) return cmpres; } +/** + * @brief Trims all leading and trailing whitespace characters from a string. + * + * Whitespace characters are all up to and including 0x20. + * + * This function operates in-place, i.e. `s` will be modified. + * + * @param s The string to trim + */ +void strtrim(char *s) { + char *tmp = g_strdup(s); + char *in = tmp; + while (strlen(in) && (in[0] <= 0x20)) + in++; + while (strlen(in) && (in[strlen(in) - 1] <= 0x20)) + in[strlen(in) - 1] = 0; + strcpy(s, in); + g_free(tmp); +} + +/** + * @brief Parser states for `parse_for_systematic_comparison()`. + */ +enum parse_state { + parse_state_whitespace, + parse_state_numeric, + parse_state_alpha, +}; + +/** + * @brief Parses a string for systematic comparison. + * + * This is a helper function for `compare_name_systematic()`. + * + * The string is broken down into numeric and non-numeric parts. Whitespace characters are discarded + * unless they are surrounded by string characters, in which case the whole unit is treated as one + * string part. All strings are converted to lowercase and leading zeroes stripped from numbers. + * + * @param s The string to parse + * + * @return A buffer containing the parsed string, parts delimited by a null character, the last part + * followed by a double null character. + */ +char * parse_for_systematic_comparison(const char *s) { + char *ret = g_malloc0(strlen(s) * 2 + 1); + char *in = s; + char *out = ret; + char *part; + enum parse_state state = parse_state_whitespace; + int i = 0; + char c; + + dbg(lvl_debug, "enter\n"); + + // TODO convert strings to lowercase + + while (i < strlen(in)) { + c = in[i]; + if (c <= 0x20) { + /* whitespace */ + if (state == parse_state_numeric) { + part = g_malloc0(i + 1); + strncpy(part, in, i); + sprintf(part, "%d", atoi(part)); + strcpy(out, part); + out += strlen(part) + 1; + dbg(lvl_debug, "part='%s'\n", part); + g_free(part); + in += i; + i = 1; + state = parse_state_whitespace; + } else + i++; + } else if ((c >= '0') && (c <= '9')) { + /* numeric */ + if (state == parse_state_alpha) { + part = g_malloc0(i + 1); + strncpy(part, in, i); + strtrim(part); + strcpy(out, part); + out += strlen(part) + 1; + dbg(lvl_debug, "part='%s'\n", part); + g_free(part); + in += i; + i = 1; + } else + i++; + state = parse_state_numeric; + } else { + /* alpha */ + if (state == parse_state_numeric) { + part = g_malloc0(i + 1); + strncpy(part, in, i); + sprintf(part, "%d", atoi(part)); + strcpy(out, part); + out += strlen(part) + 1; + dbg(lvl_debug, "part='%s'\n", part); + g_free(part); + in += i; + i = 1; + } else + i++; + state = parse_state_alpha; + } + } + + if (strlen(in) > 0) { + if (state == parse_state_numeric) { + part = g_malloc0(strlen(in) + 1); + strcpy(part, in); + sprintf(part, "%d", atoi(part)); + strcpy(out, part); + dbg(lvl_debug, "part='%s'\n", part); + g_free(part); + } else if (state == parse_state_alpha) { + part = g_malloc0(strlen(in) + 1); + strcpy(part, in); + strtrim(part); + strcpy(out, part); + dbg(lvl_debug, "part='%s'\n", part); + g_free(part); + } + } + + return ret; +} + +/** + * @brief Compares two name_systematic strings. + * + * A name_systematic string is typically used for road reference numbers (A 4, I-51, SP526). This + * function performs a fuzzy comparison: Each string is broken down into numeric and non-numeric parts. + * Then both strings are compared part by part. The following rules apply: + * + * \li Whitespace bordering on a number is discarded. + * \li Whitespace surrounded by string characters is treated as one string with the surrounding characters. + * \li If one string has more parts than the other, the shorter string is padded with null parts. + * \li null is less than non-null. + * \li null equals null. + * \li A numeric part is less than a string part. + * \li Numeric parts are compared as integers. + * \li String parts are compared as strings. Comparison is case-insensitive. + * + * @param s1 The first string + * @param s2 The second string + * + * @return 0 if both strings match, a negative value if `s1<s2`, a positive value if `s1>s2`. + */ +int compare_name_systematic(const char *s1, const char *s2) { + int ret = 0; + int i; + char *l, *r, *l0, *r0; + + if (!s1 || !s1[0]) { + if (!s2 || !s2[0]) + return 0; + else + return 1; + } else if (!s2 || !s2[0]) + return -1; + + l0 = parse_for_systematic_comparison(s1); + r0 = parse_for_systematic_comparison(s2); + + l = l0; + r = r0; + + while (!ret && l[0] && r[0]) { + if (atoi(l) || (l[0] == '0')) { + if (atoi(r) || (r[0] == '0')) + ret = atoi(l) - atoi(r); + else + ret = -1; + } else { + if (atoi(r) || (r[0] == '0')) + ret = 1; + else + ret = strcmp(l, r); + } + + l += strlen(l) + 1; + r += strlen(r) + 1; + } + + g_free(l0); + g_free(r0); + + if (!ret) + ret = l[0] - r[0]; + + dbg(lvl_debug, "'%s' %s '%s'\n", s1, ret?"does NOT match":"matches", s2); + + return ret; +} + static void hash_callback(gpointer key, gpointer value, gpointer user_data) { |