summaryrefslogtreecommitdiff
path: root/gcc/spellcheck.c
blob: e641a56bc023ae37d538172b46115957dc9abb44 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
/* Find near-matches for strings.
   Copyright (C) 2015-2016 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "tree.h"
#include "spellcheck.h"

/* The Levenshtein distance is an "edit-distance": the minimal
   number of one-character insertions, removals or substitutions
   that are needed to change one string into another.

   This implementation uses the Wagner-Fischer algorithm.  */

edit_distance_t
levenshtein_distance (const char *s, int len_s,
		      const char *t, int len_t)
{
  const bool debug = false;

  if (debug)
    {
      printf ("s: \"%s\" (len_s=%i)\n", s, len_s);
      printf ("t: \"%s\" (len_t=%i)\n", t, len_t);
    }

  if (len_s == 0)
    return len_t;
  if (len_t == 0)
    return len_s;

  /* We effectively build a matrix where each (i, j) contains the
     Levenshtein distance between the prefix strings s[0:j]
     and t[0:i].
     Rather than actually build an (len_t + 1) * (len_s + 1) matrix,
     we simply keep track of the last row, v0 and a new row, v1,
     which avoids an (len_t + 1) * (len_s + 1) allocation and memory accesses
     in favor of two (len_s + 1) allocations.  These could potentially be
     statically-allocated if we impose a maximum length on the
     strings of interest.  */
  edit_distance_t *v0 = new edit_distance_t[len_s + 1];
  edit_distance_t *v1 = new edit_distance_t[len_s + 1];

  /* The first row is for the case of an empty target string, which
     we can reach by deleting every character in the source string.  */
  for (int i = 0; i < len_s + 1; i++)
    v0[i] = i;

  /* Build successive rows.  */
  for (int i = 0; i < len_t; i++)
    {
      if (debug)
	{
	  printf ("i:%i v0 = ", i);
	  for (int j = 0; j < len_s + 1; j++)
	    printf ("%i ", v0[j]);
	  printf ("\n");
	}

      /* The initial column is for the case of an empty source string; we
	 can reach prefixes of the target string of length i
	 by inserting i characters.  */
      v1[0] = i + 1;

      /* Build the rest of the row by considering neighbours to
	 the north, west and northwest.  */
      for (int j = 0; j < len_s; j++)
	{
	  edit_distance_t cost = (s[j] == t[i] ? 0 : 1);
	  edit_distance_t deletion     = v1[j] + 1;
	  edit_distance_t insertion    = v0[j + 1] + 1;
	  edit_distance_t substitution = v0[j] + cost;
	  edit_distance_t cheapest = MIN (deletion, insertion);
	  cheapest = MIN (cheapest, substitution);
	  v1[j + 1] = cheapest;
	}

      /* Prepare to move on to next row.  */
      for (int j = 0; j < len_s + 1; j++)
	v0[j] = v1[j];
    }

  if (debug)
    {
      printf ("final v1 = ");
      for (int j = 0; j < len_s + 1; j++)
	printf ("%i ", v1[j]);
      printf ("\n");
    }

  edit_distance_t result = v1[len_s];
  delete[] v0;
  delete[] v1;
  return result;
}

/* Calculate Levenshtein distance between two nil-terminated strings.  */

edit_distance_t
levenshtein_distance (const char *s, const char *t)
{
  return levenshtein_distance (s, strlen (s), t, strlen (t));
}