summaryrefslogtreecommitdiff
path: root/subversion/svn/similarity.c
blob: 0bcf0f559bf5080c4b2583421345d2659f727224 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
/*
 * similarity.c: Utility functions for finding similar strings in lists
 *
 * ====================================================================
 *    Licensed to the Apache Software Foundation (ASF) under one
 *    or more contributor license agreements.  See the NOTICE file
 *    distributed with this work for additional information
 *    regarding copyright ownership.  The ASF licenses this file
 *    to you under the Apache License, Version 2.0 (the
 *    "License"); you may not use this file except in compliance
 *    with the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 *    Unless required by applicable law or agreed to in writing,
 *    software distributed under the License is distributed on an
 *    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 *    KIND, either express or implied.  See the License for the
 *    specific language governing permissions and limitations
 *    under the License.
 * ====================================================================
 */

/* ==================================================================== */



/*** Includes. ***/

#include <stdlib.h>

#include "svn_string.h"
#include "cl.h"

#include "private/svn_string_private.h"

#include "svn_private_config.h"


/* Context for token similarity checking */
struct svn_cl__simcheck_context_t
{
  svn_string_t key;             /* The token we're comparing with */
  svn_membuf_t buffer;          /* Buffer for similarity testing */
};


/* Similarity test between two property names */
static APR_INLINE apr_size_t
simcheck_key_diff(const svn_string_t *key, const svn_string_t *ctx,
                 svn_membuf_t *buffer, apr_size_t *diff)
{
  apr_size_t lcs;
  const apr_size_t score = svn_string__similarity(key, ctx, buffer, &lcs);
  if (key->len > ctx->len)
    *diff = key->len - lcs;
  else
    *diff = ctx->len - lcs;
  return score;
}


/* Key comparator for qsort for svn_cl__simcheck_t */
static int
simcheck_compare(const void *pkeya, const void *pkeyb)
{
  svn_cl__simcheck_t *const keya = *(svn_cl__simcheck_t *const *)pkeya;
  svn_cl__simcheck_t *const keyb = *(svn_cl__simcheck_t *const *)pkeyb;
  svn_cl__simcheck_context_t *const context = keya->context;

  if (keya->score == -1)
    keya->score = simcheck_key_diff(&keya->token, &context->key,
                                    &context->buffer, &keya->diff);
  if (keyb->score == -1)
    keyb->score = simcheck_key_diff(&keyb->token, &context->key,
                                    &context->buffer, &keyb->diff);

  return (keya->score < keyb->score ? 1
          : (keya->score > keyb->score ? -1
             : (keya->diff > keyb->diff ? 1
                : (keya->diff < keyb->diff ? -1 : 0))));
}

apr_size_t
svn_cl__similarity_check(const char *key,
                         svn_cl__simcheck_t **tokens,
                         apr_size_t token_count,
                         apr_pool_t *scratch_pool)
{
  apr_size_t result;
  apr_size_t i;

  svn_cl__simcheck_context_t context;
  context.key.data = key;
  context.key.len = strlen(key);
  svn_membuf__create(&context.buffer, 0, scratch_pool);

  /* Populate the score, diff and context members. */
  for (i = 0; i < token_count; ++i)
    {
      svn_cl__simcheck_t *const token = tokens[i];
      token->score = -1;
      token->diff = 0;
      token->context = &context;
    }

  /* Sort the tokens by similarity. */
  qsort(tokens, token_count, sizeof(*tokens), simcheck_compare);

  /* Remove references to the context, since it points to the stack,
     and calculate the number of results that are at least two-thirds
     similar to the key. */
  for (i = 0, result = 1; i < token_count; ++i)
    {
      svn_cl__simcheck_t *const token = tokens[i];
      token->context = NULL;
      /* If you update this factor, consider updating
       * ../libsvn_subr/cmdline.c:most_similar(). */
      if (token->score >= (2 * SVN_STRING__SIM_RANGE_MAX + 1) / 3)
        ++result;
    }

  if (0 == tokens[0]->diff)
    return 0;                   /* We found an exact match. */
  return result;
}