Mercurial > hg > octave-kai > gnulib-hg
annotate lib/fstrcmp.c @ 17463:203c036eb0c6
bootstrap: support checksum utils without a --status option
* build-aux/bootstrap: Only look for sha1sum if updating po files.
Add sha1 to the list of supported checksum utils since it's now
supported through adjustments below.
(update_po_files): Remove the use of --status
in a way that will suppress all error messages, but since this is
only used to minimize updates, it shouldn't cause an issue.
Exit early if there is a problem updating the po file checksums.
(find_tool): Remove the check for --version support as this
is optional as per commit 86186b17. Don't even check for the
presence of the command as if that is needed, it's supported
through configuring prerequisites in bootstrap.conf.
Prompt that when a tool isn't found, one can define an environment
variable to add to the hardcoded search list.
author | Pádraig Brady <P@draigBrady.com> |
---|---|
date | Thu, 08 Aug 2013 11:08:49 +0100 |
parents | e542fd46ad6f |
children |
rev | line source |
---|---|
9151 | 1 /* Functions to make fuzzy comparisons between strings |
17249
e542fd46ad6f
maint: update all copyright year number ranges
Eric Blake <eblake@redhat.com>
parents:
16201
diff
changeset
|
2 Copyright (C) 1988-1989, 1992-1993, 1995, 2001-2003, 2006, 2008-2013 Free |
12518
b5e42ef33b49
update nearly all FSF copyright year lists to include 2009
Jim Meyering <meyering@redhat.com>
parents:
12421
diff
changeset
|
3 Software Foundation, Inc. |
9151 | 4 |
9309
bbbbbf4cd1c5
Change copyright notice from GPLv2+ to GPLv3+.
Bruno Haible <bruno@clisp.org>
parents:
9151
diff
changeset
|
5 This program is free software: you can redistribute it and/or modify |
9151 | 6 it under the terms of the GNU General Public License as published by |
9309
bbbbbf4cd1c5
Change copyright notice from GPLv2+ to GPLv3+.
Bruno Haible <bruno@clisp.org>
parents:
9151
diff
changeset
|
7 the Free Software Foundation; either version 3 of the License, or |
bbbbbf4cd1c5
Change copyright notice from GPLv2+ to GPLv3+.
Bruno Haible <bruno@clisp.org>
parents:
9151
diff
changeset
|
8 (at your option) any later version. |
9151 | 9 |
10 This program is distributed in the hope that it will be useful, | |
11 but WITHOUT ANY WARRANTY; without even the implied warranty of | |
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
13 GNU General Public License for more details. | |
14 | |
15 You should have received a copy of the GNU General Public License | |
9309
bbbbbf4cd1c5
Change copyright notice from GPLv2+ to GPLv3+.
Bruno Haible <bruno@clisp.org>
parents:
9151
diff
changeset
|
16 along with this program. If not, see <http://www.gnu.org/licenses/>. |
9151 | 17 |
18 | |
19 Derived from GNU diff 2.7, analyze.c et al. | |
20 | |
21 The basic idea is to consider two vectors as similar if, when | |
22 transforming the first vector into the second vector through a | |
23 sequence of edits (inserts and deletes of one element each), | |
24 this sequence is short - or equivalently, if the ordered list | |
25 of elements that are untouched by these edits is long. For a | |
26 good introduction to the subject, read about the "Levenshtein | |
27 distance" in Wikipedia. | |
28 | |
29 The basic algorithm is described in: | |
30 "An O(ND) Difference Algorithm and its Variations", Eugene Myers, | |
31 Algorithmica Vol. 1 No. 2, 1986, pp. 251-266; | |
32 see especially section 4.2, which describes the variation used below. | |
33 | |
34 The basic algorithm was independently discovered as described in: | |
35 "Algorithms for Approximate String Matching", E. Ukkonen, | |
36 Information and Control Vol. 64, 1985, pp. 100-118. | |
37 | |
38 Unless the 'find_minimal' flag is set, this code uses the TOO_EXPENSIVE | |
39 heuristic, by Paul Eggert, to limit the cost to O(N**1.5 log N) | |
40 at the price of producing suboptimal output for large inputs with | |
41 many differences. */ | |
42 | |
43 #include <config.h> | |
44 | |
45 /* Specification. */ | |
46 #include "fstrcmp.h" | |
47 | |
48 #include <string.h> | |
49 #include <stdbool.h> | |
50 #include <stdio.h> | |
51 #include <stdlib.h> | |
52 #include <limits.h> | |
53 | |
10318
8a3539888308
Move the lock and tls source files into a subdirectory.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
54 #include "glthread/lock.h" |
8a3539888308
Move the lock and tls source files into a subdirectory.
Bruno Haible <bruno@clisp.org>
parents:
9309
diff
changeset
|
55 #include "glthread/tls.h" |
9151 | 56 #include "minmax.h" |
57 #include "xalloc.h" | |
58 | |
59 #ifndef uintptr_t | |
60 # define uintptr_t unsigned long | |
61 #endif | |
62 | |
63 | |
64 #define ELEMENT char | |
65 #define EQUAL(x,y) ((x) == (y)) | |
66 #define OFFSET int | |
67 #define EXTRA_CONTEXT_FIELDS \ | |
10439
95f2b5d880cb
Abort the fstrcmp computation early when a lower bound has been given.
Ralf Wildenhues <Ralf.Wildenhues@gmx.de>
parents:
10438
diff
changeset
|
68 /* The number of edits beyond which the computation can be aborted. */ \ |
95f2b5d880cb
Abort the fstrcmp computation early when a lower bound has been given.
Ralf Wildenhues <Ralf.Wildenhues@gmx.de>
parents:
10438
diff
changeset
|
69 int edit_count_limit; \ |
95f2b5d880cb
Abort the fstrcmp computation early when a lower bound has been given.
Ralf Wildenhues <Ralf.Wildenhues@gmx.de>
parents:
10438
diff
changeset
|
70 /* The number of edits (= number of elements inserted, plus the number of \ |
95f2b5d880cb
Abort the fstrcmp computation early when a lower bound has been given.
Ralf Wildenhues <Ralf.Wildenhues@gmx.de>
parents:
10438
diff
changeset
|
71 elements deleted), temporarily minus edit_count_limit. */ \ |
10438
1a57c9d644f2
Simplify result computation.
Bruno Haible <bruno@clisp.org>
parents:
10437
diff
changeset
|
72 int edit_count; |
1a57c9d644f2
Simplify result computation.
Bruno Haible <bruno@clisp.org>
parents:
10437
diff
changeset
|
73 #define NOTE_DELETE(ctxt, xoff) ctxt->edit_count++ |
1a57c9d644f2
Simplify result computation.
Bruno Haible <bruno@clisp.org>
parents:
10437
diff
changeset
|
74 #define NOTE_INSERT(ctxt, yoff) ctxt->edit_count++ |
10439
95f2b5d880cb
Abort the fstrcmp computation early when a lower bound has been given.
Ralf Wildenhues <Ralf.Wildenhues@gmx.de>
parents:
10438
diff
changeset
|
75 #define EARLY_ABORT(ctxt) ctxt->edit_count > 0 |
9151 | 76 /* We don't need USE_HEURISTIC, since it is unlikely in typical uses of |
77 fstrcmp(). */ | |
78 #include "diffseq.h" | |
79 | |
80 | |
81 /* Because fstrcmp is typically called multiple times, attempt to minimize | |
82 the number of memory allocations performed. Thus, let a call reuse the | |
83 memory already allocated by the previous call, if it is sufficient. | |
84 To make it multithread-safe, without need for a lock that protects the | |
85 already allocated memory, store the allocated memory per thread. Free | |
86 it only when the thread exits. */ | |
87 | |
12421
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
88 static gl_tls_key_t buffer_key; /* TLS key for a 'int *' */ |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
89 static gl_tls_key_t bufmax_key; /* TLS key for a 'size_t' */ |
9151 | 90 |
91 static void | |
92 keys_init (void) | |
93 { | |
94 gl_tls_key_init (buffer_key, free); | |
95 gl_tls_key_init (bufmax_key, NULL); | |
96 /* The per-thread initial values are NULL and 0, respectively. */ | |
97 } | |
98 | |
99 /* Ensure that keys_init is called once only. */ | |
100 gl_once_define(static, keys_init_once) | |
101 | |
102 | |
10455
e8f55251d47e
Add data about branch probabilities.
Bruno Haible <bruno@clisp.org>
parents:
10443
diff
changeset
|
103 /* In the code below, branch probabilities were measured by Ralf Wildenhues, |
e8f55251d47e
Add data about branch probabilities.
Bruno Haible <bruno@clisp.org>
parents:
10443
diff
changeset
|
104 by running "msgmerge LL.po coreutils.pot" with msgmerge 0.18 for many |
e8f55251d47e
Add data about branch probabilities.
Bruno Haible <bruno@clisp.org>
parents:
10443
diff
changeset
|
105 values of LL. The probability indicates that the condition evaluates |
e8f55251d47e
Add data about branch probabilities.
Bruno Haible <bruno@clisp.org>
parents:
10443
diff
changeset
|
106 to true; whether that leads to a branch or a non-branch in the code, |
e8f55251d47e
Add data about branch probabilities.
Bruno Haible <bruno@clisp.org>
parents:
10443
diff
changeset
|
107 depends on the compiler's reordering of basic blocks. */ |
e8f55251d47e
Add data about branch probabilities.
Bruno Haible <bruno@clisp.org>
parents:
10443
diff
changeset
|
108 |
e8f55251d47e
Add data about branch probabilities.
Bruno Haible <bruno@clisp.org>
parents:
10443
diff
changeset
|
109 |
9151 | 110 double |
10437
aaa73f947c24
New function fstrcmp_bounded.
Ralf Wildenhues <Ralf.Wildenhues@gmx.de>
parents:
10318
diff
changeset
|
111 fstrcmp_bounded (const char *string1, const char *string2, double lower_bound) |
9151 | 112 { |
113 struct context ctxt; | |
10437
aaa73f947c24
New function fstrcmp_bounded.
Ralf Wildenhues <Ralf.Wildenhues@gmx.de>
parents:
10318
diff
changeset
|
114 int xvec_length = strlen (string1); |
aaa73f947c24
New function fstrcmp_bounded.
Ralf Wildenhues <Ralf.Wildenhues@gmx.de>
parents:
10318
diff
changeset
|
115 int yvec_length = strlen (string2); |
9151 | 116 int i; |
117 | |
118 size_t fdiag_len; | |
119 int *buffer; | |
120 size_t bufmax; | |
121 | |
10437
aaa73f947c24
New function fstrcmp_bounded.
Ralf Wildenhues <Ralf.Wildenhues@gmx.de>
parents:
10318
diff
changeset
|
122 /* short-circuit obvious comparisons */ |
10455
e8f55251d47e
Add data about branch probabilities.
Bruno Haible <bruno@clisp.org>
parents:
10443
diff
changeset
|
123 if (xvec_length == 0 || yvec_length == 0) /* Prob: 1% */ |
10437
aaa73f947c24
New function fstrcmp_bounded.
Ralf Wildenhues <Ralf.Wildenhues@gmx.de>
parents:
10318
diff
changeset
|
124 return (xvec_length == 0 && yvec_length == 0 ? 1.0 : 0.0); |
aaa73f947c24
New function fstrcmp_bounded.
Ralf Wildenhues <Ralf.Wildenhues@gmx.de>
parents:
10318
diff
changeset
|
125 |
aaa73f947c24
New function fstrcmp_bounded.
Ralf Wildenhues <Ralf.Wildenhues@gmx.de>
parents:
10318
diff
changeset
|
126 if (lower_bound > 0) |
aaa73f947c24
New function fstrcmp_bounded.
Ralf Wildenhues <Ralf.Wildenhues@gmx.de>
parents:
10318
diff
changeset
|
127 { |
aaa73f947c24
New function fstrcmp_bounded.
Ralf Wildenhues <Ralf.Wildenhues@gmx.de>
parents:
10318
diff
changeset
|
128 /* Compute a quick upper bound. |
12421
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
129 Each edit is an insertion or deletion of an element, hence modifies |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
130 the length of the sequence by at most 1. |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
131 Therefore, when starting from a sequence X and ending at a sequence Y, |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
132 with N edits, | yvec_length - xvec_length | <= N. (Proof by |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
133 induction over N.) |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
134 So, at the end, we will have |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
135 edit_count >= | xvec_length - yvec_length |. |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
136 and hence |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
137 result |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
138 = (xvec_length + yvec_length - edit_count) |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
139 / (xvec_length + yvec_length) |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
140 <= (xvec_length + yvec_length - | yvec_length - xvec_length |) |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
141 / (xvec_length + yvec_length) |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
142 = 2 * min (xvec_length, yvec_length) / (xvec_length + yvec_length). |
10437
aaa73f947c24
New function fstrcmp_bounded.
Ralf Wildenhues <Ralf.Wildenhues@gmx.de>
parents:
10318
diff
changeset
|
143 */ |
aaa73f947c24
New function fstrcmp_bounded.
Ralf Wildenhues <Ralf.Wildenhues@gmx.de>
parents:
10318
diff
changeset
|
144 volatile double upper_bound = |
12421
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
145 (double) (2 * MIN (xvec_length, yvec_length)) |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
146 / (xvec_length + yvec_length); |
10437
aaa73f947c24
New function fstrcmp_bounded.
Ralf Wildenhues <Ralf.Wildenhues@gmx.de>
parents:
10318
diff
changeset
|
147 |
10455
e8f55251d47e
Add data about branch probabilities.
Bruno Haible <bruno@clisp.org>
parents:
10443
diff
changeset
|
148 if (upper_bound < lower_bound) /* Prob: 74% */ |
12421
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
149 /* Return an arbitrary value < LOWER_BOUND. */ |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
150 return 0.0; |
10443
13d9d9fee7b5
Use a second, less quick upper bound based on character occurrence counts.
Ralf Wildenhues <Ralf.Wildenhues@gmx.de>
parents:
10439
diff
changeset
|
151 |
13d9d9fee7b5
Use a second, less quick upper bound based on character occurrence counts.
Ralf Wildenhues <Ralf.Wildenhues@gmx.de>
parents:
10439
diff
changeset
|
152 #if CHAR_BIT <= 8 |
13d9d9fee7b5
Use a second, less quick upper bound based on character occurrence counts.
Ralf Wildenhues <Ralf.Wildenhues@gmx.de>
parents:
10439
diff
changeset
|
153 /* When X and Y are both small, avoid the overhead of setting up an |
12421
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
154 array of size 256. */ |
10455
e8f55251d47e
Add data about branch probabilities.
Bruno Haible <bruno@clisp.org>
parents:
10443
diff
changeset
|
155 if (xvec_length + yvec_length >= 20) /* Prob: 99% */ |
12421
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
156 { |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
157 /* Compute a less quick upper bound. |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
158 Each edit is an insertion or deletion of a character, hence |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
159 modifies the occurrence count of a character by 1 and leaves the |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
160 other occurrence counts unchanged. |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
161 Therefore, when starting from a sequence X and ending at a |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
162 sequence Y, and denoting the occurrence count of C in X with |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
163 OCC (X, C), with N edits, |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
164 sum_C | OCC (X, C) - OCC (Y, C) | <= N. |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
165 (Proof by induction over N.) |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
166 So, at the end, we will have |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
167 edit_count >= sum_C | OCC (X, C) - OCC (Y, C) |, |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
168 and hence |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
169 result |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
170 = (xvec_length + yvec_length - edit_count) |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
171 / (xvec_length + yvec_length) |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
172 <= (xvec_length + yvec_length - sum_C | OCC(X,C) - OCC(Y,C) |) |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
173 / (xvec_length + yvec_length). |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
174 */ |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
175 int occ_diff[UCHAR_MAX + 1]; /* array C -> OCC(X,C) - OCC(Y,C) */ |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
176 int sum; |
10443
13d9d9fee7b5
Use a second, less quick upper bound based on character occurrence counts.
Ralf Wildenhues <Ralf.Wildenhues@gmx.de>
parents:
10439
diff
changeset
|
177 |
12421
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
178 /* Determine the occurrence counts in X. */ |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
179 memset (occ_diff, 0, sizeof (occ_diff)); |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
180 for (i = xvec_length - 1; i >= 0; i--) |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
181 occ_diff[(unsigned char) string1[i]]++; |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
182 /* Subtract the occurrence counts in Y. */ |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
183 for (i = yvec_length - 1; i >= 0; i--) |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
184 occ_diff[(unsigned char) string2[i]]--; |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
185 /* Sum up the absolute values. */ |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
186 sum = 0; |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
187 for (i = 0; i <= UCHAR_MAX; i++) |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
188 { |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
189 int d = occ_diff[i]; |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
190 sum += (d >= 0 ? d : -d); |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
191 } |
10443
13d9d9fee7b5
Use a second, less quick upper bound based on character occurrence counts.
Ralf Wildenhues <Ralf.Wildenhues@gmx.de>
parents:
10439
diff
changeset
|
192 |
12421
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
193 upper_bound = 1.0 - (double) sum / (xvec_length + yvec_length); |
10443
13d9d9fee7b5
Use a second, less quick upper bound based on character occurrence counts.
Ralf Wildenhues <Ralf.Wildenhues@gmx.de>
parents:
10439
diff
changeset
|
194 |
12421
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
195 if (upper_bound < lower_bound) /* Prob: 66% */ |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
196 /* Return an arbitrary value < LOWER_BOUND. */ |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
197 return 0.0; |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
198 } |
10443
13d9d9fee7b5
Use a second, less quick upper bound based on character occurrence counts.
Ralf Wildenhues <Ralf.Wildenhues@gmx.de>
parents:
10439
diff
changeset
|
199 #endif |
10437
aaa73f947c24
New function fstrcmp_bounded.
Ralf Wildenhues <Ralf.Wildenhues@gmx.de>
parents:
10318
diff
changeset
|
200 } |
aaa73f947c24
New function fstrcmp_bounded.
Ralf Wildenhues <Ralf.Wildenhues@gmx.de>
parents:
10318
diff
changeset
|
201 |
9151 | 202 /* set the info for each string. */ |
203 ctxt.xvec = string1; | |
204 ctxt.yvec = string2; | |
205 | |
206 /* Set TOO_EXPENSIVE to be approximate square root of input size, | |
207 bounded below by 256. */ | |
208 ctxt.too_expensive = 1; | |
209 for (i = xvec_length + yvec_length; | |
210 i != 0; | |
211 i >>= 2) | |
212 ctxt.too_expensive <<= 1; | |
213 if (ctxt.too_expensive < 256) | |
214 ctxt.too_expensive = 256; | |
215 | |
216 /* Allocate memory for fdiag and bdiag from a thread-local pool. */ | |
217 fdiag_len = xvec_length + yvec_length + 3; | |
218 gl_once (keys_init_once, keys_init); | |
219 buffer = (int *) gl_tls_get (buffer_key); | |
220 bufmax = (size_t) (uintptr_t) gl_tls_get (bufmax_key); | |
221 if (fdiag_len > bufmax) | |
222 { | |
223 /* Need more memory. */ | |
224 bufmax = 2 * bufmax; | |
225 if (fdiag_len > bufmax) | |
12421
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
226 bufmax = fdiag_len; |
9151 | 227 /* Calling xrealloc would be a waste: buffer's contents does not need |
12421
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
228 to be preserved. */ |
9151 | 229 if (buffer != NULL) |
12421
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
230 free (buffer); |
9151 | 231 buffer = (int *) xnmalloc (bufmax, 2 * sizeof (int)); |
232 gl_tls_set (buffer_key, buffer); | |
233 gl_tls_set (bufmax_key, (void *) (uintptr_t) bufmax); | |
234 } | |
235 ctxt.fdiag = buffer + yvec_length + 1; | |
236 ctxt.bdiag = ctxt.fdiag + fdiag_len; | |
237 | |
10439
95f2b5d880cb
Abort the fstrcmp computation early when a lower bound has been given.
Ralf Wildenhues <Ralf.Wildenhues@gmx.de>
parents:
10438
diff
changeset
|
238 /* The edit_count is only ever increased. The computation can be aborted |
95f2b5d880cb
Abort the fstrcmp computation early when a lower bound has been given.
Ralf Wildenhues <Ralf.Wildenhues@gmx.de>
parents:
10438
diff
changeset
|
239 when |
95f2b5d880cb
Abort the fstrcmp computation early when a lower bound has been given.
Ralf Wildenhues <Ralf.Wildenhues@gmx.de>
parents:
10438
diff
changeset
|
240 (xvec_length + yvec_length - edit_count) / (xvec_length + yvec_length) |
95f2b5d880cb
Abort the fstrcmp computation early when a lower bound has been given.
Ralf Wildenhues <Ralf.Wildenhues@gmx.de>
parents:
10438
diff
changeset
|
241 < lower_bound, |
95f2b5d880cb
Abort the fstrcmp computation early when a lower bound has been given.
Ralf Wildenhues <Ralf.Wildenhues@gmx.de>
parents:
10438
diff
changeset
|
242 or equivalently |
95f2b5d880cb
Abort the fstrcmp computation early when a lower bound has been given.
Ralf Wildenhues <Ralf.Wildenhues@gmx.de>
parents:
10438
diff
changeset
|
243 edit_count > (xvec_length + yvec_length) * (1 - lower_bound) |
95f2b5d880cb
Abort the fstrcmp computation early when a lower bound has been given.
Ralf Wildenhues <Ralf.Wildenhues@gmx.de>
parents:
10438
diff
changeset
|
244 or equivalently |
95f2b5d880cb
Abort the fstrcmp computation early when a lower bound has been given.
Ralf Wildenhues <Ralf.Wildenhues@gmx.de>
parents:
10438
diff
changeset
|
245 edit_count > floor((xvec_length + yvec_length) * (1 - lower_bound)). |
95f2b5d880cb
Abort the fstrcmp computation early when a lower bound has been given.
Ralf Wildenhues <Ralf.Wildenhues@gmx.de>
parents:
10438
diff
changeset
|
246 We need to add an epsilon inside the floor(...) argument, to neutralize |
95f2b5d880cb
Abort the fstrcmp computation early when a lower bound has been given.
Ralf Wildenhues <Ralf.Wildenhues@gmx.de>
parents:
10438
diff
changeset
|
247 rounding errors. */ |
95f2b5d880cb
Abort the fstrcmp computation early when a lower bound has been given.
Ralf Wildenhues <Ralf.Wildenhues@gmx.de>
parents:
10438
diff
changeset
|
248 ctxt.edit_count_limit = |
95f2b5d880cb
Abort the fstrcmp computation early when a lower bound has been given.
Ralf Wildenhues <Ralf.Wildenhues@gmx.de>
parents:
10438
diff
changeset
|
249 (lower_bound < 1.0 |
95f2b5d880cb
Abort the fstrcmp computation early when a lower bound has been given.
Ralf Wildenhues <Ralf.Wildenhues@gmx.de>
parents:
10438
diff
changeset
|
250 ? (int) ((xvec_length + yvec_length) * (1.0 - lower_bound + 0.000001)) |
95f2b5d880cb
Abort the fstrcmp computation early when a lower bound has been given.
Ralf Wildenhues <Ralf.Wildenhues@gmx.de>
parents:
10438
diff
changeset
|
251 : 0); |
95f2b5d880cb
Abort the fstrcmp computation early when a lower bound has been given.
Ralf Wildenhues <Ralf.Wildenhues@gmx.de>
parents:
10438
diff
changeset
|
252 |
9151 | 253 /* Now do the main comparison algorithm */ |
10439
95f2b5d880cb
Abort the fstrcmp computation early when a lower bound has been given.
Ralf Wildenhues <Ralf.Wildenhues@gmx.de>
parents:
10438
diff
changeset
|
254 ctxt.edit_count = - ctxt.edit_count_limit; |
10455
e8f55251d47e
Add data about branch probabilities.
Bruno Haible <bruno@clisp.org>
parents:
10443
diff
changeset
|
255 if (compareseq (0, xvec_length, 0, yvec_length, 0, &ctxt)) /* Prob: 98% */ |
10439
95f2b5d880cb
Abort the fstrcmp computation early when a lower bound has been given.
Ralf Wildenhues <Ralf.Wildenhues@gmx.de>
parents:
10438
diff
changeset
|
256 /* The edit_count passed the limit. Hence the result would be |
95f2b5d880cb
Abort the fstrcmp computation early when a lower bound has been given.
Ralf Wildenhues <Ralf.Wildenhues@gmx.de>
parents:
10438
diff
changeset
|
257 < lower_bound. We can return any value < lower_bound instead. */ |
95f2b5d880cb
Abort the fstrcmp computation early when a lower bound has been given.
Ralf Wildenhues <Ralf.Wildenhues@gmx.de>
parents:
10438
diff
changeset
|
258 return 0.0; |
95f2b5d880cb
Abort the fstrcmp computation early when a lower bound has been given.
Ralf Wildenhues <Ralf.Wildenhues@gmx.de>
parents:
10438
diff
changeset
|
259 ctxt.edit_count += ctxt.edit_count_limit; |
9151 | 260 |
261 /* The result is | |
12421
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
262 ((number of chars in common) / (average length of the strings)). |
10438
1a57c9d644f2
Simplify result computation.
Bruno Haible <bruno@clisp.org>
parents:
10437
diff
changeset
|
263 The numerator is |
12421
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
264 = xvec_length - (number of calls to NOTE_DELETE) |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
265 = yvec_length - (number of calls to NOTE_INSERT) |
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
266 = 1/2 * (xvec_length + yvec_length - (number of edits)). |
9151 | 267 This is admittedly biased towards finding that the strings are |
268 similar, however it does produce meaningful results. */ | |
10438
1a57c9d644f2
Simplify result computation.
Bruno Haible <bruno@clisp.org>
parents:
10437
diff
changeset
|
269 return ((double) (xvec_length + yvec_length - ctxt.edit_count) |
12421
e8d2c6fc33ad
Use spaces for indentation, not tabs.
Bruno Haible <bruno@clisp.org>
parents:
10455
diff
changeset
|
270 / (xvec_length + yvec_length)); |
9151 | 271 } |