1 /* SPDX-License-Identifier: BSD-3-Clause */
2 /*
3 * Copyright (c) 1994-2009 Red Hat, Inc.
4 * All rights reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions are met:
8 *
9 * 1. Redistributions of source code must retain the above copyright notice,
10 * this list of conditions and the following disclaimer.
11 *
12 * 2. Redistributions in binary form must reproduce the above copyright notice,
13 * this list of conditions and the following disclaimer in the documentation
14 * and/or other materials provided with the distribution.
15 *
16 * 3. Neither the name of the copyright holder nor the names of its
17 * contributors may be used to endorse or promote products derived from this
18 * software without specific prior written permission.
19 *
20 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
24 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 * POSSIBILITY OF SUCH DAMAGE.
31 */
32
33 /* Byte-wise substring search, using the Two-Way algorithm.
34 * Copyright (C) 2008, 2010 Eric Blake
35 * Permission to use, copy, modify, and distribute this software
36 * is freely granted, provided that this notice is preserved.
37 */
38
39
40 /* Before including this file, you need to include <string.h>, and define:
41 RESULT_TYPE A macro that expands to the return type.
42 AVAILABLE(h, h_l, j, n_l) A macro that returns nonzero if there are
43 at least N_L bytes left starting at
44 H[J]. H is 'unsigned char *', H_L, J,
45 and N_L are 'size_t'; H_L is an
46 lvalue. For NUL-terminated searches,
47 H_L can be modified each iteration to
48 avoid having to compute the end of H
49 up front.
50
51 For case-insensitivity, you may optionally define:
52 CMP_FUNC(p1, p2, l) A macro that returns 0 iff the first L
53 characters of P1 and P2 are equal.
54 CANON_ELEMENT(c) A macro that canonicalizes an element
55 right after it has been fetched from
56 one of the two strings. The argument
57 is an 'unsigned char'; the result must
58 be an 'unsigned char' as well.
59
60 This file undefines the macros documented above, and defines
61 LONG_NEEDLE_THRESHOLD.
62 */
63
64 #include <limits.h>
65 #include <stdint.h>
66
67 /* We use the Two-Way string matching algorithm, which guarantees
68 linear complexity with constant space. Additionally, for long
69 needles, we also use a bad character shift table similar to the
70 Boyer-Moore algorithm to achieve improved (potentially sub-linear)
71 performance.
72
73 See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260
74 and http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm
75 */
76
77 /* Point at which computing a bad-byte shift table is likely to be
78 worthwhile. Small needles should not compute a table, since it
79 adds (1 << CHAR_BIT) + NEEDLE_LEN computations of preparation for a
80 speedup no greater than a factor of NEEDLE_LEN. The larger the
81 needle, the better the potential performance gain. On the other
82 hand, on non-POSIX systems with CHAR_BIT larger than eight, the
83 memory required for the table is prohibitive. */
84 #if CHAR_BIT < 10
85 # define LONG_NEEDLE_THRESHOLD 32U
86 #else
87 # define LONG_NEEDLE_THRESHOLD SIZE_MAX
88 #endif
89
90 #define MAX(a, b) ((a < b) ? (b) : (a))
91
92 #ifndef CANON_ELEMENT
93 # define CANON_ELEMENT(c) c
94 #endif
95 #ifndef CMP_FUNC
96 # define CMP_FUNC memcmp
97 #endif
98
99 /* Perform a critical factorization of NEEDLE, of length NEEDLE_LEN.
100 Return the index of the first byte in the right half, and set
101 *PERIOD to the global period of the right half.
102
103 The global period of a string is the smallest index (possibly its
104 length) at which all remaining bytes in the string are repetitions
105 of the prefix (the last repetition may be a subset of the prefix).
106
107 When NEEDLE is factored into two halves, a local period is the
108 length of the smallest word that shares a suffix with the left half
109 and shares a prefix with the right half. All factorizations of a
110 non-empty NEEDLE have a local period of at least 1 and no greater
111 than NEEDLE_LEN.
112
113 A critical factorization has the property that the local period
114 equals the global period. All strings have at least one critical
115 factorization with the left half smaller than the global period.
116
117 Given an ordered alphabet, a critical factorization can be computed
118 in linear time, with 2 * NEEDLE_LEN comparisons, by computing the
119 larger of two ordered maximal suffixes. The ordered maximal
120 suffixes are determined by lexicographic comparison of
121 periodicity. */
122 static size_t
critical_factorization(const unsigned char * needle,size_t needle_len,size_t * period)123 critical_factorization (const unsigned char *needle, size_t needle_len,
124 size_t *period)
125 {
126 /* Index of last byte of left half, or SIZE_MAX. */
127 size_t max_suffix, max_suffix_rev;
128 size_t j; /* Index into NEEDLE for current candidate suffix. */
129 size_t k; /* Offset into current period. */
130 size_t p; /* Intermediate period. */
131 unsigned char a, b; /* Current comparison bytes. */
132
133 /* Invariants:
134 0 <= j < NEEDLE_LEN - 1
135 -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed)
136 min(max_suffix, max_suffix_rev) < global period of NEEDLE
137 1 <= p <= global period of NEEDLE
138 p == global period of the substring NEEDLE[max_suffix{,_rev}+1...j]
139 1 <= k <= p
140 */
141
142 /* Perform lexicographic search. */
143 max_suffix = SIZE_MAX;
144 j = 0;
145 k = p = 1;
146 while (j + k < needle_len)
147 {
148 a = CANON_ELEMENT (needle[j + k]);
149 b = CANON_ELEMENT (needle[(size_t)(max_suffix + k)]);
150 if (a < b)
151 {
152 /* Suffix is smaller, period is entire prefix so far. */
153 j += k;
154 k = 1;
155 p = j - max_suffix;
156 }
157 else if (a == b)
158 {
159 /* Advance through repetition of the current period. */
160 if (k != p)
161 ++k;
162 else
163 {
164 j += p;
165 k = 1;
166 }
167 }
168 else /* b < a */
169 {
170 /* Suffix is larger, start over from current location. */
171 max_suffix = j++;
172 k = p = 1;
173 }
174 }
175 *period = p;
176
177 /* Perform reverse lexicographic search. */
178 max_suffix_rev = SIZE_MAX;
179 j = 0;
180 k = p = 1;
181 while (j + k < needle_len)
182 {
183 a = CANON_ELEMENT (needle[j + k]);
184 b = CANON_ELEMENT (needle[max_suffix_rev + k]);
185 if (b < a)
186 {
187 /* Suffix is smaller, period is entire prefix so far. */
188 j += k;
189 k = 1;
190 p = j - max_suffix_rev;
191 }
192 else if (a == b)
193 {
194 /* Advance through repetition of the current period. */
195 if (k != p)
196 ++k;
197 else
198 {
199 j += p;
200 k = 1;
201 }
202 }
203 else /* a < b */
204 {
205 /* Suffix is larger, start over from current location. */
206 max_suffix_rev = j++;
207 k = p = 1;
208 }
209 }
210
211 /* Choose the longer suffix. Return the first byte of the right
212 half, rather than the last byte of the left half. */
213 if (max_suffix_rev + 1 < max_suffix + 1)
214 return max_suffix + 1;
215 *period = p;
216 return max_suffix_rev + 1;
217 }
218
219 /* Return the first location of non-empty NEEDLE within HAYSTACK, or
220 NULL. HAYSTACK_LEN is the minimum known length of HAYSTACK. This
221 method is optimized for NEEDLE_LEN < LONG_NEEDLE_THRESHOLD.
222 Performance is guaranteed to be linear, with an initialization cost
223 of 2 * NEEDLE_LEN comparisons.
224
225 If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
226 most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching.
227 If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
228 HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching. */
229 static RETURN_TYPE
two_way_short_needle(const unsigned char * haystack,size_t haystack_len,const unsigned char * needle,size_t needle_len)230 two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
231 const unsigned char *needle, size_t needle_len)
232 {
233 size_t i; /* Index into current byte of NEEDLE. */
234 size_t j; /* Index into current window of HAYSTACK. */
235 size_t period; /* The period of the right half of needle. */
236 size_t suffix; /* The index of the right half of needle. */
237
238 /* Factor the needle into two halves, such that the left half is
239 smaller than the global period, and the right half is
240 periodic (with a period as large as NEEDLE_LEN - suffix). */
241 suffix = critical_factorization (needle, needle_len, &period);
242
243 /* Perform the search. Each iteration compares the right half
244 first. */
245 if (CMP_FUNC (needle, needle + period, suffix) == 0)
246 {
247 /* Entire needle is periodic; a mismatch can only advance by the
248 period, so use memory to avoid rescanning known occurrences
249 of the period. */
250 size_t memory = 0;
251 j = 0;
252 while (AVAILABLE (haystack, haystack_len, j, needle_len))
253 {
254 /* Scan for matches in right half. */
255 i = MAX (suffix, memory);
256 while (i < needle_len && (CANON_ELEMENT (needle[i])
257 == CANON_ELEMENT (haystack[i + j])))
258 ++i;
259 if (needle_len <= i)
260 {
261 /* Scan for matches in left half. */
262 i = suffix - 1;
263 while (memory < i + 1 && (CANON_ELEMENT (needle[i])
264 == CANON_ELEMENT (haystack[i + j])))
265 --i;
266 if (i + 1 < memory + 1)
267 return (RETURN_TYPE) (haystack + j);
268 /* No match, so remember how many repetitions of period
269 on the right half were scanned. */
270 j += period;
271 memory = needle_len - period;
272 }
273 else
274 {
275 j += i - suffix + 1;
276 memory = 0;
277 }
278 }
279 }
280 else
281 {
282 /* The two halves of needle are distinct; no extra memory is
283 required, and any mismatch results in a maximal shift. */
284 period = MAX (suffix, needle_len - suffix) + 1;
285 j = 0;
286 while (AVAILABLE (haystack, haystack_len, j, needle_len))
287 {
288 /* Scan for matches in right half. */
289 i = suffix;
290 while (i < needle_len && (CANON_ELEMENT (needle[i])
291 == CANON_ELEMENT (haystack[i + j])))
292 ++i;
293 if (needle_len <= i)
294 {
295 /* Scan for matches in left half. */
296 i = suffix - 1;
297 while (i != SIZE_MAX && (CANON_ELEMENT (needle[i])
298 == CANON_ELEMENT (haystack[i + j])))
299 --i;
300 if (i == SIZE_MAX)
301 return (RETURN_TYPE) (haystack + j);
302 j += period;
303 }
304 else
305 j += i - suffix + 1;
306 }
307 }
308 return NULL;
309 }
310
311 /* Return the first location of non-empty NEEDLE within HAYSTACK, or
312 NULL. HAYSTACK_LEN is the minimum known length of HAYSTACK. This
313 method is optimized for LONG_NEEDLE_THRESHOLD <= NEEDLE_LEN.
314 Performance is guaranteed to be linear, with an initialization cost
315 of 3 * NEEDLE_LEN + (1 << CHAR_BIT) operations.
316
317 If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
318 most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching,
319 and sublinear performance O(HAYSTACK_LEN / NEEDLE_LEN) is possible.
320 If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
321 HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and
322 sublinear performance is not possible. */
323 static RETURN_TYPE
two_way_long_needle(const unsigned char * haystack,size_t haystack_len,const unsigned char * needle,size_t needle_len)324 two_way_long_needle (const unsigned char *haystack, size_t haystack_len,
325 const unsigned char *needle, size_t needle_len)
326 {
327 size_t i; /* Index into current byte of NEEDLE. */
328 size_t j; /* Index into current window of HAYSTACK. */
329 size_t period; /* The period of the right half of needle. */
330 size_t suffix; /* The index of the right half of needle. */
331 size_t shift_table[1U << CHAR_BIT]; /* See below. */
332
333 /* Factor the needle into two halves, such that the left half is
334 smaller than the global period, and the right half is
335 periodic (with a period as large as NEEDLE_LEN - suffix). */
336 suffix = critical_factorization (needle, needle_len, &period);
337
338 /* Populate shift_table. For each possible byte value c,
339 shift_table[c] is the distance from the last occurrence of c to
340 the end of NEEDLE, or NEEDLE_LEN if c is absent from the NEEDLE.
341 shift_table[NEEDLE[NEEDLE_LEN - 1]] contains the only 0. */
342 for (i = 0; i < 1U << CHAR_BIT; i++)
343 shift_table[i] = needle_len;
344 for (i = 0; i < needle_len; i++)
345 shift_table[CANON_ELEMENT (needle[i])] = needle_len - i - 1;
346
347 /* Perform the search. Each iteration compares the right half
348 first. */
349 if (CMP_FUNC (needle, needle + period, suffix) == 0)
350 {
351 /* Entire needle is periodic; a mismatch can only advance by the
352 period, so use memory to avoid rescanning known occurrences
353 of the period. */
354 size_t memory = 0;
355 size_t shift;
356 j = 0;
357 while (AVAILABLE (haystack, haystack_len, j, needle_len))
358 {
359 /* Check the last byte first; if it does not match, then
360 shift to the next possible match location. */
361 shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])];
362 if (0 < shift)
363 {
364 if (memory && shift < period)
365 {
366 /* Since needle is periodic, but the last period has
367 a byte out of place, there can be no match until
368 after the mismatch. */
369 shift = needle_len - period;
370 }
371 memory = 0;
372 j += shift;
373 continue;
374 }
375 /* Scan for matches in right half. The last byte has
376 already been matched, by virtue of the shift table. */
377 i = MAX (suffix, memory);
378 while (i < needle_len - 1 && (CANON_ELEMENT (needle[i])
379 == CANON_ELEMENT (haystack[i + j])))
380 ++i;
381 if (needle_len - 1 <= i)
382 {
383 /* Scan for matches in left half. */
384 i = suffix - 1;
385 while (memory < i + 1 && (CANON_ELEMENT (needle[i])
386 == CANON_ELEMENT (haystack[i + j])))
387 --i;
388 if (i + 1 < memory + 1)
389 return (RETURN_TYPE) (haystack + j);
390 /* No match, so remember how many repetitions of period
391 on the right half were scanned. */
392 j += period;
393 memory = needle_len - period;
394 }
395 else
396 {
397 j += i - suffix + 1;
398 memory = 0;
399 }
400 }
401 }
402 else
403 {
404 /* The two halves of needle are distinct; no extra memory is
405 required, and any mismatch results in a maximal shift. */
406 size_t shift;
407 period = MAX (suffix, needle_len - suffix) + 1;
408 j = 0;
409 while (AVAILABLE (haystack, haystack_len, j, needle_len))
410 {
411 /* Check the last byte first; if it does not match, then
412 shift to the next possible match location. */
413 shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])];
414 if (0 < shift)
415 {
416 j += shift;
417 continue;
418 }
419 /* Scan for matches in right half. The last byte has
420 already been matched, by virtue of the shift table. */
421 i = suffix;
422 while (i < needle_len - 1 && (CANON_ELEMENT (needle[i])
423 == CANON_ELEMENT (haystack[i + j])))
424 ++i;
425 if (needle_len - 1 <= i)
426 {
427 /* Scan for matches in left half. */
428 i = suffix - 1;
429 while (i != SIZE_MAX && (CANON_ELEMENT (needle[i])
430 == CANON_ELEMENT (haystack[i + j])))
431 --i;
432 if (i == SIZE_MAX)
433 return (RETURN_TYPE) (haystack + j);
434 j += period;
435 }
436 else
437 j += i - suffix + 1;
438 }
439 }
440 return NULL;
441 }
442
443 #undef AVAILABLE
444 #undef CANON_ELEMENT
445 #undef CMP_FUNC
446 #undef MAX
447 #undef RETURN_TYPE
448