1 /* SPDX-License-Identifier: BSD-3-Clause */
2 /*
3  * Copyright (c) 1994-2009  Red Hat, Inc.
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  *
9  * 1. Redistributions of source code must retain the above copyright notice,
10  * this list of conditions and the following disclaimer.
11  *
12  * 2. Redistributions in binary form must reproduce the above copyright notice,
13  * this list of conditions and the following disclaimer in the documentation
14  * and/or other materials provided with the distribution.
15  *
16  * 3. Neither the name of the copyright holder nor the names of its
17  * contributors may be used to endorse or promote products derived from this
18  * software without specific prior written permission.
19  *
20  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
21  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
22  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
23  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
24  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
25  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
26  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
27  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
28  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
29  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30  * POSSIBILITY OF SUCH DAMAGE.
31  */
32 
33 /* Byte-wise substring search, using the Two-Way algorithm.
34  * Copyright (C) 2008, 2010 Eric Blake
35  * Permission to use, copy, modify, and distribute this software
36  * is freely granted, provided that this notice is preserved.
37  */
38 
39 
40 /* Before including this file, you need to include <string.h>, and define:
41      RESULT_TYPE		A macro that expands to the return type.
42      AVAILABLE(h, h_l, j, n_l)	A macro that returns nonzero if there are
43 				at least N_L bytes left starting at
44 				H[J].  H is 'unsigned char *', H_L, J,
45 				and N_L are 'size_t'; H_L is an
46 				lvalue.  For NUL-terminated searches,
47 				H_L can be modified each iteration to
48 				avoid having to compute the end of H
49 				up front.
50 
51   For case-insensitivity, you may optionally define:
52      CMP_FUNC(p1, p2, l)	A macro that returns 0 iff the first L
53 				characters of P1 and P2 are equal.
54      CANON_ELEMENT(c)		A macro that canonicalizes an element
55 				right after it has been fetched from
56 				one of the two strings.  The argument
57 				is an 'unsigned char'; the result must
58 				be an 'unsigned char' as well.
59 
60   This file undefines the macros documented above, and defines
61   LONG_NEEDLE_THRESHOLD.
62 */
63 
64 #include <limits.h>
65 #include <stdint.h>
66 
67 /* We use the Two-Way string matching algorithm, which guarantees
68    linear complexity with constant space.  Additionally, for long
69    needles, we also use a bad character shift table similar to the
70    Boyer-Moore algorithm to achieve improved (potentially sub-linear)
71    performance.
72 
73    See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260
74    and http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm
75 */
76 
77 /* Point at which computing a bad-byte shift table is likely to be
78    worthwhile.  Small needles should not compute a table, since it
79    adds (1 << CHAR_BIT) + NEEDLE_LEN computations of preparation for a
80    speedup no greater than a factor of NEEDLE_LEN.  The larger the
81    needle, the better the potential performance gain.  On the other
82    hand, on non-POSIX systems with CHAR_BIT larger than eight, the
83    memory required for the table is prohibitive.  */
84 #if CHAR_BIT < 10
85 # define LONG_NEEDLE_THRESHOLD 32U
86 #else
87 # define LONG_NEEDLE_THRESHOLD SIZE_MAX
88 #endif
89 
90 #define MAX(a, b) ((a < b) ? (b) : (a))
91 
92 #ifndef CANON_ELEMENT
93 # define CANON_ELEMENT(c) c
94 #endif
95 #ifndef CMP_FUNC
96 # define CMP_FUNC memcmp
97 #endif
98 
99 /* Perform a critical factorization of NEEDLE, of length NEEDLE_LEN.
100    Return the index of the first byte in the right half, and set
101    *PERIOD to the global period of the right half.
102 
103    The global period of a string is the smallest index (possibly its
104    length) at which all remaining bytes in the string are repetitions
105    of the prefix (the last repetition may be a subset of the prefix).
106 
107    When NEEDLE is factored into two halves, a local period is the
108    length of the smallest word that shares a suffix with the left half
109    and shares a prefix with the right half.  All factorizations of a
110    non-empty NEEDLE have a local period of at least 1 and no greater
111    than NEEDLE_LEN.
112 
113    A critical factorization has the property that the local period
114    equals the global period.  All strings have at least one critical
115    factorization with the left half smaller than the global period.
116 
117    Given an ordered alphabet, a critical factorization can be computed
118    in linear time, with 2 * NEEDLE_LEN comparisons, by computing the
119    larger of two ordered maximal suffixes.  The ordered maximal
120    suffixes are determined by lexicographic comparison of
121    periodicity.  */
122 static size_t
critical_factorization(const unsigned char * needle,size_t needle_len,size_t * period)123 critical_factorization (const unsigned char *needle, size_t needle_len,
124 			size_t *period)
125 {
126   /* Index of last byte of left half, or SIZE_MAX.  */
127   size_t max_suffix, max_suffix_rev;
128   size_t j; /* Index into NEEDLE for current candidate suffix.  */
129   size_t k; /* Offset into current period.  */
130   size_t p; /* Intermediate period.  */
131   unsigned char a, b; /* Current comparison bytes.  */
132 
133   /* Invariants:
134      0 <= j < NEEDLE_LEN - 1
135      -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed)
136      min(max_suffix, max_suffix_rev) < global period of NEEDLE
137      1 <= p <= global period of NEEDLE
138      p == global period of the substring NEEDLE[max_suffix{,_rev}+1...j]
139      1 <= k <= p
140   */
141 
142   /* Perform lexicographic search.  */
143   max_suffix = SIZE_MAX;
144   j = 0;
145   k = p = 1;
146   while (j + k < needle_len)
147     {
148       a = CANON_ELEMENT (needle[j + k]);
149       b = CANON_ELEMENT (needle[(size_t)(max_suffix + k)]);
150       if (a < b)
151 	{
152 	  /* Suffix is smaller, period is entire prefix so far.  */
153 	  j += k;
154 	  k = 1;
155 	  p = j - max_suffix;
156 	}
157       else if (a == b)
158 	{
159 	  /* Advance through repetition of the current period.  */
160 	  if (k != p)
161 	    ++k;
162 	  else
163 	    {
164 	      j += p;
165 	      k = 1;
166 	    }
167 	}
168       else /* b < a */
169 	{
170 	  /* Suffix is larger, start over from current location.  */
171 	  max_suffix = j++;
172 	  k = p = 1;
173 	}
174     }
175   *period = p;
176 
177   /* Perform reverse lexicographic search.  */
178   max_suffix_rev = SIZE_MAX;
179   j = 0;
180   k = p = 1;
181   while (j + k < needle_len)
182     {
183       a = CANON_ELEMENT (needle[j + k]);
184       b = CANON_ELEMENT (needle[max_suffix_rev + k]);
185       if (b < a)
186 	{
187 	  /* Suffix is smaller, period is entire prefix so far.  */
188 	  j += k;
189 	  k = 1;
190 	  p = j - max_suffix_rev;
191 	}
192       else if (a == b)
193 	{
194 	  /* Advance through repetition of the current period.  */
195 	  if (k != p)
196 	    ++k;
197 	  else
198 	    {
199 	      j += p;
200 	      k = 1;
201 	    }
202 	}
203       else /* a < b */
204 	{
205 	  /* Suffix is larger, start over from current location.  */
206 	  max_suffix_rev = j++;
207 	  k = p = 1;
208 	}
209     }
210 
211   /* Choose the longer suffix.  Return the first byte of the right
212      half, rather than the last byte of the left half.  */
213   if (max_suffix_rev + 1 < max_suffix + 1)
214     return max_suffix + 1;
215   *period = p;
216   return max_suffix_rev + 1;
217 }
218 
219 /* Return the first location of non-empty NEEDLE within HAYSTACK, or
220    NULL.  HAYSTACK_LEN is the minimum known length of HAYSTACK.  This
221    method is optimized for NEEDLE_LEN < LONG_NEEDLE_THRESHOLD.
222    Performance is guaranteed to be linear, with an initialization cost
223    of 2 * NEEDLE_LEN comparisons.
224 
225    If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
226    most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching.
227    If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
228    HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching.  */
229 static RETURN_TYPE
two_way_short_needle(const unsigned char * haystack,size_t haystack_len,const unsigned char * needle,size_t needle_len)230 two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
231 		      const unsigned char *needle, size_t needle_len)
232 {
233   size_t i; /* Index into current byte of NEEDLE.  */
234   size_t j; /* Index into current window of HAYSTACK.  */
235   size_t period; /* The period of the right half of needle.  */
236   size_t suffix; /* The index of the right half of needle.  */
237 
238   /* Factor the needle into two halves, such that the left half is
239      smaller than the global period, and the right half is
240      periodic (with a period as large as NEEDLE_LEN - suffix).  */
241   suffix = critical_factorization (needle, needle_len, &period);
242 
243   /* Perform the search.  Each iteration compares the right half
244      first.  */
245   if (CMP_FUNC (needle, needle + period, suffix) == 0)
246     {
247       /* Entire needle is periodic; a mismatch can only advance by the
248 	 period, so use memory to avoid rescanning known occurrences
249 	 of the period.  */
250       size_t memory = 0;
251       j = 0;
252       while (AVAILABLE (haystack, haystack_len, j, needle_len))
253 	{
254 	  /* Scan for matches in right half.  */
255 	  i = MAX (suffix, memory);
256 	  while (i < needle_len && (CANON_ELEMENT (needle[i])
257 				    == CANON_ELEMENT (haystack[i + j])))
258 	    ++i;
259 	  if (needle_len <= i)
260 	    {
261 	      /* Scan for matches in left half.  */
262 	      i = suffix - 1;
263 	      while (memory < i + 1 && (CANON_ELEMENT (needle[i])
264 					== CANON_ELEMENT (haystack[i + j])))
265 		--i;
266 	      if (i + 1 < memory + 1)
267 		return (RETURN_TYPE) (haystack + j);
268 	      /* No match, so remember how many repetitions of period
269 		 on the right half were scanned.  */
270 	      j += period;
271 	      memory = needle_len - period;
272 	    }
273 	  else
274 	    {
275 	      j += i - suffix + 1;
276 	      memory = 0;
277 	    }
278 	}
279     }
280   else
281     {
282       /* The two halves of needle are distinct; no extra memory is
283 	 required, and any mismatch results in a maximal shift.  */
284       period = MAX (suffix, needle_len - suffix) + 1;
285       j = 0;
286       while (AVAILABLE (haystack, haystack_len, j, needle_len))
287 	{
288 	  /* Scan for matches in right half.  */
289 	  i = suffix;
290 	  while (i < needle_len && (CANON_ELEMENT (needle[i])
291 				    == CANON_ELEMENT (haystack[i + j])))
292 	    ++i;
293 	  if (needle_len <= i)
294 	    {
295 	      /* Scan for matches in left half.  */
296 	      i = suffix - 1;
297 	      while (i != SIZE_MAX && (CANON_ELEMENT (needle[i])
298 				       == CANON_ELEMENT (haystack[i + j])))
299 		--i;
300 	      if (i == SIZE_MAX)
301 		return (RETURN_TYPE) (haystack + j);
302 	      j += period;
303 	    }
304 	  else
305 	    j += i - suffix + 1;
306 	}
307     }
308   return NULL;
309 }
310 
311 /* Return the first location of non-empty NEEDLE within HAYSTACK, or
312    NULL.  HAYSTACK_LEN is the minimum known length of HAYSTACK.  This
313    method is optimized for LONG_NEEDLE_THRESHOLD <= NEEDLE_LEN.
314    Performance is guaranteed to be linear, with an initialization cost
315    of 3 * NEEDLE_LEN + (1 << CHAR_BIT) operations.
316 
317    If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
318    most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching,
319    and sublinear performance O(HAYSTACK_LEN / NEEDLE_LEN) is possible.
320    If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
321    HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and
322    sublinear performance is not possible.  */
323 static RETURN_TYPE
two_way_long_needle(const unsigned char * haystack,size_t haystack_len,const unsigned char * needle,size_t needle_len)324 two_way_long_needle (const unsigned char *haystack, size_t haystack_len,
325 		     const unsigned char *needle, size_t needle_len)
326 {
327   size_t i; /* Index into current byte of NEEDLE.  */
328   size_t j; /* Index into current window of HAYSTACK.  */
329   size_t period; /* The period of the right half of needle.  */
330   size_t suffix; /* The index of the right half of needle.  */
331   size_t shift_table[1U << CHAR_BIT]; /* See below.  */
332 
333   /* Factor the needle into two halves, such that the left half is
334      smaller than the global period, and the right half is
335      periodic (with a period as large as NEEDLE_LEN - suffix).  */
336   suffix = critical_factorization (needle, needle_len, &period);
337 
338   /* Populate shift_table.  For each possible byte value c,
339      shift_table[c] is the distance from the last occurrence of c to
340      the end of NEEDLE, or NEEDLE_LEN if c is absent from the NEEDLE.
341      shift_table[NEEDLE[NEEDLE_LEN - 1]] contains the only 0.  */
342   for (i = 0; i < 1U << CHAR_BIT; i++)
343     shift_table[i] = needle_len;
344   for (i = 0; i < needle_len; i++)
345     shift_table[CANON_ELEMENT (needle[i])] = needle_len - i - 1;
346 
347   /* Perform the search.  Each iteration compares the right half
348      first.  */
349   if (CMP_FUNC (needle, needle + period, suffix) == 0)
350     {
351       /* Entire needle is periodic; a mismatch can only advance by the
352 	 period, so use memory to avoid rescanning known occurrences
353 	 of the period.  */
354       size_t memory = 0;
355       size_t shift;
356       j = 0;
357       while (AVAILABLE (haystack, haystack_len, j, needle_len))
358 	{
359 	  /* Check the last byte first; if it does not match, then
360 	     shift to the next possible match location.  */
361 	  shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])];
362 	  if (0 < shift)
363 	    {
364 	      if (memory && shift < period)
365 		{
366 		  /* Since needle is periodic, but the last period has
367 		     a byte out of place, there can be no match until
368 		     after the mismatch.  */
369 		  shift = needle_len - period;
370 		}
371 	      memory = 0;
372 	      j += shift;
373 	      continue;
374 	    }
375 	  /* Scan for matches in right half.  The last byte has
376 	     already been matched, by virtue of the shift table.  */
377 	  i = MAX (suffix, memory);
378 	  while (i < needle_len - 1 && (CANON_ELEMENT (needle[i])
379 					== CANON_ELEMENT (haystack[i + j])))
380 	    ++i;
381 	  if (needle_len - 1 <= i)
382 	    {
383 	      /* Scan for matches in left half.  */
384 	      i = suffix - 1;
385 	      while (memory < i + 1 && (CANON_ELEMENT (needle[i])
386 					== CANON_ELEMENT (haystack[i + j])))
387 		--i;
388 	      if (i + 1 < memory + 1)
389 		return (RETURN_TYPE) (haystack + j);
390 	      /* No match, so remember how many repetitions of period
391 		 on the right half were scanned.  */
392 	      j += period;
393 	      memory = needle_len - period;
394 	    }
395 	  else
396 	    {
397 	      j += i - suffix + 1;
398 	      memory = 0;
399 	    }
400 	}
401     }
402   else
403     {
404       /* The two halves of needle are distinct; no extra memory is
405 	 required, and any mismatch results in a maximal shift.  */
406       size_t shift;
407       period = MAX (suffix, needle_len - suffix) + 1;
408       j = 0;
409       while (AVAILABLE (haystack, haystack_len, j, needle_len))
410 	{
411 	  /* Check the last byte first; if it does not match, then
412 	     shift to the next possible match location.  */
413 	  shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])];
414 	  if (0 < shift)
415 	    {
416 	      j += shift;
417 	      continue;
418 	    }
419 	  /* Scan for matches in right half.  The last byte has
420 	     already been matched, by virtue of the shift table.  */
421 	  i = suffix;
422 	  while (i < needle_len - 1 && (CANON_ELEMENT (needle[i])
423 					== CANON_ELEMENT (haystack[i + j])))
424 	    ++i;
425 	  if (needle_len - 1 <= i)
426 	    {
427 	      /* Scan for matches in left half.  */
428 	      i = suffix - 1;
429 	      while (i != SIZE_MAX && (CANON_ELEMENT (needle[i])
430 				       == CANON_ELEMENT (haystack[i + j])))
431 		--i;
432 	      if (i == SIZE_MAX)
433 		return (RETURN_TYPE) (haystack + j);
434 	      j += period;
435 	    }
436 	  else
437 	    j += i - suffix + 1;
438 	}
439     }
440   return NULL;
441 }
442 
443 #undef AVAILABLE
444 #undef CANON_ELEMENT
445 #undef CMP_FUNC
446 #undef MAX
447 #undef RETURN_TYPE
448