Subversion Repositories Kolibri OS

Rev

Rev 4874 | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
4349 Serge 1
/* Byte-wise substring search, using the Two-Way algorithm.
2
 * Copyright (C) 2008, 2010 Eric Blake
3
 * Permission to use, copy, modify, and distribute this software
4
 * is freely granted, provided that this notice is preserved.
5
 */
6
 
7
 
8
/* Before including this file, you need to include , and define:
9
     RESULT_TYPE		A macro that expands to the return type.
10
     AVAILABLE(h, h_l, j, n_l)	A macro that returns nonzero if there are
11
				at least N_L bytes left starting at
12
				H[J].  H is 'unsigned char *', H_L, J,
13
				and N_L are 'size_t'; H_L is an
14
				lvalue.  For NUL-terminated searches,
15
				H_L can be modified each iteration to
16
				avoid having to compute the end of H
17
				up front.
18
 
19
  For case-insensitivity, you may optionally define:
20
     CMP_FUNC(p1, p2, l)	A macro that returns 0 iff the first L
21
				characters of P1 and P2 are equal.
22
     CANON_ELEMENT(c)		A macro that canonicalizes an element
23
				right after it has been fetched from
24
				one of the two strings.  The argument
25
				is an 'unsigned char'; the result must
26
				be an 'unsigned char' as well.
27
 
28
  This file undefines the macros documented above, and defines
29
  LONG_NEEDLE_THRESHOLD.
30
*/
31
 
32
#include 
33
#include 
34
 
35
/* We use the Two-Way string matching algorithm, which guarantees
36
   linear complexity with constant space.  Additionally, for long
37
   needles, we also use a bad character shift table similar to the
38
   Boyer-Moore algorithm to achieve improved (potentially sub-linear)
39
   performance.
40
 
41
   See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260
42
   and http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm
43
*/
44
 
45
/* Point at which computing a bad-byte shift table is likely to be
46
   worthwhile.  Small needles should not compute a table, since it
47
   adds (1 << CHAR_BIT) + NEEDLE_LEN computations of preparation for a
48
   speedup no greater than a factor of NEEDLE_LEN.  The larger the
49
   needle, the better the potential performance gain.  On the other
50
   hand, on non-POSIX systems with CHAR_BIT larger than eight, the
51
   memory required for the table is prohibitive.  */
52
#if CHAR_BIT < 10
53
# define LONG_NEEDLE_THRESHOLD 32U
54
#else
55
# define LONG_NEEDLE_THRESHOLD SIZE_MAX
56
#endif
57
 
58
#define MAX(a, b) ((a < b) ? (b) : (a))
59
 
60
#ifndef CANON_ELEMENT
61
# define CANON_ELEMENT(c) c
62
#endif
63
#ifndef CMP_FUNC
64
# define CMP_FUNC memcmp
65
#endif
66
 
67
/* Perform a critical factorization of NEEDLE, of length NEEDLE_LEN.
68
   Return the index of the first byte in the right half, and set
69
   *PERIOD to the global period of the right half.
70
 
71
   The global period of a string is the smallest index (possibly its
72
   length) at which all remaining bytes in the string are repetitions
73
   of the prefix (the last repetition may be a subset of the prefix).
74
 
75
   When NEEDLE is factored into two halves, a local period is the
76
   length of the smallest word that shares a suffix with the left half
77
   and shares a prefix with the right half.  All factorizations of a
78
   non-empty NEEDLE have a local period of at least 1 and no greater
79
   than NEEDLE_LEN.
80
 
81
   A critical factorization has the property that the local period
82
   equals the global period.  All strings have at least one critical
83
   factorization with the left half smaller than the global period.
84
 
85
   Given an ordered alphabet, a critical factorization can be computed
86
   in linear time, with 2 * NEEDLE_LEN comparisons, by computing the
87
   larger of two ordered maximal suffixes.  The ordered maximal
88
   suffixes are determined by lexicographic comparison of
89
   periodicity.  */
90
static size_t
91
critical_factorization (const unsigned char *needle, size_t needle_len,
92
			size_t *period)
93
{
94
  /* Index of last byte of left half, or SIZE_MAX.  */
95
  size_t max_suffix, max_suffix_rev;
96
  size_t j; /* Index into NEEDLE for current candidate suffix.  */
97
  size_t k; /* Offset into current period.  */
98
  size_t p; /* Intermediate period.  */
99
  unsigned char a, b; /* Current comparison bytes.  */
100
 
101
  /* Invariants:
102
 
103
     -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed)
104
     min(max_suffix, max_suffix_rev) < global period of NEEDLE
105
     1 <= p <= global period of NEEDLE
106
     p == global period of the substring NEEDLE[max_suffix{,_rev}+1...j]
107
     1 <= k <= p
108
  */
109
 
110
  /* Perform lexicographic search.  */
111
  max_suffix = SIZE_MAX;
112
  j = 0;
113
  k = p = 1;
114
  while (j + k < needle_len)
115
    {
116
      a = CANON_ELEMENT (needle[j + k]);
117
      b = CANON_ELEMENT (needle[(size_t)(max_suffix + k)]);
118
      if (a < b)
119
	{
120
	  /* Suffix is smaller, period is entire prefix so far.  */
121
	  j += k;
122
	  k = 1;
123
	  p = j - max_suffix;
124
	}
125
      else if (a == b)
126
	{
127
	  /* Advance through repetition of the current period.  */
128
	  if (k != p)
129
	    ++k;
130
	  else
131
	    {
132
	      j += p;
133
	      k = 1;
134
	    }
135
	}
136
      else /* b < a */
137
	{
138
	  /* Suffix is larger, start over from current location.  */
139
	  max_suffix = j++;
140
	  k = p = 1;
141
	}
142
    }
143
  *period = p;
144
 
145
  /* Perform reverse lexicographic search.  */
146
  max_suffix_rev = SIZE_MAX;
147
  j = 0;
148
  k = p = 1;
149
  while (j + k < needle_len)
150
    {
151
      a = CANON_ELEMENT (needle[j + k]);
152
      b = CANON_ELEMENT (needle[max_suffix_rev + k]);
153
      if (b < a)
154
	{
155
	  /* Suffix is smaller, period is entire prefix so far.  */
156
	  j += k;
157
	  k = 1;
158
	  p = j - max_suffix_rev;
159
	}
160
      else if (a == b)
161
	{
162
	  /* Advance through repetition of the current period.  */
163
	  if (k != p)
164
	    ++k;
165
	  else
166
	    {
167
	      j += p;
168
	      k = 1;
169
	    }
170
	}
171
      else /* a < b */
172
	{
173
	  /* Suffix is larger, start over from current location.  */
174
	  max_suffix_rev = j++;
175
	  k = p = 1;
176
	}
177
    }
178
 
179
  /* Choose the longer suffix.  Return the first byte of the right
180
     half, rather than the last byte of the left half.  */
181
  if (max_suffix_rev + 1 < max_suffix + 1)
182
    return max_suffix + 1;
183
  *period = p;
184
  return max_suffix_rev + 1;
185
}
186
 
187
/* Return the first location of non-empty NEEDLE within HAYSTACK, or
188
   NULL.  HAYSTACK_LEN is the minimum known length of HAYSTACK.  This
189
   method is optimized for NEEDLE_LEN < LONG_NEEDLE_THRESHOLD.
190
   Performance is guaranteed to be linear, with an initialization cost
191
   of 2 * NEEDLE_LEN comparisons.
192
 
193
   If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
194
   most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching.
195
   If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
196
   HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching.  */
197
static RETURN_TYPE
198
two_way_short_needle (const unsigned char *haystack, size_t haystack_len,
199
		      const unsigned char *needle, size_t needle_len)
200
{
201
  size_t i; /* Index into current byte of NEEDLE.  */
202
  size_t j; /* Index into current window of HAYSTACK.  */
203
  size_t period; /* The period of the right half of needle.  */
204
  size_t suffix; /* The index of the right half of needle.  */
205
 
206
  /* Factor the needle into two halves, such that the left half is
207
     smaller than the global period, and the right half is
208
     periodic (with a period as large as NEEDLE_LEN - suffix).  */
209
  suffix = critical_factorization (needle, needle_len, &period);
210
 
211
  /* Perform the search.  Each iteration compares the right half
212
     first.  */
213
  if (CMP_FUNC (needle, needle + period, suffix) == 0)
214
    {
215
      /* Entire needle is periodic; a mismatch can only advance by the
216
	 period, so use memory to avoid rescanning known occurrences
217
	 of the period.  */
218
      size_t memory = 0;
219
      j = 0;
220
      while (AVAILABLE (haystack, haystack_len, j, needle_len))
221
	{
222
	  /* Scan for matches in right half.  */
223
	  i = MAX (suffix, memory);
224
	  while (i < needle_len && (CANON_ELEMENT (needle[i])
225
				    == CANON_ELEMENT (haystack[i + j])))
226
	    ++i;
227
	  if (needle_len <= i)
228
	    {
229
	      /* Scan for matches in left half.  */
230
	      i = suffix - 1;
231
	      while (memory < i + 1 && (CANON_ELEMENT (needle[i])
232
					== CANON_ELEMENT (haystack[i + j])))
233
		--i;
234
	      if (i + 1 < memory + 1)
235
		return (RETURN_TYPE) (haystack + j);
236
	      /* No match, so remember how many repetitions of period
237
		 on the right half were scanned.  */
238
	      j += period;
239
	      memory = needle_len - period;
240
	    }
241
	  else
242
	    {
243
	      j += i - suffix + 1;
244
	      memory = 0;
245
	    }
246
	}
247
    }
248
  else
249
    {
250
      /* The two halves of needle are distinct; no extra memory is
251
	 required, and any mismatch results in a maximal shift.  */
252
      period = MAX (suffix, needle_len - suffix) + 1;
253
      j = 0;
254
      while (AVAILABLE (haystack, haystack_len, j, needle_len))
255
	{
256
	  /* Scan for matches in right half.  */
257
	  i = suffix;
258
	  while (i < needle_len && (CANON_ELEMENT (needle[i])
259
				    == CANON_ELEMENT (haystack[i + j])))
260
	    ++i;
261
	  if (needle_len <= i)
262
	    {
263
	      /* Scan for matches in left half.  */
264
	      i = suffix - 1;
265
	      while (i != SIZE_MAX && (CANON_ELEMENT (needle[i])
266
				       == CANON_ELEMENT (haystack[i + j])))
267
		--i;
268
	      if (i == SIZE_MAX)
269
		return (RETURN_TYPE) (haystack + j);
270
	      j += period;
271
	    }
272
	  else
273
	    j += i - suffix + 1;
274
	}
275
    }
276
  return NULL;
277
}
278
 
279
/* Return the first location of non-empty NEEDLE within HAYSTACK, or
280
   NULL.  HAYSTACK_LEN is the minimum known length of HAYSTACK.  This
281
   method is optimized for LONG_NEEDLE_THRESHOLD <= NEEDLE_LEN.
282
   Performance is guaranteed to be linear, with an initialization cost
283
   of 3 * NEEDLE_LEN + (1 << CHAR_BIT) operations.
284
 
285
   If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at
286
   most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching,
287
   and sublinear performance O(HAYSTACK_LEN / NEEDLE_LEN) is possible.
288
   If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 *
289
   HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and
290
   sublinear performance is not possible.  */
291
static RETURN_TYPE
292
two_way_long_needle (const unsigned char *haystack, size_t haystack_len,
293
		     const unsigned char *needle, size_t needle_len)
294
{
295
  size_t i; /* Index into current byte of NEEDLE.  */
296
  size_t j; /* Index into current window of HAYSTACK.  */
297
  size_t period; /* The period of the right half of needle.  */
298
  size_t suffix; /* The index of the right half of needle.  */
299
  size_t shift_table[1U << CHAR_BIT]; /* See below.  */
300
 
301
  /* Factor the needle into two halves, such that the left half is
302
     smaller than the global period, and the right half is
303
     periodic (with a period as large as NEEDLE_LEN - suffix).  */
304
  suffix = critical_factorization (needle, needle_len, &period);
305
 
306
  /* Populate shift_table.  For each possible byte value c,
307
     shift_table[c] is the distance from the last occurrence of c to
308
     the end of NEEDLE, or NEEDLE_LEN if c is absent from the NEEDLE.
309
     shift_table[NEEDLE[NEEDLE_LEN - 1]] contains the only 0.  */
310
  for (i = 0; i < 1U << CHAR_BIT; i++)
311
    shift_table[i] = needle_len;
312
  for (i = 0; i < needle_len; i++)
313
    shift_table[CANON_ELEMENT (needle[i])] = needle_len - i - 1;
314
 
315
  /* Perform the search.  Each iteration compares the right half
316
     first.  */
317
  if (CMP_FUNC (needle, needle + period, suffix) == 0)
318
    {
319
      /* Entire needle is periodic; a mismatch can only advance by the
320
	 period, so use memory to avoid rescanning known occurrences
321
	 of the period.  */
322
      size_t memory = 0;
323
      size_t shift;
324
      j = 0;
325
      while (AVAILABLE (haystack, haystack_len, j, needle_len))
326
	{
327
	  /* Check the last byte first; if it does not match, then
328
	     shift to the next possible match location.  */
329
	  shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])];
330
	  if (0 < shift)
331
	    {
332
	      if (memory && shift < period)
333
		{
334
		  /* Since needle is periodic, but the last period has
335
		     a byte out of place, there can be no match until
336
		     after the mismatch.  */
337
		  shift = needle_len - period;
338
		}
6099 serge 339
	      memory = 0;
4349 Serge 340
	      j += shift;
341
	      continue;
342
	    }
343
	  /* Scan for matches in right half.  The last byte has
344
	     already been matched, by virtue of the shift table.  */
345
	  i = MAX (suffix, memory);
346
	  while (i < needle_len - 1 && (CANON_ELEMENT (needle[i])
347
					== CANON_ELEMENT (haystack[i + j])))
348
	    ++i;
349
	  if (needle_len - 1 <= i)
350
	    {
351
	      /* Scan for matches in left half.  */
352
	      i = suffix - 1;
353
	      while (memory < i + 1 && (CANON_ELEMENT (needle[i])
354
					== CANON_ELEMENT (haystack[i + j])))
355
		--i;
356
	      if (i + 1 < memory + 1)
357
		return (RETURN_TYPE) (haystack + j);
358
	      /* No match, so remember how many repetitions of period
359
		 on the right half were scanned.  */
360
	      j += period;
361
	      memory = needle_len - period;
362
	    }
363
	  else
364
	    {
365
	      j += i - suffix + 1;
366
	      memory = 0;
367
	    }
368
	}
369
    }
370
  else
371
    {
372
      /* The two halves of needle are distinct; no extra memory is
373
	 required, and any mismatch results in a maximal shift.  */
374
      size_t shift;
375
      period = MAX (suffix, needle_len - suffix) + 1;
376
      j = 0;
377
      while (AVAILABLE (haystack, haystack_len, j, needle_len))
378
	{
379
	  /* Check the last byte first; if it does not match, then
380
	     shift to the next possible match location.  */
381
	  shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])];
382
	  if (0 < shift)
383
	    {
384
	      j += shift;
385
	      continue;
386
	    }
387
	  /* Scan for matches in right half.  The last byte has
388
	     already been matched, by virtue of the shift table.  */
389
	  i = suffix;
390
	  while (i < needle_len - 1 && (CANON_ELEMENT (needle[i])
391
					== CANON_ELEMENT (haystack[i + j])))
392
	    ++i;
393
	  if (needle_len - 1 <= i)
394
	    {
395
	      /* Scan for matches in left half.  */
396
	      i = suffix - 1;
397
	      while (i != SIZE_MAX && (CANON_ELEMENT (needle[i])
398
				       == CANON_ELEMENT (haystack[i + j])))
399
		--i;
400
	      if (i == SIZE_MAX)
401
		return (RETURN_TYPE) (haystack + j);
402
	      j += period;
403
	    }
404
	  else
405
	    j += i - suffix + 1;
406
	}
407
    }
408
  return NULL;
409
}
410
 
411
#undef AVAILABLE
412
#undef CANON_ELEMENT
413
#undef CMP_FUNC
414
#undef MAX
415
#undef RETURN_TYPE