Rev 4874 | Details | Compare with Previous | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
4349 | Serge | 1 | /* Byte-wise substring search, using the Two-Way algorithm. |
2 | * Copyright (C) 2008, 2010 Eric Blake |
||
3 | * Permission to use, copy, modify, and distribute this software |
||
4 | * is freely granted, provided that this notice is preserved. |
||
5 | */ |
||
6 | |||
7 | |||
8 | /* Before including this file, you need to include |
||
9 | RESULT_TYPE A macro that expands to the return type. |
||
10 | AVAILABLE(h, h_l, j, n_l) A macro that returns nonzero if there are |
||
11 | at least N_L bytes left starting at |
||
12 | H[J]. H is 'unsigned char *', H_L, J, |
||
13 | and N_L are 'size_t'; H_L is an |
||
14 | lvalue. For NUL-terminated searches, |
||
15 | H_L can be modified each iteration to |
||
16 | avoid having to compute the end of H |
||
17 | up front. |
||
18 | |||
19 | For case-insensitivity, you may optionally define: |
||
20 | CMP_FUNC(p1, p2, l) A macro that returns 0 iff the first L |
||
21 | characters of P1 and P2 are equal. |
||
22 | CANON_ELEMENT(c) A macro that canonicalizes an element |
||
23 | right after it has been fetched from |
||
24 | one of the two strings. The argument |
||
25 | is an 'unsigned char'; the result must |
||
26 | be an 'unsigned char' as well. |
||
27 | |||
28 | This file undefines the macros documented above, and defines |
||
29 | LONG_NEEDLE_THRESHOLD. |
||
30 | */ |
||
31 | |||
32 | #include |
||
33 | #include |
||
34 | |||
35 | /* We use the Two-Way string matching algorithm, which guarantees |
||
36 | linear complexity with constant space. Additionally, for long |
||
37 | needles, we also use a bad character shift table similar to the |
||
38 | Boyer-Moore algorithm to achieve improved (potentially sub-linear) |
||
39 | performance. |
||
40 | |||
41 | See http://www-igm.univ-mlv.fr/~lecroq/string/node26.html#SECTION00260 |
||
42 | and http://en.wikipedia.org/wiki/Boyer-Moore_string_search_algorithm |
||
43 | */ |
||
44 | |||
45 | /* Point at which computing a bad-byte shift table is likely to be |
||
46 | worthwhile. Small needles should not compute a table, since it |
||
47 | adds (1 << CHAR_BIT) + NEEDLE_LEN computations of preparation for a |
||
48 | speedup no greater than a factor of NEEDLE_LEN. The larger the |
||
49 | needle, the better the potential performance gain. On the other |
||
50 | hand, on non-POSIX systems with CHAR_BIT larger than eight, the |
||
51 | memory required for the table is prohibitive. */ |
||
52 | #if CHAR_BIT < 10 |
||
53 | # define LONG_NEEDLE_THRESHOLD 32U |
||
54 | #else |
||
55 | # define LONG_NEEDLE_THRESHOLD SIZE_MAX |
||
56 | #endif |
||
57 | |||
58 | #define MAX(a, b) ((a < b) ? (b) : (a)) |
||
59 | |||
60 | #ifndef CANON_ELEMENT |
||
61 | # define CANON_ELEMENT(c) c |
||
62 | #endif |
||
63 | #ifndef CMP_FUNC |
||
64 | # define CMP_FUNC memcmp |
||
65 | #endif |
||
66 | |||
67 | /* Perform a critical factorization of NEEDLE, of length NEEDLE_LEN. |
||
68 | Return the index of the first byte in the right half, and set |
||
69 | *PERIOD to the global period of the right half. |
||
70 | |||
71 | The global period of a string is the smallest index (possibly its |
||
72 | length) at which all remaining bytes in the string are repetitions |
||
73 | of the prefix (the last repetition may be a subset of the prefix). |
||
74 | |||
75 | When NEEDLE is factored into two halves, a local period is the |
||
76 | length of the smallest word that shares a suffix with the left half |
||
77 | and shares a prefix with the right half. All factorizations of a |
||
78 | non-empty NEEDLE have a local period of at least 1 and no greater |
||
79 | than NEEDLE_LEN. |
||
80 | |||
81 | A critical factorization has the property that the local period |
||
82 | equals the global period. All strings have at least one critical |
||
83 | factorization with the left half smaller than the global period. |
||
84 | |||
85 | Given an ordered alphabet, a critical factorization can be computed |
||
86 | in linear time, with 2 * NEEDLE_LEN comparisons, by computing the |
||
87 | larger of two ordered maximal suffixes. The ordered maximal |
||
88 | suffixes are determined by lexicographic comparison of |
||
89 | periodicity. */ |
||
90 | static size_t |
||
91 | critical_factorization (const unsigned char *needle, size_t needle_len, |
||
92 | size_t *period) |
||
93 | { |
||
94 | /* Index of last byte of left half, or SIZE_MAX. */ |
||
95 | size_t max_suffix, max_suffix_rev; |
||
96 | size_t j; /* Index into NEEDLE for current candidate suffix. */ |
||
97 | size_t k; /* Offset into current period. */ |
||
98 | size_t p; /* Intermediate period. */ |
||
99 | unsigned char a, b; /* Current comparison bytes. */ |
||
100 | |||
101 | /* Invariants: |
||
102 | |||
103 | -1 <= max_suffix{,_rev} < j (treating SIZE_MAX as if it were signed) |
||
104 | min(max_suffix, max_suffix_rev) < global period of NEEDLE |
||
105 | 1 <= p <= global period of NEEDLE |
||
106 | p == global period of the substring NEEDLE[max_suffix{,_rev}+1...j] |
||
107 | 1 <= k <= p |
||
108 | */ |
||
109 | |||
110 | /* Perform lexicographic search. */ |
||
111 | max_suffix = SIZE_MAX; |
||
112 | j = 0; |
||
113 | k = p = 1; |
||
114 | while (j + k < needle_len) |
||
115 | { |
||
116 | a = CANON_ELEMENT (needle[j + k]); |
||
117 | b = CANON_ELEMENT (needle[(size_t)(max_suffix + k)]); |
||
118 | if (a < b) |
||
119 | { |
||
120 | /* Suffix is smaller, period is entire prefix so far. */ |
||
121 | j += k; |
||
122 | k = 1; |
||
123 | p = j - max_suffix; |
||
124 | } |
||
125 | else if (a == b) |
||
126 | { |
||
127 | /* Advance through repetition of the current period. */ |
||
128 | if (k != p) |
||
129 | ++k; |
||
130 | else |
||
131 | { |
||
132 | j += p; |
||
133 | k = 1; |
||
134 | } |
||
135 | } |
||
136 | else /* b < a */ |
||
137 | { |
||
138 | /* Suffix is larger, start over from current location. */ |
||
139 | max_suffix = j++; |
||
140 | k = p = 1; |
||
141 | } |
||
142 | } |
||
143 | *period = p; |
||
144 | |||
145 | /* Perform reverse lexicographic search. */ |
||
146 | max_suffix_rev = SIZE_MAX; |
||
147 | j = 0; |
||
148 | k = p = 1; |
||
149 | while (j + k < needle_len) |
||
150 | { |
||
151 | a = CANON_ELEMENT (needle[j + k]); |
||
152 | b = CANON_ELEMENT (needle[max_suffix_rev + k]); |
||
153 | if (b < a) |
||
154 | { |
||
155 | /* Suffix is smaller, period is entire prefix so far. */ |
||
156 | j += k; |
||
157 | k = 1; |
||
158 | p = j - max_suffix_rev; |
||
159 | } |
||
160 | else if (a == b) |
||
161 | { |
||
162 | /* Advance through repetition of the current period. */ |
||
163 | if (k != p) |
||
164 | ++k; |
||
165 | else |
||
166 | { |
||
167 | j += p; |
||
168 | k = 1; |
||
169 | } |
||
170 | } |
||
171 | else /* a < b */ |
||
172 | { |
||
173 | /* Suffix is larger, start over from current location. */ |
||
174 | max_suffix_rev = j++; |
||
175 | k = p = 1; |
||
176 | } |
||
177 | } |
||
178 | |||
179 | /* Choose the longer suffix. Return the first byte of the right |
||
180 | half, rather than the last byte of the left half. */ |
||
181 | if (max_suffix_rev + 1 < max_suffix + 1) |
||
182 | return max_suffix + 1; |
||
183 | *period = p; |
||
184 | return max_suffix_rev + 1; |
||
185 | } |
||
186 | |||
187 | /* Return the first location of non-empty NEEDLE within HAYSTACK, or |
||
188 | NULL. HAYSTACK_LEN is the minimum known length of HAYSTACK. This |
||
189 | method is optimized for NEEDLE_LEN < LONG_NEEDLE_THRESHOLD. |
||
190 | Performance is guaranteed to be linear, with an initialization cost |
||
191 | of 2 * NEEDLE_LEN comparisons. |
||
192 | |||
193 | If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at |
||
194 | most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching. |
||
195 | If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 * |
||
196 | HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching. */ |
||
197 | static RETURN_TYPE |
||
198 | two_way_short_needle (const unsigned char *haystack, size_t haystack_len, |
||
199 | const unsigned char *needle, size_t needle_len) |
||
200 | { |
||
201 | size_t i; /* Index into current byte of NEEDLE. */ |
||
202 | size_t j; /* Index into current window of HAYSTACK. */ |
||
203 | size_t period; /* The period of the right half of needle. */ |
||
204 | size_t suffix; /* The index of the right half of needle. */ |
||
205 | |||
206 | /* Factor the needle into two halves, such that the left half is |
||
207 | smaller than the global period, and the right half is |
||
208 | periodic (with a period as large as NEEDLE_LEN - suffix). */ |
||
209 | suffix = critical_factorization (needle, needle_len, &period); |
||
210 | |||
211 | /* Perform the search. Each iteration compares the right half |
||
212 | first. */ |
||
213 | if (CMP_FUNC (needle, needle + period, suffix) == 0) |
||
214 | { |
||
215 | /* Entire needle is periodic; a mismatch can only advance by the |
||
216 | period, so use memory to avoid rescanning known occurrences |
||
217 | of the period. */ |
||
218 | size_t memory = 0; |
||
219 | j = 0; |
||
220 | while (AVAILABLE (haystack, haystack_len, j, needle_len)) |
||
221 | { |
||
222 | /* Scan for matches in right half. */ |
||
223 | i = MAX (suffix, memory); |
||
224 | while (i < needle_len && (CANON_ELEMENT (needle[i]) |
||
225 | == CANON_ELEMENT (haystack[i + j]))) |
||
226 | ++i; |
||
227 | if (needle_len <= i) |
||
228 | { |
||
229 | /* Scan for matches in left half. */ |
||
230 | i = suffix - 1; |
||
231 | while (memory < i + 1 && (CANON_ELEMENT (needle[i]) |
||
232 | == CANON_ELEMENT (haystack[i + j]))) |
||
233 | --i; |
||
234 | if (i + 1 < memory + 1) |
||
235 | return (RETURN_TYPE) (haystack + j); |
||
236 | /* No match, so remember how many repetitions of period |
||
237 | on the right half were scanned. */ |
||
238 | j += period; |
||
239 | memory = needle_len - period; |
||
240 | } |
||
241 | else |
||
242 | { |
||
243 | j += i - suffix + 1; |
||
244 | memory = 0; |
||
245 | } |
||
246 | } |
||
247 | } |
||
248 | else |
||
249 | { |
||
250 | /* The two halves of needle are distinct; no extra memory is |
||
251 | required, and any mismatch results in a maximal shift. */ |
||
252 | period = MAX (suffix, needle_len - suffix) + 1; |
||
253 | j = 0; |
||
254 | while (AVAILABLE (haystack, haystack_len, j, needle_len)) |
||
255 | { |
||
256 | /* Scan for matches in right half. */ |
||
257 | i = suffix; |
||
258 | while (i < needle_len && (CANON_ELEMENT (needle[i]) |
||
259 | == CANON_ELEMENT (haystack[i + j]))) |
||
260 | ++i; |
||
261 | if (needle_len <= i) |
||
262 | { |
||
263 | /* Scan for matches in left half. */ |
||
264 | i = suffix - 1; |
||
265 | while (i != SIZE_MAX && (CANON_ELEMENT (needle[i]) |
||
266 | == CANON_ELEMENT (haystack[i + j]))) |
||
267 | --i; |
||
268 | if (i == SIZE_MAX) |
||
269 | return (RETURN_TYPE) (haystack + j); |
||
270 | j += period; |
||
271 | } |
||
272 | else |
||
273 | j += i - suffix + 1; |
||
274 | } |
||
275 | } |
||
276 | return NULL; |
||
277 | } |
||
278 | |||
279 | /* Return the first location of non-empty NEEDLE within HAYSTACK, or |
||
280 | NULL. HAYSTACK_LEN is the minimum known length of HAYSTACK. This |
||
281 | method is optimized for LONG_NEEDLE_THRESHOLD <= NEEDLE_LEN. |
||
282 | Performance is guaranteed to be linear, with an initialization cost |
||
283 | of 3 * NEEDLE_LEN + (1 << CHAR_BIT) operations. |
||
284 | |||
285 | If AVAILABLE does not modify HAYSTACK_LEN (as in memmem), then at |
||
286 | most 2 * HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, |
||
287 | and sublinear performance O(HAYSTACK_LEN / NEEDLE_LEN) is possible. |
||
288 | If AVAILABLE modifies HAYSTACK_LEN (as in strstr), then at most 3 * |
||
289 | HAYSTACK_LEN - NEEDLE_LEN comparisons occur in searching, and |
||
290 | sublinear performance is not possible. */ |
||
291 | static RETURN_TYPE |
||
292 | two_way_long_needle (const unsigned char *haystack, size_t haystack_len, |
||
293 | const unsigned char *needle, size_t needle_len) |
||
294 | { |
||
295 | size_t i; /* Index into current byte of NEEDLE. */ |
||
296 | size_t j; /* Index into current window of HAYSTACK. */ |
||
297 | size_t period; /* The period of the right half of needle. */ |
||
298 | size_t suffix; /* The index of the right half of needle. */ |
||
299 | size_t shift_table[1U << CHAR_BIT]; /* See below. */ |
||
300 | |||
301 | /* Factor the needle into two halves, such that the left half is |
||
302 | smaller than the global period, and the right half is |
||
303 | periodic (with a period as large as NEEDLE_LEN - suffix). */ |
||
304 | suffix = critical_factorization (needle, needle_len, &period); |
||
305 | |||
306 | /* Populate shift_table. For each possible byte value c, |
||
307 | shift_table[c] is the distance from the last occurrence of c to |
||
308 | the end of NEEDLE, or NEEDLE_LEN if c is absent from the NEEDLE. |
||
309 | shift_table[NEEDLE[NEEDLE_LEN - 1]] contains the only 0. */ |
||
310 | for (i = 0; i < 1U << CHAR_BIT; i++) |
||
311 | shift_table[i] = needle_len; |
||
312 | for (i = 0; i < needle_len; i++) |
||
313 | shift_table[CANON_ELEMENT (needle[i])] = needle_len - i - 1; |
||
314 | |||
315 | /* Perform the search. Each iteration compares the right half |
||
316 | first. */ |
||
317 | if (CMP_FUNC (needle, needle + period, suffix) == 0) |
||
318 | { |
||
319 | /* Entire needle is periodic; a mismatch can only advance by the |
||
320 | period, so use memory to avoid rescanning known occurrences |
||
321 | of the period. */ |
||
322 | size_t memory = 0; |
||
323 | size_t shift; |
||
324 | j = 0; |
||
325 | while (AVAILABLE (haystack, haystack_len, j, needle_len)) |
||
326 | { |
||
327 | /* Check the last byte first; if it does not match, then |
||
328 | shift to the next possible match location. */ |
||
329 | shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])]; |
||
330 | if (0 < shift) |
||
331 | { |
||
332 | if (memory && shift < period) |
||
333 | { |
||
334 | /* Since needle is periodic, but the last period has |
||
335 | a byte out of place, there can be no match until |
||
336 | after the mismatch. */ |
||
337 | shift = needle_len - period; |
||
338 | } |
||
6099 | serge | 339 | memory = 0; |
4349 | Serge | 340 | j += shift; |
341 | continue; |
||
342 | } |
||
343 | /* Scan for matches in right half. The last byte has |
||
344 | already been matched, by virtue of the shift table. */ |
||
345 | i = MAX (suffix, memory); |
||
346 | while (i < needle_len - 1 && (CANON_ELEMENT (needle[i]) |
||
347 | == CANON_ELEMENT (haystack[i + j]))) |
||
348 | ++i; |
||
349 | if (needle_len - 1 <= i) |
||
350 | { |
||
351 | /* Scan for matches in left half. */ |
||
352 | i = suffix - 1; |
||
353 | while (memory < i + 1 && (CANON_ELEMENT (needle[i]) |
||
354 | == CANON_ELEMENT (haystack[i + j]))) |
||
355 | --i; |
||
356 | if (i + 1 < memory + 1) |
||
357 | return (RETURN_TYPE) (haystack + j); |
||
358 | /* No match, so remember how many repetitions of period |
||
359 | on the right half were scanned. */ |
||
360 | j += period; |
||
361 | memory = needle_len - period; |
||
362 | } |
||
363 | else |
||
364 | { |
||
365 | j += i - suffix + 1; |
||
366 | memory = 0; |
||
367 | } |
||
368 | } |
||
369 | } |
||
370 | else |
||
371 | { |
||
372 | /* The two halves of needle are distinct; no extra memory is |
||
373 | required, and any mismatch results in a maximal shift. */ |
||
374 | size_t shift; |
||
375 | period = MAX (suffix, needle_len - suffix) + 1; |
||
376 | j = 0; |
||
377 | while (AVAILABLE (haystack, haystack_len, j, needle_len)) |
||
378 | { |
||
379 | /* Check the last byte first; if it does not match, then |
||
380 | shift to the next possible match location. */ |
||
381 | shift = shift_table[CANON_ELEMENT (haystack[j + needle_len - 1])]; |
||
382 | if (0 < shift) |
||
383 | { |
||
384 | j += shift; |
||
385 | continue; |
||
386 | } |
||
387 | /* Scan for matches in right half. The last byte has |
||
388 | already been matched, by virtue of the shift table. */ |
||
389 | i = suffix; |
||
390 | while (i < needle_len - 1 && (CANON_ELEMENT (needle[i]) |
||
391 | == CANON_ELEMENT (haystack[i + j]))) |
||
392 | ++i; |
||
393 | if (needle_len - 1 <= i) |
||
394 | { |
||
395 | /* Scan for matches in left half. */ |
||
396 | i = suffix - 1; |
||
397 | while (i != SIZE_MAX && (CANON_ELEMENT (needle[i]) |
||
398 | == CANON_ELEMENT (haystack[i + j]))) |
||
399 | --i; |
||
400 | if (i == SIZE_MAX) |
||
401 | return (RETURN_TYPE) (haystack + j); |
||
402 | j += period; |
||
403 | } |
||
404 | else |
||
405 | j += i - suffix + 1; |
||
406 | } |
||
407 | } |
||
408 | return NULL; |
||
409 | } |
||
410 | |||
411 | #undef AVAILABLE |
||
412 | #undef CANON_ELEMENT |
||
413 | #undef CMP_FUNC |
||
414 | #undef MAX |
||
415 | #undef RETURN_TYPE=>>>>>=>>>>>><>>><>><>=>=>>>>=>>>>>>>>>>=>=>=>=>>>=>>=>>>><> |