Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
4973 | right-hear | 1 | /* Copyright (C) 1995 DJ Delorie, see COPYING.DJ for details */ |
2 | /* |
||
3 | * The matching engine and friends. This file is #included by regexec.c |
||
4 | * after suitable #defines of a variety of macros used herein, so that |
||
5 | * different state representations can be used without duplicating masses |
||
6 | * of code. |
||
7 | */ |
||
8 | |||
9 | #ifdef SNAMES |
||
10 | #define matcher smatcher |
||
11 | #define fast sfast |
||
12 | #define slow sslow |
||
13 | #define dissect sdissect |
||
14 | #define backref sbackref |
||
15 | #define step sstep |
||
16 | #define print sprint |
||
17 | #define at sat |
||
18 | #define match smat |
||
19 | #endif |
||
20 | #ifdef LNAMES |
||
21 | #define matcher lmatcher |
||
22 | #define fast lfast |
||
23 | #define slow lslow |
||
24 | #define dissect ldissect |
||
25 | #define backref lbackref |
||
26 | #define step lstep |
||
27 | #define print lprint |
||
28 | #define at lat |
||
29 | #define match lmat |
||
30 | #endif |
||
31 | |||
32 | /* another structure passed up and down to avoid zillions of parameters */ |
||
33 | struct match { |
||
34 | struct re_guts *g; |
||
35 | int eflags; |
||
36 | regmatch_t *pmatch; /* [nsub+1] (0 element unused) */ |
||
37 | char *offp; /* offsets work from here */ |
||
38 | char *beginp; /* start of string -- virtual NUL precedes */ |
||
39 | char *endp; /* end of string -- virtual NUL here */ |
||
40 | char *coldp; /* can be no match starting before here */ |
||
41 | char **lastpos; /* [nplus+1] */ |
||
42 | STATEVARS; |
||
43 | states st; /* current states */ |
||
44 | states fresh; /* states for a fresh start */ |
||
45 | states tmp; /* temporary */ |
||
46 | states empty; /* empty set of states */ |
||
47 | }; |
||
48 | |||
49 | #include "engine.ih" |
||
50 | |||
51 | #ifdef REDEBUG |
||
52 | #define SP(t, s, c) print(m, t, s, c, stdout) |
||
53 | #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2) |
||
54 | #define NOTE(str) { if (m->eflags®_TRACE) printf("=%s\n", (str)); } |
||
55 | #else |
||
56 | #define SP(t, s, c) /* nothing */ |
||
57 | #define AT(t, p1, p2, s1, s2) /* nothing */ |
||
58 | #define NOTE(s) /* nothing */ |
||
59 | #endif |
||
60 | |||
61 | /* |
||
62 | - matcher - the actual matching engine |
||
63 | == static int matcher(register struct re_guts *g, char *string, \ |
||
64 | == size_t nmatch, regmatch_t pmatch[], int eflags); |
||
65 | */ |
||
66 | static int /* 0 success, REG_NOMATCH failure */ |
||
67 | matcher(g, string, nmatch, pmatch, eflags) |
||
68 | register struct re_guts *g; |
||
69 | char *string; |
||
70 | size_t nmatch; |
||
71 | regmatch_t pmatch[]; |
||
72 | int eflags; |
||
73 | { |
||
74 | register char *endp; |
||
75 | register int i; |
||
76 | struct match mv; |
||
77 | register struct match *m = &mv; |
||
78 | register char *dp; |
||
79 | const register sopno gf = g->firststate+1; /* +1 for OEND */ |
||
80 | const register sopno gl = g->laststate; |
||
81 | char *start; |
||
82 | char *stop; |
||
83 | |||
84 | /* simplify the situation where possible */ |
||
85 | if (g->cflags®_NOSUB) |
||
86 | nmatch = 0; |
||
87 | if (eflags®_STARTEND) { |
||
88 | start = string + pmatch[0].rm_so; |
||
89 | stop = string + pmatch[0].rm_eo; |
||
90 | } else { |
||
91 | start = string; |
||
92 | stop = start + strlen(start); |
||
93 | } |
||
94 | if (stop < start) |
||
95 | return(REG_INVARG); |
||
96 | |||
97 | /* prescreening; this does wonders for this rather slow code */ |
||
98 | if (g->must != NULL) { |
||
99 | for (dp = start; dp < stop; dp++) |
||
100 | if (*dp == g->must[0] && stop - dp >= g->mlen && |
||
101 | memcmp(dp, g->must, (size_t)g->mlen) == 0) |
||
102 | break; |
||
103 | if (dp == stop) /* we didn't find g->must */ |
||
104 | return(REG_NOMATCH); |
||
105 | } |
||
106 | |||
107 | /* match struct setup */ |
||
108 | m->g = g; |
||
109 | m->eflags = eflags; |
||
110 | m->pmatch = NULL; |
||
111 | m->lastpos = NULL; |
||
112 | m->offp = string; |
||
113 | m->beginp = start; |
||
114 | m->endp = stop; |
||
115 | STATESETUP(m, 4); |
||
116 | SETUP(m->st); |
||
117 | SETUP(m->fresh); |
||
118 | SETUP(m->tmp); |
||
119 | SETUP(m->empty); |
||
120 | CLEAR(m->empty); |
||
121 | |||
122 | /* this loop does only one repetition except for backrefs */ |
||
123 | for (;;) { |
||
124 | endp = fast(m, start, stop, gf, gl); |
||
125 | if (endp == NULL) { /* a miss */ |
||
126 | STATETEARDOWN(m); |
||
127 | return(REG_NOMATCH); |
||
128 | } |
||
129 | if (nmatch == 0 && !g->backrefs) |
||
130 | break; /* no further info needed */ |
||
131 | |||
132 | /* where? */ |
||
133 | assert(m->coldp != NULL); |
||
134 | for (;;) { |
||
135 | NOTE("finding start"); |
||
136 | endp = slow(m, m->coldp, stop, gf, gl); |
||
137 | if (endp != NULL) |
||
138 | break; |
||
139 | assert(m->coldp < m->endp); |
||
140 | m->coldp++; |
||
141 | } |
||
142 | if (nmatch == 1 && !g->backrefs) |
||
143 | break; /* no further info needed */ |
||
144 | |||
145 | /* oh my, he wants the subexpressions... */ |
||
146 | if (m->pmatch == NULL) |
||
147 | m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) * |
||
148 | sizeof(regmatch_t)); |
||
149 | if (m->pmatch == NULL) { |
||
150 | STATETEARDOWN(m); |
||
151 | return(REG_ESPACE); |
||
152 | } |
||
153 | for (i = 1; i <= m->g->nsub; i++) |
||
154 | m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1; |
||
155 | if (!g->backrefs && !(m->eflags®_BACKR)) { |
||
156 | NOTE("dissecting"); |
||
157 | dp = dissect(m, m->coldp, endp, gf, gl); |
||
158 | } else { |
||
159 | if (g->nplus > 0 && m->lastpos == NULL) |
||
160 | m->lastpos = (char **)malloc((g->nplus+1) * |
||
161 | sizeof(char *)); |
||
162 | if (g->nplus > 0 && m->lastpos == NULL) { |
||
163 | free(m->pmatch); |
||
164 | STATETEARDOWN(m); |
||
165 | return(REG_ESPACE); |
||
166 | } |
||
167 | NOTE("backref dissect"); |
||
168 | dp = backref(m, m->coldp, endp, gf, gl, (sopno)0); |
||
169 | } |
||
170 | if (dp != NULL) |
||
171 | break; |
||
172 | |||
173 | /* uh-oh... we couldn't find a subexpression-level match */ |
||
174 | assert(g->backrefs); /* must be back references doing it */ |
||
175 | assert(g->nplus == 0 || m->lastpos != NULL); |
||
176 | for (;;) { |
||
177 | if (dp != NULL || endp <= m->coldp) |
||
178 | break; /* defeat */ |
||
179 | NOTE("backoff"); |
||
180 | endp = slow(m, m->coldp, endp-1, gf, gl); |
||
181 | if (endp == NULL) |
||
182 | break; /* defeat */ |
||
183 | /* try it on a shorter possibility */ |
||
184 | #ifndef NDEBUG |
||
185 | for (i = 1; i <= m->g->nsub; i++) { |
||
186 | assert(m->pmatch[i].rm_so == -1); |
||
187 | assert(m->pmatch[i].rm_eo == -1); |
||
188 | } |
||
189 | #endif |
||
190 | NOTE("backoff dissect"); |
||
191 | dp = backref(m, m->coldp, endp, gf, gl, (sopno)0); |
||
192 | } |
||
193 | assert(dp == NULL || dp == endp); |
||
194 | if (dp != NULL) /* found a shorter one */ |
||
195 | break; |
||
196 | |||
197 | /* despite initial appearances, there is no match here */ |
||
198 | NOTE("false alarm"); |
||
199 | start = m->coldp + 1; /* recycle starting later */ |
||
200 | assert(start <= stop); |
||
201 | } |
||
202 | |||
203 | /* fill in the details if requested */ |
||
204 | if (nmatch > 0) { |
||
205 | pmatch[0].rm_so = m->coldp - m->offp; |
||
206 | pmatch[0].rm_eo = endp - m->offp; |
||
207 | } |
||
208 | if (nmatch > 1) { |
||
209 | assert(m->pmatch != NULL); |
||
210 | for (i = 1; i < nmatch; i++) |
||
211 | if (i <= m->g->nsub) |
||
212 | pmatch[i] = m->pmatch[i]; |
||
213 | else { |
||
214 | pmatch[i].rm_so = -1; |
||
215 | pmatch[i].rm_eo = -1; |
||
216 | } |
||
217 | } |
||
218 | |||
219 | if (m->pmatch != NULL) |
||
220 | free((char *)m->pmatch); |
||
221 | if (m->lastpos != NULL) |
||
222 | free((char *)m->lastpos); |
||
223 | STATETEARDOWN(m); |
||
224 | return(0); |
||
225 | } |
||
226 | |||
227 | /* |
||
228 | - dissect - figure out what matched what, no back references |
||
229 | == static char *dissect(register struct match *m, char *start, \ |
||
230 | == char *stop, sopno startst, sopno stopst); |
||
231 | */ |
||
232 | static char * /* == stop (success) always */ |
||
233 | dissect(m, start, stop, startst, stopst) |
||
234 | register struct match *m; |
||
235 | char *start; |
||
236 | char *stop; |
||
237 | sopno startst; |
||
238 | sopno stopst; |
||
239 | { |
||
240 | register int i; |
||
241 | register sopno ss; /* start sop of current subRE */ |
||
242 | register sopno es; /* end sop of current subRE */ |
||
243 | register char *sp; /* start of string matched by it */ |
||
244 | register char *stp; /* string matched by it cannot pass here */ |
||
245 | register char *rest; /* start of rest of string */ |
||
246 | register char *tail; /* string unmatched by rest of RE */ |
||
247 | register sopno ssub; /* start sop of subsubRE */ |
||
248 | register sopno esub; /* end sop of subsubRE */ |
||
249 | register char *ssp; /* start of string matched by subsubRE */ |
||
250 | register char *sep; /* end of string matched by subsubRE */ |
||
251 | register char *oldssp; /* previous ssp */ |
||
252 | register char *dp; |
||
253 | |||
254 | AT("diss", start, stop, startst, stopst); |
||
255 | sp = start; |
||
256 | for (ss = startst; ss < stopst; ss = es) { |
||
257 | /* identify end of subRE */ |
||
258 | es = ss; |
||
259 | switch (OP(m->g->strip[es])) { |
||
260 | case OPLUS_: |
||
261 | case OQUEST_: |
||
262 | es += OPND(m->g->strip[es]); |
||
263 | break; |
||
264 | case OCH_: |
||
265 | while (OP(m->g->strip[es]) != O_CH) |
||
266 | es += OPND(m->g->strip[es]); |
||
267 | break; |
||
268 | } |
||
269 | es++; |
||
270 | |||
271 | /* figure out what it matched */ |
||
272 | switch (OP(m->g->strip[ss])) { |
||
273 | case OEND: |
||
274 | assert(nope); |
||
275 | break; |
||
276 | case OCHAR: |
||
277 | sp++; |
||
278 | break; |
||
279 | case OBOL: |
||
280 | case OEOL: |
||
281 | case OBOW: |
||
282 | case OEOW: |
||
283 | break; |
||
284 | case OANY: |
||
285 | case OANYOF: |
||
286 | sp++; |
||
287 | break; |
||
288 | case OBACK_: |
||
289 | case O_BACK: |
||
290 | assert(nope); |
||
291 | break; |
||
292 | /* cases where length of match is hard to find */ |
||
293 | case OQUEST_: |
||
294 | stp = stop; |
||
295 | for (;;) { |
||
296 | /* how long could this one be? */ |
||
297 | rest = slow(m, sp, stp, ss, es); |
||
298 | assert(rest != NULL); /* it did match */ |
||
299 | /* could the rest match the rest? */ |
||
300 | tail = slow(m, rest, stop, es, stopst); |
||
301 | if (tail == stop) |
||
302 | break; /* yes! */ |
||
303 | /* no -- try a shorter match for this one */ |
||
304 | stp = rest - 1; |
||
305 | assert(stp >= sp); /* it did work */ |
||
306 | } |
||
307 | ssub = ss + 1; |
||
308 | esub = es - 1; |
||
309 | /* did innards match? */ |
||
310 | if (slow(m, sp, rest, ssub, esub) != NULL) { |
||
311 | dp = dissect(m, sp, rest, ssub, esub); |
||
312 | assert(dp == rest); |
||
313 | } else /* no */ |
||
314 | assert(sp == rest); |
||
315 | sp = rest; |
||
316 | break; |
||
317 | case OPLUS_: |
||
318 | stp = stop; |
||
319 | for (;;) { |
||
320 | /* how long could this one be? */ |
||
321 | rest = slow(m, sp, stp, ss, es); |
||
322 | assert(rest != NULL); /* it did match */ |
||
323 | /* could the rest match the rest? */ |
||
324 | tail = slow(m, rest, stop, es, stopst); |
||
325 | if (tail == stop) |
||
326 | break; /* yes! */ |
||
327 | /* no -- try a shorter match for this one */ |
||
328 | stp = rest - 1; |
||
329 | assert(stp >= sp); /* it did work */ |
||
330 | } |
||
331 | ssub = ss + 1; |
||
332 | esub = es - 1; |
||
333 | ssp = sp; |
||
334 | oldssp = ssp; |
||
335 | for (;;) { /* find last match of innards */ |
||
336 | sep = slow(m, ssp, rest, ssub, esub); |
||
337 | if (sep == NULL || sep == ssp) |
||
338 | break; /* failed or matched null */ |
||
339 | oldssp = ssp; /* on to next try */ |
||
340 | ssp = sep; |
||
341 | } |
||
342 | if (sep == NULL) { |
||
343 | /* last successful match */ |
||
344 | sep = ssp; |
||
345 | ssp = oldssp; |
||
346 | } |
||
347 | assert(sep == rest); /* must exhaust substring */ |
||
348 | assert(slow(m, ssp, sep, ssub, esub) == rest); |
||
349 | dp = dissect(m, ssp, sep, ssub, esub); |
||
350 | assert(dp == sep); |
||
351 | sp = rest; |
||
352 | break; |
||
353 | case OCH_: |
||
354 | stp = stop; |
||
355 | for (;;) { |
||
356 | /* how long could this one be? */ |
||
357 | rest = slow(m, sp, stp, ss, es); |
||
358 | assert(rest != NULL); /* it did match */ |
||
359 | /* could the rest match the rest? */ |
||
360 | tail = slow(m, rest, stop, es, stopst); |
||
361 | if (tail == stop) |
||
362 | break; /* yes! */ |
||
363 | /* no -- try a shorter match for this one */ |
||
364 | stp = rest - 1; |
||
365 | assert(stp >= sp); /* it did work */ |
||
366 | } |
||
367 | ssub = ss + 1; |
||
368 | esub = ss + OPND(m->g->strip[ss]) - 1; |
||
369 | assert(OP(m->g->strip[esub]) == OOR1); |
||
370 | for (;;) { /* find first matching branch */ |
||
371 | if (slow(m, sp, rest, ssub, esub) == rest) |
||
372 | break; /* it matched all of it */ |
||
373 | /* that one missed, try next one */ |
||
374 | assert(OP(m->g->strip[esub]) == OOR1); |
||
375 | esub++; |
||
376 | assert(OP(m->g->strip[esub]) == OOR2); |
||
377 | ssub = esub + 1; |
||
378 | esub += OPND(m->g->strip[esub]); |
||
379 | if (OP(m->g->strip[esub]) == OOR2) |
||
380 | esub--; |
||
381 | else |
||
382 | assert(OP(m->g->strip[esub]) == O_CH); |
||
383 | } |
||
384 | dp = dissect(m, sp, rest, ssub, esub); |
||
385 | assert(dp == rest); |
||
386 | sp = rest; |
||
387 | break; |
||
388 | case O_PLUS: |
||
389 | case O_QUEST: |
||
390 | case OOR1: |
||
391 | case OOR2: |
||
392 | case O_CH: |
||
393 | assert(nope); |
||
394 | break; |
||
395 | case OLPAREN: |
||
396 | i = OPND(m->g->strip[ss]); |
||
397 | assert(0 < i && i <= m->g->nsub); |
||
398 | m->pmatch[i].rm_so = sp - m->offp; |
||
399 | break; |
||
400 | case ORPAREN: |
||
401 | i = OPND(m->g->strip[ss]); |
||
402 | assert(0 < i && i <= m->g->nsub); |
||
403 | m->pmatch[i].rm_eo = sp - m->offp; |
||
404 | break; |
||
405 | default: /* uh oh */ |
||
406 | assert(nope); |
||
407 | break; |
||
408 | } |
||
409 | } |
||
410 | |||
411 | assert(sp == stop); |
||
412 | return(sp); |
||
413 | } |
||
414 | |||
415 | /* |
||
416 | - backref - figure out what matched what, figuring in back references |
||
417 | == static char *backref(register struct match *m, char *start, \ |
||
418 | == char *stop, sopno startst, sopno stopst, sopno lev); |
||
419 | */ |
||
420 | static char * /* == stop (success) or NULL (failure) */ |
||
421 | backref(m, start, stop, startst, stopst, lev) |
||
422 | register struct match *m; |
||
423 | char *start; |
||
424 | char *stop; |
||
425 | sopno startst; |
||
426 | sopno stopst; |
||
427 | sopno lev; /* PLUS nesting level */ |
||
428 | { |
||
429 | register int i; |
||
430 | register sopno ss; /* start sop of current subRE */ |
||
431 | register char *sp; /* start of string matched by it */ |
||
432 | register sopno ssub; /* start sop of subsubRE */ |
||
433 | register sopno esub; /* end sop of subsubRE */ |
||
434 | register char *ssp; /* start of string matched by subsubRE */ |
||
435 | register char *dp; |
||
436 | register size_t len; |
||
437 | register int hard; |
||
438 | register sop s; |
||
439 | register regoff_t offsave; |
||
440 | register cset *cs; |
||
441 | |||
442 | AT("back", start, stop, startst, stopst); |
||
443 | sp = start; |
||
444 | |||
445 | /* get as far as we can with easy stuff */ |
||
446 | hard = 0; |
||
447 | for (ss = startst; !hard && ss < stopst; ss++) |
||
448 | switch (OP(s = m->g->strip[ss])) { |
||
449 | case OCHAR: |
||
450 | if (sp == stop || *sp++ != (char)OPND(s)) |
||
451 | return(NULL); |
||
452 | break; |
||
453 | case OANY: |
||
454 | if (sp == stop) |
||
455 | return(NULL); |
||
456 | sp++; |
||
457 | break; |
||
458 | case OANYOF: |
||
459 | cs = &m->g->sets[OPND(s)]; |
||
460 | if (sp == stop || !CHIN(cs, *sp++)) |
||
461 | return(NULL); |
||
462 | break; |
||
463 | case OBOL: |
||
464 | if ( (sp == m->beginp && !(m->eflags®_NOTBOL)) || |
||
465 | (sp < m->endp && *(sp-1) == '\n' && |
||
466 | (m->g->cflags®_NEWLINE)) ) |
||
467 | { /* yes */ } |
||
468 | else |
||
469 | return(NULL); |
||
470 | break; |
||
471 | case OEOL: |
||
472 | if ( (sp == m->endp && !(m->eflags®_NOTEOL)) || |
||
473 | (sp < m->endp && *sp == '\n' && |
||
474 | (m->g->cflags®_NEWLINE)) ) |
||
475 | { /* yes */ } |
||
476 | else |
||
477 | return(NULL); |
||
478 | break; |
||
479 | case OBOW: |
||
480 | if (( (sp == m->beginp && !(m->eflags®_NOTBOL)) || |
||
481 | (sp < m->endp && *(sp-1) == '\n' && |
||
482 | (m->g->cflags®_NEWLINE)) || |
||
483 | (sp > m->beginp && |
||
484 | !ISWORD(*(sp-1))) ) && |
||
485 | (sp < m->endp && ISWORD(*sp)) ) |
||
486 | { /* yes */ } |
||
487 | else |
||
488 | return(NULL); |
||
489 | break; |
||
490 | case OEOW: |
||
491 | if (( (sp == m->endp && !(m->eflags®_NOTEOL)) || |
||
492 | (sp < m->endp && *sp == '\n' && |
||
493 | (m->g->cflags®_NEWLINE)) || |
||
494 | (sp < m->endp && !ISWORD(*sp)) ) && |
||
495 | (sp > m->beginp && ISWORD(*(sp-1))) ) |
||
496 | { /* yes */ } |
||
497 | else |
||
498 | return(NULL); |
||
499 | break; |
||
500 | case O_QUEST: |
||
501 | break; |
||
502 | case OOR1: /* matches null but needs to skip */ |
||
503 | ss++; |
||
504 | s = m->g->strip[ss]; |
||
505 | do { |
||
506 | assert(OP(s) == OOR2); |
||
507 | ss += OPND(s); |
||
508 | } while (OP(s = m->g->strip[ss]) != O_CH); |
||
509 | /* note that the ss++ gets us past the O_CH */ |
||
510 | break; |
||
511 | default: /* have to make a choice */ |
||
512 | hard = 1; |
||
513 | break; |
||
514 | } |
||
515 | if (!hard) { /* that was it! */ |
||
516 | if (sp != stop) |
||
517 | return(NULL); |
||
518 | return(sp); |
||
519 | } |
||
520 | ss--; /* adjust for the for's final increment */ |
||
521 | |||
522 | /* the hard stuff */ |
||
523 | AT("hard", sp, stop, ss, stopst); |
||
524 | s = m->g->strip[ss]; |
||
525 | switch (OP(s)) { |
||
526 | case OBACK_: /* the vilest depths */ |
||
527 | i = OPND(s); |
||
528 | assert(0 < i && i <= m->g->nsub); |
||
529 | if (m->pmatch[i].rm_eo == -1) |
||
530 | return(NULL); |
||
531 | assert(m->pmatch[i].rm_so != -1); |
||
532 | len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so; |
||
533 | assert(stop - m->beginp >= len); |
||
534 | if (sp > stop - len) |
||
535 | return(NULL); /* not enough left to match */ |
||
536 | ssp = m->offp + m->pmatch[i].rm_so; |
||
537 | if (memcmp(sp, ssp, len) != 0) |
||
538 | return(NULL); |
||
539 | while (m->g->strip[ss] != SOP(O_BACK, i)) |
||
540 | ss++; |
||
541 | return(backref(m, sp+len, stop, ss+1, stopst, lev)); |
||
542 | break; |
||
543 | case OQUEST_: /* to null or not */ |
||
544 | dp = backref(m, sp, stop, ss+1, stopst, lev); |
||
545 | if (dp != NULL) |
||
546 | return(dp); /* not */ |
||
547 | return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev)); |
||
548 | break; |
||
549 | case OPLUS_: |
||
550 | assert(m->lastpos != NULL); |
||
551 | assert(lev+1 <= m->g->nplus); |
||
552 | m->lastpos[lev+1] = sp; |
||
553 | return(backref(m, sp, stop, ss+1, stopst, lev+1)); |
||
554 | break; |
||
555 | case O_PLUS: |
||
556 | if (sp == m->lastpos[lev]) /* last pass matched null */ |
||
557 | return(backref(m, sp, stop, ss+1, stopst, lev-1)); |
||
558 | /* try another pass */ |
||
559 | m->lastpos[lev] = sp; |
||
560 | dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev); |
||
561 | if (dp == NULL) |
||
562 | return(backref(m, sp, stop, ss+1, stopst, lev-1)); |
||
563 | else |
||
564 | return(dp); |
||
565 | break; |
||
566 | case OCH_: /* find the right one, if any */ |
||
567 | ssub = ss + 1; |
||
568 | esub = ss + OPND(s) - 1; |
||
569 | assert(OP(m->g->strip[esub]) == OOR1); |
||
570 | for (;;) { /* find first matching branch */ |
||
571 | dp = backref(m, sp, stop, ssub, esub, lev); |
||
572 | if (dp != NULL) |
||
573 | return(dp); |
||
574 | /* that one missed, try next one */ |
||
575 | if (OP(m->g->strip[esub]) == O_CH) |
||
576 | return(NULL); /* there is none */ |
||
577 | esub++; |
||
578 | assert(OP(m->g->strip[esub]) == OOR2); |
||
579 | ssub = esub + 1; |
||
580 | esub += OPND(m->g->strip[esub]); |
||
581 | if (OP(m->g->strip[esub]) == OOR2) |
||
582 | esub--; |
||
583 | else |
||
584 | assert(OP(m->g->strip[esub]) == O_CH); |
||
585 | } |
||
586 | break; |
||
587 | case OLPAREN: /* must undo assignment if rest fails */ |
||
588 | i = OPND(s); |
||
589 | assert(0 < i && i <= m->g->nsub); |
||
590 | offsave = m->pmatch[i].rm_so; |
||
591 | m->pmatch[i].rm_so = sp - m->offp; |
||
592 | dp = backref(m, sp, stop, ss+1, stopst, lev); |
||
593 | if (dp != NULL) |
||
594 | return(dp); |
||
595 | m->pmatch[i].rm_so = offsave; |
||
596 | return(NULL); |
||
597 | break; |
||
598 | case ORPAREN: /* must undo assignment if rest fails */ |
||
599 | i = OPND(s); |
||
600 | assert(0 < i && i <= m->g->nsub); |
||
601 | offsave = m->pmatch[i].rm_eo; |
||
602 | m->pmatch[i].rm_eo = sp - m->offp; |
||
603 | dp = backref(m, sp, stop, ss+1, stopst, lev); |
||
604 | if (dp != NULL) |
||
605 | return(dp); |
||
606 | m->pmatch[i].rm_eo = offsave; |
||
607 | return(NULL); |
||
608 | break; |
||
609 | default: /* uh oh */ |
||
610 | assert(nope); |
||
611 | break; |
||
612 | } |
||
613 | |||
614 | /* "can't happen" */ |
||
615 | assert(nope); |
||
616 | /* NOTREACHED */ |
||
617 | } |
||
618 | |||
619 | /* |
||
620 | - fast - step through the string at top speed |
||
621 | == static char *fast(register struct match *m, char *start, \ |
||
622 | == char *stop, sopno startst, sopno stopst); |
||
623 | */ |
||
624 | static char * /* where tentative match ended, or NULL */ |
||
625 | fast(m, start, stop, startst, stopst) |
||
626 | register struct match *m; |
||
627 | char *start; |
||
628 | char *stop; |
||
629 | sopno startst; |
||
630 | sopno stopst; |
||
631 | { |
||
632 | register states st = m->st; |
||
633 | register states fresh = m->fresh; |
||
634 | register states tmp = m->tmp; |
||
635 | register char *p = start; |
||
636 | register int c = (start == m->beginp) ? OUT : *(start-1); |
||
637 | register int lastc; /* previous c */ |
||
638 | register int flagch; |
||
639 | register int i; |
||
640 | register char *coldp; /* last p after which no match was underway */ |
||
641 | |||
642 | CLEAR(st); |
||
643 | SET1(st, startst); |
||
644 | st = step(m->g, startst, stopst, st, NOTHING, st); |
||
645 | ASSIGN(fresh, st); |
||
646 | SP("start", st, *p); |
||
647 | coldp = NULL; |
||
648 | for (;;) { |
||
649 | /* next character */ |
||
650 | lastc = c; |
||
651 | c = (p == m->endp) ? OUT : *p; |
||
652 | if (EQ(st, fresh)) |
||
653 | coldp = p; |
||
654 | |||
655 | /* is there an EOL and/or BOL between lastc and c? */ |
||
656 | flagch = '\0'; |
||
657 | i = 0; |
||
658 | if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || |
||
659 | (lastc == OUT && !(m->eflags®_NOTBOL)) ) { |
||
660 | flagch = BOL; |
||
661 | i = m->g->nbol; |
||
662 | } |
||
663 | if ( (c == '\n' && m->g->cflags®_NEWLINE) || |
||
664 | (c == OUT && !(m->eflags®_NOTEOL)) ) { |
||
665 | flagch = (flagch == BOL) ? BOLEOL : EOL; |
||
666 | i += m->g->neol; |
||
667 | } |
||
668 | if (i != 0) { |
||
669 | for (; i > 0; i--) |
||
670 | st = step(m->g, startst, stopst, st, flagch, st); |
||
671 | SP("boleol", st, c); |
||
672 | } |
||
673 | |||
674 | /* how about a word boundary? */ |
||
675 | if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && |
||
676 | (c != OUT && ISWORD(c)) ) { |
||
677 | flagch = BOW; |
||
678 | } |
||
679 | if ( (lastc != OUT && ISWORD(lastc)) && |
||
680 | (flagch == EOL || (c != OUT && !ISWORD(c))) ) { |
||
681 | flagch = EOW; |
||
682 | } |
||
683 | if (flagch == BOW || flagch == EOW) { |
||
684 | st = step(m->g, startst, stopst, st, flagch, st); |
||
685 | SP("boweow", st, c); |
||
686 | } |
||
687 | |||
688 | /* are we done? */ |
||
689 | if (ISSET(st, stopst) || p == stop) |
||
690 | break; /* NOTE BREAK OUT */ |
||
691 | |||
692 | /* no, we must deal with this character */ |
||
693 | ASSIGN(tmp, st); |
||
694 | ASSIGN(st, fresh); |
||
695 | assert(c != OUT); |
||
696 | st = step(m->g, startst, stopst, tmp, c, st); |
||
697 | SP("aft", st, c); |
||
698 | assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); |
||
699 | p++; |
||
700 | } |
||
701 | |||
702 | assert(coldp != NULL); |
||
703 | m->coldp = coldp; |
||
704 | if (ISSET(st, stopst)) |
||
705 | return(p+1); |
||
706 | else |
||
707 | return(NULL); |
||
708 | } |
||
709 | |||
710 | /* |
||
711 | - slow - step through the string more deliberately |
||
712 | == static char *slow(register struct match *m, char *start, \ |
||
713 | == char *stop, sopno startst, sopno stopst); |
||
714 | */ |
||
715 | static char * /* where it ended */ |
||
716 | slow(m, start, stop, startst, stopst) |
||
717 | register struct match *m; |
||
718 | char *start; |
||
719 | char *stop; |
||
720 | sopno startst; |
||
721 | sopno stopst; |
||
722 | { |
||
723 | register states st = m->st; |
||
724 | register states empty = m->empty; |
||
725 | register states tmp = m->tmp; |
||
726 | register char *p = start; |
||
727 | register int c = (start == m->beginp) ? OUT : *(start-1); |
||
728 | register int lastc; /* previous c */ |
||
729 | register int flagch; |
||
730 | register int i; |
||
731 | register char *matchp; /* last p at which a match ended */ |
||
732 | |||
733 | AT("slow", start, stop, startst, stopst); |
||
734 | CLEAR(st); |
||
735 | SET1(st, startst); |
||
736 | SP("sstart", st, *p); |
||
737 | st = step(m->g, startst, stopst, st, NOTHING, st); |
||
738 | matchp = NULL; |
||
739 | for (;;) { |
||
740 | /* next character */ |
||
741 | lastc = c; |
||
742 | c = (p == m->endp) ? OUT : *p; |
||
743 | |||
744 | /* is there an EOL and/or BOL between lastc and c? */ |
||
745 | flagch = '\0'; |
||
746 | i = 0; |
||
747 | if ( (lastc == '\n' && m->g->cflags®_NEWLINE) || |
||
748 | (lastc == OUT && !(m->eflags®_NOTBOL)) ) { |
||
749 | flagch = BOL; |
||
750 | i = m->g->nbol; |
||
751 | } |
||
752 | if ( (c == '\n' && m->g->cflags®_NEWLINE) || |
||
753 | (c == OUT && !(m->eflags®_NOTEOL)) ) { |
||
754 | flagch = (flagch == BOL) ? BOLEOL : EOL; |
||
755 | i += m->g->neol; |
||
756 | } |
||
757 | if (i != 0) { |
||
758 | for (; i > 0; i--) |
||
759 | st = step(m->g, startst, stopst, st, flagch, st); |
||
760 | SP("sboleol", st, c); |
||
761 | } |
||
762 | |||
763 | /* how about a word boundary? */ |
||
764 | if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) && |
||
765 | (c != OUT && ISWORD(c)) ) { |
||
766 | flagch = BOW; |
||
767 | } |
||
768 | if ( (lastc != OUT && ISWORD(lastc)) && |
||
769 | (flagch == EOL || (c != OUT && !ISWORD(c))) ) { |
||
770 | flagch = EOW; |
||
771 | } |
||
772 | if (flagch == BOW || flagch == EOW) { |
||
773 | st = step(m->g, startst, stopst, st, flagch, st); |
||
774 | SP("sboweow", st, c); |
||
775 | } |
||
776 | |||
777 | /* are we done? */ |
||
778 | if (ISSET(st, stopst)) |
||
779 | matchp = p; |
||
780 | if (EQ(st, empty) || p == stop) |
||
781 | break; /* NOTE BREAK OUT */ |
||
782 | |||
783 | /* no, we must deal with this character */ |
||
784 | ASSIGN(tmp, st); |
||
785 | ASSIGN(st, empty); |
||
786 | assert(c != OUT); |
||
787 | st = step(m->g, startst, stopst, tmp, c, st); |
||
788 | SP("saft", st, c); |
||
789 | assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st)); |
||
790 | p++; |
||
791 | } |
||
792 | |||
793 | return(matchp); |
||
794 | } |
||
795 | |||
796 | |||
797 | /* |
||
798 | - step - map set of states reachable before char to set reachable after |
||
799 | == static states step(register struct re_guts *g, sopno start, sopno stop, \ |
||
800 | == register states bef, int ch, register states aft); |
||
801 | == #define BOL (OUT+1) |
||
802 | == #define EOL (BOL+1) |
||
803 | == #define BOLEOL (BOL+2) |
||
804 | == #define NOTHING (BOL+3) |
||
805 | == #define BOW (BOL+4) |
||
806 | == #define EOW (BOL+5) |
||
807 | == #define CODEMAX (BOL+5) // highest code used |
||
808 | == #define NONCHAR(c) ((c) > CHAR_MAX) |
||
809 | == #define NNONCHAR (CODEMAX-CHAR_MAX) |
||
810 | */ |
||
811 | static states |
||
812 | step(g, start, stop, bef, ch, aft) |
||
813 | register struct re_guts *g; |
||
814 | sopno start; /* start state within strip */ |
||
815 | sopno stop; /* state after stop state within strip */ |
||
816 | register states bef; /* states reachable before */ |
||
817 | int ch; /* character or NONCHAR code */ |
||
818 | register states aft; /* states already known reachable after */ |
||
819 | { |
||
820 | register cset *cs; |
||
821 | register sop s; |
||
822 | register sopno pc; |
||
823 | register onestate here; /* note, macros know this name */ |
||
824 | register sopno look; |
||
825 | register int i; |
||
826 | |||
827 | for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) { |
||
828 | s = g->strip[pc]; |
||
829 | switch (OP(s)) { |
||
830 | case OEND: |
||
831 | assert(pc == stop-1); |
||
832 | break; |
||
833 | case OCHAR: |
||
834 | /* only characters can match */ |
||
835 | assert(!NONCHAR(ch) || ch != (char)OPND(s)); |
||
836 | if (ch == (char)OPND(s)) |
||
837 | FWD(aft, bef, 1); |
||
838 | break; |
||
839 | case OBOL: |
||
840 | if (ch == BOL || ch == BOLEOL) |
||
841 | FWD(aft, bef, 1); |
||
842 | break; |
||
843 | case OEOL: |
||
844 | if (ch == EOL || ch == BOLEOL) |
||
845 | FWD(aft, bef, 1); |
||
846 | break; |
||
847 | case OBOW: |
||
848 | if (ch == BOW) |
||
849 | FWD(aft, bef, 1); |
||
850 | break; |
||
851 | case OEOW: |
||
852 | if (ch == EOW) |
||
853 | FWD(aft, bef, 1); |
||
854 | break; |
||
855 | case OANY: |
||
856 | if (!NONCHAR(ch)) |
||
857 | FWD(aft, bef, 1); |
||
858 | break; |
||
859 | case OANYOF: |
||
860 | cs = &g->sets[OPND(s)]; |
||
861 | if (!NONCHAR(ch) && CHIN(cs, ch)) |
||
862 | FWD(aft, bef, 1); |
||
863 | break; |
||
864 | case OBACK_: /* ignored here */ |
||
865 | case O_BACK: |
||
866 | FWD(aft, aft, 1); |
||
867 | break; |
||
868 | case OPLUS_: /* forward, this is just an empty */ |
||
869 | FWD(aft, aft, 1); |
||
870 | break; |
||
871 | case O_PLUS: /* both forward and back */ |
||
872 | FWD(aft, aft, 1); |
||
873 | i = ISSETBACK(aft, OPND(s)); |
||
874 | BACK(aft, aft, OPND(s)); |
||
875 | if (!i && ISSETBACK(aft, OPND(s))) { |
||
876 | /* oho, must reconsider loop body */ |
||
877 | pc -= OPND(s) + 1; |
||
878 | INIT(here, pc); |
||
879 | } |
||
880 | break; |
||
881 | case OQUEST_: /* two branches, both forward */ |
||
882 | FWD(aft, aft, 1); |
||
883 | FWD(aft, aft, OPND(s)); |
||
884 | break; |
||
885 | case O_QUEST: /* just an empty */ |
||
886 | FWD(aft, aft, 1); |
||
887 | break; |
||
888 | case OLPAREN: /* not significant here */ |
||
889 | case ORPAREN: |
||
890 | FWD(aft, aft, 1); |
||
891 | break; |
||
892 | case OCH_: /* mark the first two branches */ |
||
893 | FWD(aft, aft, 1); |
||
894 | assert(OP(g->strip[pc+OPND(s)]) == OOR2); |
||
895 | FWD(aft, aft, OPND(s)); |
||
896 | break; |
||
897 | case OOR1: /* done a branch, find the O_CH */ |
||
898 | if (ISSTATEIN(aft, here)) { |
||
899 | for (look = 1; |
||
900 | OP(s = g->strip[pc+look]) != O_CH; |
||
901 | look += OPND(s)) |
||
902 | assert(OP(s) == OOR2); |
||
903 | FWD(aft, aft, look); |
||
904 | } |
||
905 | break; |
||
906 | case OOR2: /* propagate OCH_'s marking */ |
||
907 | FWD(aft, aft, 1); |
||
908 | if (OP(g->strip[pc+OPND(s)]) != O_CH) { |
||
909 | assert(OP(g->strip[pc+OPND(s)]) == OOR2); |
||
910 | FWD(aft, aft, OPND(s)); |
||
911 | } |
||
912 | break; |
||
913 | case O_CH: /* just empty */ |
||
914 | FWD(aft, aft, 1); |
||
915 | break; |
||
916 | default: /* ooooops... */ |
||
917 | assert(nope); |
||
918 | break; |
||
919 | } |
||
920 | } |
||
921 | |||
922 | return(aft); |
||
923 | } |
||
924 | |||
925 | #ifdef REDEBUG |
||
926 | /* |
||
927 | - print - print a set of states |
||
928 | == #ifdef REDEBUG |
||
929 | == static void print(struct match *m, char *caption, states st, \ |
||
930 | == int ch, FILE *d); |
||
931 | == #endif |
||
932 | */ |
||
933 | static void |
||
934 | print(m, caption, st, ch, d) |
||
935 | struct match *m; |
||
936 | char *caption; |
||
937 | states st; |
||
938 | int ch; |
||
939 | FILE *d; |
||
940 | { |
||
941 | register struct re_guts *g = m->g; |
||
942 | register int i; |
||
943 | register int first = 1; |
||
944 | |||
945 | if (!(m->eflags®_TRACE)) |
||
946 | return; |
||
947 | |||
948 | fprintf(d, "%s", caption); |
||
949 | if (ch != '\0') |
||
950 | fprintf(d, " %s", pchar(ch)); |
||
951 | for (i = 0; i < g->nstates; i++) |
||
952 | if (ISSET(st, i)) { |
||
953 | fprintf(d, "%s%d", (first) ? "\t" : ", ", i); |
||
954 | first = 0; |
||
955 | } |
||
956 | fprintf(d, "\n"); |
||
957 | } |
||
958 | |||
959 | /* |
||
960 | - at - print current situation |
||
961 | == #ifdef REDEBUG |
||
962 | == static void at(struct match *m, char *title, char *start, char *stop, \ |
||
963 | == sopno startst, sopno stopst); |
||
964 | == #endif |
||
965 | */ |
||
966 | static void |
||
967 | at(m, title, start, stop, startst, stopst) |
||
968 | struct match *m; |
||
969 | char *title; |
||
970 | char *start; |
||
971 | char *stop; |
||
972 | sopno startst; |
||
973 | sopno stopst; |
||
974 | { |
||
975 | if (!(m->eflags®_TRACE)) |
||
976 | return; |
||
977 | |||
978 | printf("%s %s-", title, pchar(*start)); |
||
979 | printf("%s ", pchar(*stop)); |
||
980 | printf("%ld-%ld\n", (long)startst, (long)stopst); |
||
981 | } |
||
982 | |||
983 | #ifndef PCHARDONE |
||
984 | #define PCHARDONE /* never again */ |
||
985 | /* |
||
986 | - pchar - make a character printable |
||
987 | == #ifdef REDEBUG |
||
988 | == static char *pchar(int ch); |
||
989 | == #endif |
||
990 | * |
||
991 | * Is this identical to regchar() over in debug.c? Well, yes. But a |
||
992 | * duplicate here avoids having a debugging-capable regexec.o tied to |
||
993 | * a matching debug.o, and this is convenient. It all disappears in |
||
994 | * the non-debug compilation anyway, so it doesn't matter much. |
||
995 | */ |
||
996 | static char * /* -> representation */ |
||
997 | pchar(ch) |
||
998 | int ch; |
||
999 | { |
||
1000 | static char pbuf[10]; |
||
1001 | |||
1002 | if (isprint(ch) || ch == ' ') |
||
1003 | sprintf(pbuf, "%c", ch); |
||
1004 | else |
||
1005 | sprintf(pbuf, "\\%o", ch); |
||
1006 | return(pbuf); |
||
1007 | } |
||
1008 | #endif |
||
1009 | #endif |
||
1010 | |||
1011 | #undef matcher |
||
1012 | #undef fast |
||
1013 | #undef slow |
||
1014 | #undef dissect |
||
1015 | #undef backref |
||
1016 | #undef step |
||
1017 | #undef print |
||
1018 | #undef at |
||
1019 | #undef match>=>>=>>=>=>>>>>>>>>=>>=>>>=>>=>=>=>=>>>> |