| 1 |
/* |
|---|
| 2 |
* regcomp and regexec -- regsub and regerror are elsewhere |
|---|
| 3 |
* |
|---|
| 4 |
* Copyright (c) 1986 by University of Toronto. |
|---|
| 5 |
* Written by Henry Spencer. Not derived from licensed software. |
|---|
| 6 |
* |
|---|
| 7 |
* Permission is granted to anyone to use this software for any |
|---|
| 8 |
* purpose on any computer system, and to redistribute it freely, |
|---|
| 9 |
* subject to the following restrictions: |
|---|
| 10 |
* |
|---|
| 11 |
* 1. The author is not responsible for the consequences of use of |
|---|
| 12 |
* this software, no matter how awful, even if they arise |
|---|
| 13 |
* from defects in it. |
|---|
| 14 |
* |
|---|
| 15 |
* 2. The origin of this software must not be misrepresented, either |
|---|
| 16 |
* by explicit claim or by omission. |
|---|
| 17 |
* |
|---|
| 18 |
* 3. Altered versions must be plainly marked as such, and must not |
|---|
| 19 |
* be misrepresented as being the original software. |
|---|
| 20 |
*** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, |
|---|
| 21 |
*** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to | |
|---|
| 22 |
*** to assist in implementing egrep. |
|---|
| 23 |
*** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, |
|---|
| 24 |
*** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching |
|---|
| 25 |
*** as in BSD grep and ex. |
|---|
| 26 |
*** THIS IS AN ALTERED VERSION. It was altered by John Gilmore, |
|---|
| 27 |
*** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \. |
|---|
| 28 |
*** THIS IS AN ALTERED VERSION. It was altered by James A. Woods, |
|---|
| 29 |
*** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy. |
|---|
| 30 |
*** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald |
|---|
| 31 |
*** seiwald@vix.com, on 28 August 1993, for use in jam. Regmagic.h |
|---|
| 32 |
*** was moved into regexp.h, and the include of regexp.h now uses "'s |
|---|
| 33 |
*** to avoid conflicting with the system regexp.h. Const, bless its |
|---|
| 34 |
*** soul, was removed so it can compile everywhere. The declaration |
|---|
| 35 |
*** of strchr() was in conflict on AIX, so it was removed (as it is |
|---|
| 36 |
*** happily defined in string.h). |
|---|
| 37 |
*** THIS IS AN ALTERED VERSION. It was altered by Christopher Seiwald |
|---|
| 38 |
*** seiwald@perforce.com, on 20 January 2000, to use function prototypes. |
|---|
| 39 |
* |
|---|
| 40 |
* Beware that some of this code is subtly aware of the way operator |
|---|
| 41 |
* precedence is structured in regular expressions. Serious changes in |
|---|
| 42 |
* regular-expression syntax might require a total rethink. |
|---|
| 43 |
*/ |
|---|
| 44 |
#include "regexp.h" |
|---|
| 45 |
#include <stdio.h> |
|---|
| 46 |
#include <ctype.h> |
|---|
| 47 |
#ifndef ultrix |
|---|
| 48 |
#include <stdlib.h> |
|---|
| 49 |
#endif |
|---|
| 50 |
#include <string.h> |
|---|
| 51 |
|
|---|
| 52 |
/* |
|---|
| 53 |
* The "internal use only" fields in regexp.h are present to pass info from |
|---|
| 54 |
* compile to execute that permits the execute phase to run lots faster on |
|---|
| 55 |
* simple cases. They are: |
|---|
| 56 |
* |
|---|
| 57 |
* regstart char that must begin a match; '\0' if none obvious |
|---|
| 58 |
* reganch is the match anchored (at beginning-of-line only)? |
|---|
| 59 |
* regmust string (pointer into program) that match must include, or NULL |
|---|
| 60 |
* regmlen length of regmust string |
|---|
| 61 |
* |
|---|
| 62 |
* Regstart and reganch permit very fast decisions on suitable starting points |
|---|
| 63 |
* for a match, cutting down the work a lot. Regmust permits fast rejection |
|---|
| 64 |
* of lines that cannot possibly match. The regmust tests are costly enough |
|---|
| 65 |
* that regcomp() supplies a regmust only if the r.e. contains something |
|---|
| 66 |
* potentially expensive (at present, the only such thing detected is * or + |
|---|
| 67 |
* at the start of the r.e., which can involve a lot of backup). Regmlen is |
|---|
| 68 |
* supplied because the test in regexec() needs it and regcomp() is computing |
|---|
| 69 |
* it anyway. |
|---|
| 70 |
*/ |
|---|
| 71 |
|
|---|
| 72 |
/* |
|---|
| 73 |
* Structure for regexp "program". This is essentially a linear encoding |
|---|
| 74 |
* of a nondeterministic finite-state machine (aka syntax charts or |
|---|
| 75 |
* "railroad normal form" in parsing technology). Each node is an opcode |
|---|
| 76 |
* plus a "next" pointer, possibly plus an operand. "Next" pointers of |
|---|
| 77 |
* all nodes except BRANCH implement concatenation; a "next" pointer with |
|---|
| 78 |
* a BRANCH on both ends of it is connecting two alternatives. (Here we |
|---|
| 79 |
* have one of the subtle syntax dependencies: an individual BRANCH (as |
|---|
| 80 |
* opposed to a collection of them) is never concatenated with anything |
|---|
| 81 |
* because of operator precedence.) The operand of some types of node is |
|---|
| 82 |
* a literal string; for others, it is a node leading into a sub-FSM. In |
|---|
| 83 |
* particular, the operand of a BRANCH node is the first node of the branch. |
|---|
| 84 |
* (NB this is *not* a tree structure: the tail of the branch connects |
|---|
| 85 |
* to the thing following the set of BRANCHes.) The opcodes are: |
|---|
| 86 |
*/ |
|---|
| 87 |
|
|---|
| 88 |
/* definition number opnd? meaning */ |
|---|
| 89 |
#define END 0 /* no End of program. */ |
|---|
| 90 |
#define BOL 1 /* no Match "" at beginning of line. */ |
|---|
| 91 |
#define EOL 2 /* no Match "" at end of line. */ |
|---|
| 92 |
#define ANY 3 /* no Match any one character. */ |
|---|
| 93 |
#define ANYOF 4 /* str Match any character in this string. */ |
|---|
| 94 |
#define ANYBUT 5 /* str Match any character not in this string. */ |
|---|
| 95 |
#define BRANCH 6 /* node Match this alternative, or the next... */ |
|---|
| 96 |
#define BACK 7 /* no Match "", "next" ptr points backward. */ |
|---|
| 97 |
#define EXACTLY 8 /* str Match this string. */ |
|---|
| 98 |
#define NOTHING 9 /* no Match empty string. */ |
|---|
| 99 |
#define STAR 10 /* node Match this (simple) thing 0 or more times. */ |
|---|
| 100 |
#define PLUS 11 /* node Match this (simple) thing 1 or more times. */ |
|---|
| 101 |
#define WORDA 12 /* no Match "" at wordchar, where prev is nonword */ |
|---|
| 102 |
#define WORDZ 13 /* no Match "" at nonwordchar, where prev is word */ |
|---|
| 103 |
#define OPEN 20 /* no Mark this point in input as start of #n. */ |
|---|
| 104 |
/* OPEN+1 is number 1, etc. */ |
|---|
| 105 |
#define CLOSE 30 /* no Analogous to OPEN. */ |
|---|
| 106 |
|
|---|
| 107 |
/* |
|---|
| 108 |
* Opcode notes: |
|---|
| 109 |
* |
|---|
| 110 |
* BRANCH The set of branches constituting a single choice are hooked |
|---|
| 111 |
* together with their "next" pointers, since precedence prevents |
|---|
| 112 |
* anything being concatenated to any individual branch. The |
|---|
| 113 |
* "next" pointer of the last BRANCH in a choice points to the |
|---|
| 114 |
* thing following the whole choice. This is also where the |
|---|
| 115 |
* final "next" pointer of each individual branch points; each |
|---|
| 116 |
* branch starts with the operand node of a BRANCH node. |
|---|
| 117 |
* |
|---|
| 118 |
* BACK Normal "next" pointers all implicitly point forward; BACK |
|---|
| 119 |
* exists to make loop structures possible. |
|---|
| 120 |
* |
|---|
| 121 |
* STAR,PLUS '?', and complex '*' and '+', are implemented as circular |
|---|
| 122 |
* BRANCH structures using BACK. Simple cases (one character |
|---|
| 123 |
* per match) are implemented with STAR and PLUS for speed |
|---|
| 124 |
* and to minimize recursive plunges. |
|---|
| 125 |
* |
|---|
| 126 |
* OPEN,CLOSE ...are numbered at compile time. |
|---|
| 127 |
*/ |
|---|
| 128 |
|
|---|
| 129 |
/* |
|---|
| 130 |
* A node is one char of opcode followed by two chars of "next" pointer. |
|---|
| 131 |
* "Next" pointers are stored as two 8-bit pieces, high order first. The |
|---|
| 132 |
* value is a positive offset from the opcode of the node containing it. |
|---|
| 133 |
* An operand, if any, simply follows the node. (Note that much of the |
|---|
| 134 |
* code generation knows about this implicit relationship.) |
|---|
| 135 |
* |
|---|
| 136 |
* Using two bytes for the "next" pointer is vast overkill for most things, |
|---|
| 137 |
* but allows patterns to get big without disasters. |
|---|
| 138 |
*/ |
|---|
| 139 |
#define OP(p) (*(p)) |
|---|
| 140 |
#define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377)) |
|---|
| 141 |
#define OPERAND(p) ((p) + 3) |
|---|
| 142 |
|
|---|
| 143 |
/* |
|---|
| 144 |
* See regmagic.h for one further detail of program structure. |
|---|
| 145 |
*/ |
|---|
| 146 |
|
|---|
| 147 |
|
|---|
| 148 |
/* |
|---|
| 149 |
* Utility definitions. |
|---|
| 150 |
*/ |
|---|
| 151 |
#ifndef CHARBITS |
|---|
| 152 |
#define UCHARAT(p) ((int)*(unsigned char *)(p)) |
|---|
| 153 |
#else |
|---|
| 154 |
#define UCHARAT(p) ((int)*(p)&CHARBITS) |
|---|
| 155 |
#endif |
|---|
| 156 |
|
|---|
| 157 |
#define FAIL(m) { regerror(m); return(NULL); } |
|---|
| 158 |
#define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?') |
|---|
| 159 |
|
|---|
| 160 |
/* |
|---|
| 161 |
* Flags to be passed up and down. |
|---|
| 162 |
*/ |
|---|
| 163 |
#define HASWIDTH 01 /* Known never to match null string. */ |
|---|
| 164 |
#define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */ |
|---|
| 165 |
#define SPSTART 04 /* Starts with * or +. */ |
|---|
| 166 |
#define WORST 0 /* Worst case. */ |
|---|
| 167 |
|
|---|
| 168 |
/* |
|---|
| 169 |
* Global work variables for regcomp(). |
|---|
| 170 |
*/ |
|---|
| 171 |
static char *regparse; /* Input-scan pointer. */ |
|---|
| 172 |
static int regnpar; /* () count. */ |
|---|
| 173 |
static char regdummy; |
|---|
| 174 |
static char *regcode; /* Code-emit pointer; ®dummy = don't. */ |
|---|
| 175 |
static long regsize; /* Code size. */ |
|---|
| 176 |
|
|---|
| 177 |
/* |
|---|
| 178 |
* Forward declarations for regcomp()'s friends. |
|---|
| 179 |
*/ |
|---|
| 180 |
#ifndef STATIC |
|---|
| 181 |
#define STATIC static |
|---|
| 182 |
#endif |
|---|
| 183 |
STATIC char *reg( int paren, int *flagp ); |
|---|
| 184 |
STATIC char *regbranch( int *flagp ); |
|---|
| 185 |
STATIC char *regpiece( int *flagp ); |
|---|
| 186 |
STATIC char *regatom( int *flagp ); |
|---|
| 187 |
STATIC char *regnode( int op ); |
|---|
| 188 |
STATIC char *regnext( register char *p ); |
|---|
| 189 |
STATIC void regc( int b ); |
|---|
| 190 |
STATIC void reginsert( char op, char *opnd ); |
|---|
| 191 |
STATIC void regtail( char *p, char *val ); |
|---|
| 192 |
STATIC void regoptail( char *p, char *val ); |
|---|
| 193 |
#ifdef STRCSPN |
|---|
| 194 |
STATIC int strcspn(); |
|---|
| 195 |
#endif |
|---|
| 196 |
|
|---|
| 197 |
/* |
|---|
| 198 |
- regcomp - compile a regular expression into internal code |
|---|
| 199 |
* |
|---|
| 200 |
* We can't allocate space until we know how big the compiled form will be, |
|---|
| 201 |
* but we can't compile it (and thus know how big it is) until we've got a |
|---|
| 202 |
* place to put the code. So we cheat: we compile it twice, once with code |
|---|
| 203 |
* generation turned off and size counting turned on, and once "for real". |
|---|
| 204 |
* This also means that we don't allocate space until we are sure that the |
|---|
| 205 |
* thing really will compile successfully, and we never have to move the |
|---|
| 206 |
* code and thus invalidate pointers into it. (Note that it has to be in |
|---|
| 207 |
* one piece because free() must be able to free it all.) |
|---|
| 208 |
* |
|---|
| 209 |
* Beware that the optimization-preparation code in here knows about some |
|---|
| 210 |
* of the structure of the compiled regexp. |
|---|
| 211 |
*/ |
|---|
| 212 |
regexp * |
|---|
| 213 |
regcomp( char *exp ) |
|---|
| 214 |
{ |
|---|
| 215 |
register regexp *r; |
|---|
| 216 |
register char *scan; |
|---|
| 217 |
register char *longest; |
|---|
| 218 |
register unsigned len; |
|---|
| 219 |
int flags; |
|---|
| 220 |
|
|---|
| 221 |
if (exp == NULL) |
|---|
| 222 |
FAIL("NULL argument"); |
|---|
| 223 |
|
|---|
| 224 |
/* First pass: determine size, legality. */ |
|---|
| 225 |
#ifdef notdef |
|---|
| 226 |
if (exp[0] == '.' && exp[1] == '*') exp += 2; /* aid grep */ |
|---|
| 227 |
#endif |
|---|
| 228 |
regparse = (char *)exp; |
|---|
| 229 |
regnpar = 1; |
|---|
| 230 |
regsize = 0L; |
|---|
| 231 |
regcode = ®dummy; |
|---|
| 232 |
regc(MAGIC); |
|---|
| 233 |
if (reg(0, &flags) == NULL) |
|---|
| 234 |
return(NULL); |
|---|
| 235 |
|
|---|
| 236 |
/* Small enough for pointer-storage convention? */ |
|---|
| 237 |
if (regsize >= 32767L) /* Probably could be 65535L. */ |
|---|
| 238 |
FAIL("regexp too big"); |
|---|
| 239 |
|
|---|
| 240 |
/* Allocate space. */ |
|---|
| 241 |
r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize); |
|---|
| 242 |
if (r == NULL) |
|---|
| 243 |
FAIL("out of space"); |
|---|
| 244 |
|
|---|
| 245 |
/* Second pass: emit code. */ |
|---|
| 246 |
regparse = (char *)exp; |
|---|
| 247 |
regnpar = 1; |
|---|
| 248 |
regcode = r->program; |
|---|
| 249 |
regc(MAGIC); |
|---|
| 250 |
if (reg(0, &flags) == NULL) |
|---|
| 251 |
return(NULL); |
|---|
| 252 |
|
|---|
| 253 |
/* Dig out information for optimizations. */ |
|---|
| 254 |
r->regstart = '\0'; /* Worst-case defaults. */ |
|---|
| 255 |
r->reganch = 0; |
|---|
| 256 |
r->regmust = NULL; |
|---|
| 257 |
r->regmlen = 0; |
|---|
| 258 |
scan = r->program+1; /* First BRANCH. */ |
|---|
| 259 |
if (OP(regnext(scan)) == END) { /* Only one top-level choice. */ |
|---|
| 260 |
scan = OPERAND(scan); |
|---|
| 261 |
|
|---|
| 262 |
/* Starting-point info. */ |
|---|
| 263 |
if (OP(scan) == EXACTLY) |
|---|
| 264 |
r->regstart = *OPERAND(scan); |
|---|
| 265 |
else if (OP(scan) == BOL) |
|---|
| 266 |
r->reganch++; |
|---|
| 267 |
|
|---|
| 268 |
/* |
|---|
| 269 |
* If there's something expensive in the r.e., find the |
|---|
| 270 |
* longest literal string that must appear and make it the |
|---|
| 271 |
* regmust. Resolve ties in favor of later strings, since |
|---|
| 272 |
* the regstart check works with the beginning of the r.e. |
|---|
| 273 |
* and avoiding duplication strengthens checking. Not a |
|---|
| 274 |
* strong reason, but sufficient in the absence of others. |
|---|
| 275 |
*/ |
|---|
| 276 |
if (flags&SPSTART) { |
|---|
| 277 |
longest = NULL; |
|---|
| 278 |
len = 0; |
|---|
| 279 |
for (; scan != NULL; scan = regnext(scan)) |
|---|
| 280 |
if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) { |
|---|
| 281 |
longest = OPERAND(scan); |
|---|
| 282 |
len = strlen(OPERAND(scan)); |
|---|
| 283 |
} |
|---|
| 284 |
r->regmust = longest; |
|---|
| 285 |
r->regmlen = len; |
|---|
| 286 |
} |
|---|
| 287 |
} |
|---|
| 288 |
|
|---|
| 289 |
return(r); |
|---|
| 290 |
} |
|---|
| 291 |
|
|---|
| 292 |
/* |
|---|
| 293 |
- reg - regular expression, i.e. main body or parenthesized thing |
|---|
| 294 |
* |
|---|
| 295 |
* Caller must absorb opening parenthesis. |
|---|
| 296 |
* |
|---|
| 297 |
* Combining parenthesis handling with the base level of regular expression |
|---|
| 298 |
* is a trifle forced, but the need to tie the tails of the branches to what |
|---|
| 299 |
* follows makes it hard to avoid. |
|---|
| 300 |
*/ |
|---|
| 301 |
static char * |
|---|
| 302 |
reg( |
|---|
| 303 |
int paren, /* Parenthesized? */ |
|---|
| 304 |
int *flagp ) |
|---|
| 305 |
{ |
|---|
| 306 |
register char *ret; |
|---|
| 307 |
register char *br; |
|---|
| 308 |
register char *ender; |
|---|
| 309 |
register int parno = 0; |
|---|
| 310 |
int flags; |
|---|
| 311 |
|
|---|
| 312 |
*flagp = HASWIDTH; /* Tentatively. */ |
|---|
| 313 |
|
|---|
| 314 |
/* Make an OPEN node, if parenthesized. */ |
|---|
| 315 |
if (paren) { |
|---|
| 316 |
if (regnpar >= NSUBEXP) |
|---|
| 317 |
FAIL("too many ()"); |
|---|
| 318 |
parno = regnpar; |
|---|
| 319 |
regnpar++; |
|---|
| 320 |
ret = regnode(OPEN+parno); |
|---|
| 321 |
} else |
|---|
| 322 |
ret = NULL; |
|---|
| 323 |
|
|---|
| 324 |
/* Pick up the branches, linking them together. */ |
|---|
| 325 |
br = regbranch(&flags); |
|---|
| 326 |
if (br == NULL) |
|---|
| 327 |
return(NULL); |
|---|
| 328 |
if (ret != NULL) |
|---|
| 329 |
regtail(ret, br); /* OPEN -> first. */ |
|---|
| 330 |
else |
|---|
| 331 |
ret = br; |
|---|
| 332 |
if (!(flags&HASWIDTH)) |
|---|
| 333 |
*flagp &= ~HASWIDTH; |
|---|
| 334 |
*flagp |= flags&SPSTART; |
|---|
| 335 |
while (*regparse == '|' || *regparse == '\n') { |
|---|
| 336 |
regparse++; |
|---|
| 337 |
br = regbranch(&flags); |
|---|
| 338 |
if (br == NULL) |
|---|
| 339 |
return(NULL); |
|---|
| 340 |
regtail(ret, br); /* BRANCH -> BRANCH. */ |
|---|
| 341 |
if (!(flags&HASWIDTH)) |
|---|
| 342 |
*flagp &= ~HASWIDTH; |
|---|
| 343 |
*flagp |= flags&SPSTART; |
|---|
| 344 |
} |
|---|
| 345 |
|
|---|
| 346 |
/* Make a closing node, and hook it on the end. */ |
|---|
| 347 |
ender = regnode((paren) ? CLOSE+parno : END); |
|---|
| 348 |
regtail(ret, ender); |
|---|
| 349 |
|
|---|
| 350 |
/* Hook the tails of the branches to the closing node. */ |
|---|
| 351 |
for (br = ret; br != NULL; br = regnext(br)) |
|---|
| 352 |
regoptail(br, ender); |
|---|
| 353 |
|
|---|
| 354 |
/* Check for proper termination. */ |
|---|
| 355 |
if (paren && *regparse++ != ')') { |
|---|
| 356 |
FAIL("unmatched ()"); |
|---|
| 357 |
} else if (!paren && *regparse != '\0') { |
|---|
| 358 |
if (*regparse == ')') { |
|---|
| 359 |
FAIL("unmatched ()"); |
|---|
| 360 |
} else |
|---|
| 361 |
FAIL("junk on end"); /* "Can't happen". */ |
|---|
| 362 |
/* NOTREACHED */ |
|---|
| 363 |
} |
|---|
| 364 |
|
|---|
| 365 |
return(ret); |
|---|
| 366 |
} |
|---|
| 367 |
|
|---|
| 368 |
/* |
|---|
| 369 |
- regbranch - one alternative of an | operator |
|---|
| 370 |
* |
|---|
| 371 |
* Implements the concatenation operator. |
|---|
| 372 |
*/ |
|---|
| 373 |
static char * |
|---|
| 374 |
regbranch( int *flagp ) |
|---|
| 375 |
{ |
|---|
| 376 |
register char *ret; |
|---|
| 377 |
register char *chain; |
|---|
| 378 |
register char *latest; |
|---|
| 379 |
int flags; |
|---|
| 380 |
|
|---|
| 381 |
*flagp = WORST; /* Tentatively. */ |
|---|
| 382 |
|
|---|
| 383 |
ret = regnode(BRANCH); |
|---|
| 384 |
chain = NULL; |
|---|
| 385 |
while (*regparse != '\0' && *regparse != ')' && |
|---|
| 386 |
*regparse != '\n' && *regparse != '|') { |
|---|
| 387 |
latest = regpiece(&flags); |
|---|
| 388 |
if (latest == NULL) |
|---|
| 389 |
return(NULL); |
|---|
| 390 |
*flagp |= flags&HASWIDTH; |
|---|
| 391 |
if (chain == NULL) /* First piece. */ |
|---|
| 392 |
*flagp |= flags&SPSTART; |
|---|
| 393 |
else |
|---|
| 394 |
regtail(chain, latest); |
|---|
| 395 |
chain = latest; |
|---|
| 396 |
} |
|---|
| 397 |
if (chain == NULL) /* Loop ran zero times. */ |
|---|
| 398 |
(void) regnode(NOTHING); |
|---|
| 399 |
|
|---|
| 400 |
return(ret); |
|---|
| 401 |
} |
|---|
| 402 |
|
|---|
| 403 |
/* |
|---|
| 404 |
- regpiece - something followed by possible [*+?] |
|---|
| 405 |
* |
|---|
| 406 |
* Note that the branching code sequences used for ? and the general cases |
|---|
| 407 |
* of * and + are somewhat optimized: they use the same NOTHING node as |
|---|
| 408 |
* both the endmarker for their branch list and the body of the last branch. |
|---|
| 409 |
* It might seem that this node could be dispensed with entirely, but the |
|---|
| 410 |
* endmarker role is not redundant. |
|---|
| 411 |
*/ |
|---|
| 412 |
static char * |
|---|
| 413 |
regpiece( int *flagp ) |
|---|
| 414 |
{ |
|---|
| 415 |
register char *ret; |
|---|
| 416 |
register char op; |
|---|
| 417 |
register char *next; |
|---|
| 418 |
int flags; |
|---|
| 419 |
|
|---|
| 420 |
ret = regatom(&flags); |
|---|
| 421 |
if (ret == NULL) |
|---|
| 422 |
return(NULL); |
|---|
| 423 |
|
|---|
| 424 |
op = *regparse; |
|---|
| 425 |
if (!ISMULT(op)) { |
|---|
| 426 |
*flagp = flags; |
|---|
| 427 |
return(ret); |
|---|
| 428 |
} |
|---|
| 429 |
|
|---|
| 430 |
if (!(flags&HASWIDTH) && op != '?') |
|---|
| 431 |
FAIL("*+ operand could be empty"); |
|---|
| 432 |
*flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH); |
|---|
| 433 |
|
|---|
| 434 |
if (op == '*' && (flags&SIMPLE)) |
|---|
| 435 |
reginsert(STAR, ret); |
|---|
| 436 |
else if (op == '*') { |
|---|
| 437 |
/* Emit x* as (x&|), where & means "self". */ |
|---|
| 438 |
reginsert(BRANCH, ret); /* Either x */ |
|---|
| 439 |
regoptail(ret, regnode(BACK)); /* and loop */ |
|---|
| 440 |
regoptail(ret, ret); /* back */ |
|---|
| 441 |
regtail(ret, regnode(BRANCH)); /* or */ |
|---|
| 442 |
regtail(ret, regnode(NOTHING)); /* null. */ |
|---|
| 443 |
} else if (op == '+' && (flags&SIMPLE)) |
|---|
| 444 |
reginsert(PLUS, ret); |
|---|
| 445 |
else if (op == '+') { |
|---|
| 446 |
/* Emit x+ as x(&|), where & means "self". */ |
|---|
| 447 |
next = regnode(BRANCH); /* Either */ |
|---|
| 448 |
regtail(ret, next); |
|---|
| 449 |
regtail(regnode(BACK), ret); /* loop back */ |
|---|
| 450 |
regtail(next, regnode(BRANCH)); /* or */ |
|---|
| 451 |
regtail(ret, regnode(NOTHING)); /* null. */ |
|---|
| 452 |
} else if (op == '?') { |
|---|
| 453 |
/* Emit x? as (x|) */ |
|---|
| 454 |
reginsert(BRANCH, ret); /* Either x */ |
|---|
| 455 |
regtail(ret, regnode(BRANCH)); /* or */ |
|---|
| 456 |
next = regnode(NOTHING); /* null. */ |
|---|
| 457 |
regtail(ret, next); |
|---|
| 458 |
regoptail(ret, next); |
|---|
| 459 |
} |
|---|
| 460 |
regparse++; |
|---|
| 461 |
if (ISMULT(*regparse)) |
|---|
| 462 |
FAIL("nested *?+"); |
|---|
| 463 |
|
|---|
| 464 |
return(ret); |
|---|
| 465 |
} |
|---|
| 466 |
|
|---|
| 467 |
/* |
|---|
| 468 |
- regatom - the lowest level |
|---|
| 469 |
* |
|---|
| 470 |
* Optimization: gobbles an entire sequence of ordinary characters so that |
|---|
| 471 |
* it can turn them into a single node, which is smaller to store and |
|---|
| 472 |
* faster to run. Backslashed characters are exceptions, each becoming a |
|---|
| 473 |
* separate node; the code is simpler that way and it's not worth fixing. |
|---|
| 474 |
*/ |
|---|
| 475 |
static char * |
|---|
| 476 |
regatom( int *flagp ) |
|---|
| 477 |
{ |
|---|
| 478 |
register char *ret; |
|---|
| 479 |
int flags; |
|---|
| 480 |
|
|---|
| 481 |
*flagp = WORST; /* Tentatively. */ |
|---|
| 482 |
|
|---|
| 483 |
switch (*regparse++) { |
|---|
| 484 |
/* FIXME: these chars only have meaning at beg/end of pat? */ |
|---|
| 485 |
case '^': |
|---|
| 486 |
ret = regnode(BOL); |
|---|
| 487 |
break; |
|---|
| 488 |
case '$': |
|---|
| 489 |
ret = regnode(EOL); |
|---|
| 490 |
break; |
|---|
| 491 |
case '.': |
|---|
| 492 |
ret = regnode(ANY); |
|---|
| 493 |
*flagp |= HASWIDTH|SIMPLE; |
|---|
| 494 |
break; |
|---|
| 495 |
case '[': { |
|---|
| 496 |
register int classr; |
|---|
| 497 |
register int classend; |
|---|
| 498 |
|
|---|
| 499 |
if (*regparse == '^') { /* Complement of range. */ |
|---|
| 500 |
ret = regnode(ANYBUT); |
|---|
| 501 |
regparse++; |
|---|
| 502 |
} else |
|---|
| 503 |
ret = regnode(ANYOF); |
|---|
| 504 |
if (*regparse == ']' || *regparse == '-') |
|---|
| 505 |
regc(*regparse++); |
|---|
| 506 |
while (*regparse != '\0' && *regparse != ']') { |
|---|
| 507 |
if (*regparse == '-') { |
|---|
| 508 |
regparse++; |
|---|
| 509 |
if (*regparse == ']' || *regparse == '\0') |
|---|
| 510 |
regc('-'); |
|---|
| 511 |
else { |
|---|
| 512 |
classr = UCHARAT(regparse-2)+1; |
|---|
| 513 |
classend = UCHARAT(regparse); |
|---|
| 514 |
if (classr > classend+1) |
|---|
| 515 |
FAIL("invalid [] range"); |
|---|
| 516 |
for (; classr <= classend; classr++) |
|---|
| 517 |
regc(classr); |
|---|
| 518 |
regparse++; |
|---|
| 519 |
} |
|---|
| 520 |
} else |
|---|
| 521 |
regc(*regparse++); |
|---|
| 522 |
} |
|---|
| 523 |
regc('\0'); |
|---|
| 524 |
if (*regparse != ']') |
|---|
| 525 |
FAIL("unmatched []"); |
|---|
| 526 |
regparse++; |
|---|
| 527 |
*flagp |= HASWIDTH|SIMPLE; |
|---|
| 528 |
} |
|---|
| 529 |
break; |
|---|
| 530 |
case '(': |
|---|
| 531 |
ret = reg(1, &flags); |
|---|
| 532 |
if (ret == NULL) |
|---|
| 533 |
return(NULL); |
|---|
| 534 |
*flagp |= flags&(HASWIDTH|SPSTART); |
|---|
| 535 |
break; |
|---|
| 536 |
case '\0': |
|---|
| 537 |
case '|': |
|---|
| 538 |
case '\n': |
|---|
| 539 |
case ')': |
|---|
| 540 |
FAIL("internal urp"); /* Supposed to be caught earlier. */ |
|---|
| 541 |
break; |
|---|
| 542 |
case '?': |
|---|
| 543 |
case '+': |
|---|
| 544 |
case '*': |
|---|
| 545 |
FAIL("?+* follows nothing"); |
|---|
| 546 |
break; |
|---|
| 547 |
case '\\': |
|---|
| 548 |
switch (*regparse++) { |
|---|
| 549 |
case '\0': |
|---|
| 550 |
FAIL("trailing \\"); |
|---|
| 551 |
break; |
|---|
| 552 |
case '<': |
|---|
| 553 |
ret = regnode(WORDA); |
|---|
| 554 |
break; |
|---|
| 555 |
case '>': |
|---|
| 556 |
ret = regnode(WORDZ); |
|---|
| 557 |
break; |
|---|
| 558 |
/* FIXME: Someday handle \1, \2, ... */ |
|---|
| 559 |
default: |
|---|
| 560 |
/* Handle general quoted chars in exact-match routine */ |
|---|
| 561 |
goto de_fault; |
|---|
| 562 |
} |
|---|
| 563 |
break; |
|---|
| 564 |
de_fault: |
|---|
| 565 |
default: |
|---|
| 566 |
/* |
|---|
| 567 |
* Encode a string of characters to be matched exactly. |
|---|
| 568 |
* |
|---|
| 569 |
* This is a bit tricky due to quoted chars and due to |
|---|
| 570 |
* '*', '+', and '?' taking the SINGLE char previous |
|---|
| 571 |
* as their operand. |
|---|
| 572 |
* |
|---|
| 573 |
* On entry, the char at regparse[-1] is going to go |
|---|
| 574 |
* into the string, no matter what it is. (It could be |
|---|
| 575 |
* following a \ if we are entered from the '\' case.) |
|---|
| 576 |
* |
|---|
| 577 |
* Basic idea is to pick up a good char in ch and |
|---|
| 578 |
* examine the next char. If it's *+? then we twiddle. |
|---|
| 579 |
* If it's \ then we frozzle. If it's other magic char |
|---|
| 580 |
* we push ch and terminate the string. If none of the |
|---|
| 581 |
* above, we push ch on the string and go around again. |
|---|
| 582 |
* |
|---|
| 583 |
* regprev is used to remember where "the current char" |
|---|
| 584 |
* starts in the string, if due to a *+? we need to back |
|---|
| 585 |
* up and put the current char in a separate, 1-char, string. |
|---|
| 586 |
* When regprev is NULL, ch is the only char in the |
|---|
| 587 |
* string; this is used in *+? handling, and in setting |
|---|
| 588 |
* flags |= SIMPLE at the end. |
|---|
| 589 |
*/ |
|---|
| 590 |
{ |
|---|
| 591 |
char *regprev; |
|---|
| 592 |
register char ch; |
|---|
| 593 |
|
|---|
| 594 |
regparse--; /* Look at cur char */ |
|---|
| 595 |
ret = regnode(EXACTLY); |
|---|
| 596 |
for ( regprev = 0 ; ; ) { |
|---|
| 597 |
ch = *regparse++; /* Get current char */ |
|---|
| 598 |
switch (*regparse) { /* look at next one */ |
|---|
| 599 |
|
|---|
| 600 |
default: |
|---|
| 601 |
regc(ch); /* Add cur to string */ |
|---|
| 602 |
break; |
|---|
| 603 |
|
|---|
| 604 |
case '.': case '[': case '(': |
|---|
| 605 |
case ')': case '|': case '\n': |
|---|
| 606 |
case '$': case '^': |
|---|
| 607 |
case '\0': |
|---|
| 608 |
/* FIXME, $ and ^ should not always be magic */ |
|---|
| 609 |
magic: |
|---|
| 610 |
regc(ch); /* dump cur char */ |
|---|
| 611 |
&n |
|---|