root/op/1.28/regexp.c

Revision 220, 33.0 kB (checked in by athomas, 4 years ago)

Added nolog

Line 
1 /*
2  * regcomp and regexec -- regsub and regerror are elsewhere
3  *
4  *      Copyright (c) 1986 by University of Toronto.
5  *      Written by Henry Spencer.  Not derived from licensed software.
6  *
7  *      Permission is granted to anyone to use this software for any
8  *      purpose on any computer system, and to redistribute it freely,
9  *      subject to the following restrictions:
10  *
11  *      1. The author is not responsible for the consequences of use of
12  *              this software, no matter how awful, even if they arise
13  *              from defects in it.
14  *
15  *      2. The origin of this software must not be misrepresented, either
16  *              by explicit claim or by omission.
17  *
18  *      3. Altered versions must be plainly marked as such, and must not
19  *              be misrepresented as being the original software.
20  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
21  *** hoptoad!gnu, on 27 Dec 1986, to add \n as an alternative to |
22  *** to assist in implementing egrep.
23  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
24  *** hoptoad!gnu, on 27 Dec 1986, to add \< and \> for word-matching
25  *** as in BSD grep and ex.
26  *** THIS IS AN ALTERED VERSION.  It was altered by John Gilmore,
27  *** hoptoad!gnu, on 28 Dec 1986, to optimize characters quoted with \.
28  *** THIS IS AN ALTERED VERSION.  It was altered by James A. Woods,
29  *** ames!jaw, on 19 June 1987, to quash a regcomp() redundancy.
30  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
31  *** seiwald@vix.com, on 28 August 1993, for use in jam.  Regmagic.h
32  *** was moved into regexp.h, and the include of regexp.h now uses "'s
33  *** to avoid conflicting with the system regexp.h.  Const, bless its
34  *** soul, was removed so it can compile everywhere.  The declaration
35  *** of strchr() was in conflict on AIX, so it was removed (as it is
36  *** happily defined in string.h).
37  *** THIS IS AN ALTERED VERSION.  It was altered by Christopher Seiwald
38  *** seiwald@perforce.com, on 20 January 2000, to use function prototypes.
39  *
40  * Beware that some of this code is subtly aware of the way operator
41  * precedence is structured in regular expressions.  Serious changes in
42  * regular-expression syntax might require a total rethink.
43  */
44 #include "regexp.h"
45 #include <stdio.h>
46 #include <ctype.h>
47 #ifndef ultrix
48 #include <stdlib.h>
49 #endif
50 #include <string.h>
51
52 /*
53  * The "internal use only" fields in regexp.h are present to pass info from
54  * compile to execute that permits the execute phase to run lots faster on
55  * simple cases.  They are:
56  *
57  * regstart     char that must begin a match; '\0' if none obvious
58  * reganch      is the match anchored (at beginning-of-line only)?
59  * regmust      string (pointer into program) that match must include, or NULL
60  * regmlen      length of regmust string
61  *
62  * Regstart and reganch permit very fast decisions on suitable starting points
63  * for a match, cutting down the work a lot.  Regmust permits fast rejection
64  * of lines that cannot possibly match.  The regmust tests are costly enough
65  * that regcomp() supplies a regmust only if the r.e. contains something
66  * potentially expensive (at present, the only such thing detected is * or +
67  * at the start of the r.e., which can involve a lot of backup).  Regmlen is
68  * supplied because the test in regexec() needs it and regcomp() is computing
69  * it anyway.
70  */
71
72 /*
73  * Structure for regexp "program".  This is essentially a linear encoding
74  * of a nondeterministic finite-state machine (aka syntax charts or
75  * "railroad normal form" in parsing technology).  Each node is an opcode
76  * plus a "next" pointer, possibly plus an operand.  "Next" pointers of
77  * all nodes except BRANCH implement concatenation; a "next" pointer with
78  * a BRANCH on both ends of it is connecting two alternatives.  (Here we
79  * have one of the subtle syntax dependencies:  an individual BRANCH (as
80  * opposed to a collection of them) is never concatenated with anything
81  * because of operator precedence.)  The operand of some types of node is
82  * a literal string; for others, it is a node leading into a sub-FSM.  In
83  * particular, the operand of a BRANCH node is the first node of the branch.
84  * (NB this is *not* a tree structure:  the tail of the branch connects
85  * to the thing following the set of BRANCHes.)  The opcodes are:
86  */
87
88 /* definition   number  opnd?   meaning */
89 #define END     0       /* no   End of program. */
90 #define BOL     1       /* no   Match "" at beginning of line. */
91 #define EOL     2       /* no   Match "" at end of line. */
92 #define ANY     3       /* no   Match any one character. */
93 #define ANYOF   4       /* str  Match any character in this string. */
94 #define ANYBUT  5       /* str  Match any character not in this string. */
95 #define BRANCH  6       /* node Match this alternative, or the next... */
96 #define BACK    7       /* no   Match "", "next" ptr points backward. */
97 #define EXACTLY 8       /* str  Match this string. */
98 #define NOTHING 9       /* no   Match empty string. */
99 #define STAR    10      /* node Match this (simple) thing 0 or more times. */
100 #define PLUS    11      /* node Match this (simple) thing 1 or more times. */
101 #define WORDA   12      /* no   Match "" at wordchar, where prev is nonword */
102 #define WORDZ   13      /* no   Match "" at nonwordchar, where prev is word */
103 #define OPEN    20      /* no   Mark this point in input as start of #n. */
104                         /*      OPEN+1 is number 1, etc. */
105 #define CLOSE   30      /* no   Analogous to OPEN. */
106
107 /*
108  * Opcode notes:
109  *
110  * BRANCH       The set of branches constituting a single choice are hooked
111  *              together with their "next" pointers, since precedence prevents
112  *              anything being concatenated to any individual branch.  The
113  *              "next" pointer of the last BRANCH in a choice points to the
114  *              thing following the whole choice.  This is also where the
115  *              final "next" pointer of each individual branch points; each
116  *              branch starts with the operand node of a BRANCH node.
117  *
118  * BACK         Normal "next" pointers all implicitly point forward; BACK
119  *              exists to make loop structures possible.
120  *
121  * STAR,PLUS    '?', and complex '*' and '+', are implemented as circular
122  *              BRANCH structures using BACK.  Simple cases (one character
123  *              per match) are implemented with STAR and PLUS for speed
124  *              and to minimize recursive plunges.
125  *
126  * OPEN,CLOSE   ...are numbered at compile time.
127  */
128
129 /*
130  * A node is one char of opcode followed by two chars of "next" pointer.
131  * "Next" pointers are stored as two 8-bit pieces, high order first.  The
132  * value is a positive offset from the opcode of the node containing it.
133  * An operand, if any, simply follows the node.  (Note that much of the
134  * code generation knows about this implicit relationship.)
135  *
136  * Using two bytes for the "next" pointer is vast overkill for most things,
137  * but allows patterns to get big without disasters.
138  */
139 #define OP(p)   (*(p))
140 #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
141 #define OPERAND(p)      ((p) + 3)
142
143 /*
144  * See regmagic.h for one further detail of program structure.
145  */
146
147
148 /*
149  * Utility definitions.
150  */
151 #ifndef CHARBITS
152 #define UCHARAT(p)      ((int)*(unsigned char *)(p))
153 #else
154 #define UCHARAT(p)      ((int)*(p)&CHARBITS)
155 #endif
156
157 #define FAIL(m) { regerror(m); return(NULL); }
158 #define ISMULT(c)       ((c) == '*' || (c) == '+' || (c) == '?')
159
160 /*
161  * Flags to be passed up and down.
162  */
163 #define HASWIDTH        01      /* Known never to match null string. */
164 #define SIMPLE          02      /* Simple enough to be STAR/PLUS operand. */
165 #define SPSTART         04      /* Starts with * or +. */
166 #define WORST           0       /* Worst case. */
167
168 /*
169  * Global work variables for regcomp().
170  */
171 static char *regparse;          /* Input-scan pointer. */
172 static int regnpar;             /* () count. */
173 static char regdummy;
174 static char *regcode;           /* Code-emit pointer; &regdummy = don't. */
175 static long regsize;            /* Code size. */
176
177 /*
178  * Forward declarations for regcomp()'s friends.
179  */
180 #ifndef STATIC
181 #define STATIC  static
182 #endif
183 STATIC char *reg( int paren, int *flagp );
184 STATIC char *regbranch( int *flagp );
185 STATIC char *regpiece( int *flagp );
186 STATIC char *regatom( int *flagp );
187 STATIC char *regnode( int op );
188 STATIC char *regnext( register char *p );
189 STATIC void regc( int b );
190 STATIC void reginsert( char op, char *opnd );
191 STATIC void regtail( char *p, char *val );
192 STATIC void regoptail( char *p, char *val );
193 #ifdef STRCSPN
194 STATIC int strcspn();
195 #endif
196
197 /*
198  - regcomp - compile a regular expression into internal code
199  *
200  * We can't allocate space until we know how big the compiled form will be,
201  * but we can't compile it (and thus know how big it is) until we've got a
202  * place to put the code.  So we cheat:  we compile it twice, once with code
203  * generation turned off and size counting turned on, and once "for real".
204  * This also means that we don't allocate space until we are sure that the
205  * thing really will compile successfully, and we never have to move the
206  * code and thus invalidate pointers into it.  (Note that it has to be in
207  * one piece because free() must be able to free it all.)
208  *
209  * Beware that the optimization-preparation code in here knows about some
210  * of the structure of the compiled regexp.
211  */
212 regexp *
213 regcomp( char *exp )
214 {
215         register regexp *r;
216         register char *scan;
217         register char *longest;
218         register unsigned len;
219         int flags;
220
221         if (exp == NULL)
222                 FAIL("NULL argument");
223
224         /* First pass: determine size, legality. */
225 #ifdef notdef
226         if (exp[0] == '.' && exp[1] == '*') exp += 2;  /* aid grep */
227 #endif
228         regparse = (char *)exp;
229         regnpar = 1;
230         regsize = 0L;
231         regcode = &regdummy;
232         regc(MAGIC);
233         if (reg(0, &flags) == NULL)
234                 return(NULL);
235
236         /* Small enough for pointer-storage convention? */
237         if (regsize >= 32767L)          /* Probably could be 65535L. */
238                 FAIL("regexp too big");
239
240         /* Allocate space. */
241         r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize);
242         if (r == NULL)
243                 FAIL("out of space");
244
245         /* Second pass: emit code. */
246         regparse = (char *)exp;
247         regnpar = 1;
248         regcode = r->program;
249         regc(MAGIC);
250         if (reg(0, &flags) == NULL)
251                 return(NULL);
252
253         /* Dig out information for optimizations. */
254         r->regstart = '\0';     /* Worst-case defaults. */
255         r->reganch = 0;
256         r->regmust = NULL;
257         r->regmlen = 0;
258         scan = r->program+1;                    /* First BRANCH. */
259         if (OP(regnext(scan)) == END) {         /* Only one top-level choice. */
260                 scan = OPERAND(scan);
261
262                 /* Starting-point info. */
263                 if (OP(scan) == EXACTLY)
264                         r->regstart = *OPERAND(scan);
265                 else if (OP(scan) == BOL)
266                         r->reganch++;
267
268                 /*
269                  * If there's something expensive in the r.e., find the
270                  * longest literal string that must appear and make it the
271                  * regmust.  Resolve ties in favor of later strings, since
272                  * the regstart check works with the beginning of the r.e.
273                  * and avoiding duplication strengthens checking.  Not a
274                  * strong reason, but sufficient in the absence of others.
275                  */
276                 if (flags&SPSTART) {
277                         longest = NULL;
278                         len = 0;
279                         for (; scan != NULL; scan = regnext(scan))
280                                 if (OP(scan) == EXACTLY && strlen(OPERAND(scan)) >= len) {
281                                         longest = OPERAND(scan);
282                                         len = strlen(OPERAND(scan));
283                                 }
284                         r->regmust = longest;
285                         r->regmlen = len;
286                 }
287         }
288
289         return(r);
290 }
291
292 /*
293  - reg - regular expression, i.e. main body or parenthesized thing
294  *
295  * Caller must absorb opening parenthesis.
296  *
297  * Combining parenthesis handling with the base level of regular expression
298  * is a trifle forced, but the need to tie the tails of the branches to what
299  * follows makes it hard to avoid.
300  */
301 static char *
302 reg(
303         int paren,                      /* Parenthesized? */
304         int *flagp )
305 {
306         register char *ret;
307         register char *br;
308         register char *ender;
309         register int parno = 0;
310         int flags;
311
312         *flagp = HASWIDTH;      /* Tentatively. */
313
314         /* Make an OPEN node, if parenthesized. */
315         if (paren) {
316                 if (regnpar >= NSUBEXP)
317                         FAIL("too many ()");
318                 parno = regnpar;
319                 regnpar++;
320                 ret = regnode(OPEN+parno);
321         } else
322                 ret = NULL;
323
324         /* Pick up the branches, linking them together. */
325         br = regbranch(&flags);
326         if (br == NULL)
327                 return(NULL);
328         if (ret != NULL)
329                 regtail(ret, br);       /* OPEN -> first. */
330         else
331                 ret = br;
332         if (!(flags&HASWIDTH))
333                 *flagp &= ~HASWIDTH;
334         *flagp |= flags&SPSTART;
335         while (*regparse == '|' || *regparse == '\n') {
336                 regparse++;
337                 br = regbranch(&flags);
338                 if (br == NULL)
339                         return(NULL);
340                 regtail(ret, br);       /* BRANCH -> BRANCH. */
341                 if (!(flags&HASWIDTH))
342                         *flagp &= ~HASWIDTH;
343                 *flagp |= flags&SPSTART;
344         }
345
346         /* Make a closing node, and hook it on the end. */
347         ender = regnode((paren) ? CLOSE+parno : END);   
348         regtail(ret, ender);
349
350         /* Hook the tails of the branches to the closing node. */
351         for (br = ret; br != NULL; br = regnext(br))
352                 regoptail(br, ender);
353
354         /* Check for proper termination. */
355         if (paren && *regparse++ != ')') {
356                 FAIL("unmatched ()");
357         } else if (!paren && *regparse != '\0') {
358                 if (*regparse == ')') {
359                         FAIL("unmatched ()");
360                 } else
361                         FAIL("junk on end");    /* "Can't happen". */
362                 /* NOTREACHED */
363         }
364
365         return(ret);
366 }
367
368 /*
369  - regbranch - one alternative of an | operator
370  *
371  * Implements the concatenation operator.
372  */
373 static char *
374 regbranch( int *flagp )
375 {
376         register char *ret;
377         register char *chain;
378         register char *latest;
379         int flags;
380
381         *flagp = WORST;         /* Tentatively. */
382
383         ret = regnode(BRANCH);
384         chain = NULL;
385         while (*regparse != '\0' && *regparse != ')' &&
386                *regparse != '\n' && *regparse != '|') {
387                 latest = regpiece(&flags);
388                 if (latest == NULL)
389                         return(NULL);
390                 *flagp |= flags&HASWIDTH;
391                 if (chain == NULL)      /* First piece. */
392                         *flagp |= flags&SPSTART;
393                 else
394                         regtail(chain, latest);
395                 chain = latest;
396         }
397         if (chain == NULL)      /* Loop ran zero times. */
398                 (void) regnode(NOTHING);
399
400         return(ret);
401 }
402
403 /*
404  - regpiece - something followed by possible [*+?]
405  *
406  * Note that the branching code sequences used for ? and the general cases
407  * of * and + are somewhat optimized:  they use the same NOTHING node as
408  * both the endmarker for their branch list and the body of the last branch.
409  * It might seem that this node could be dispensed with entirely, but the
410  * endmarker role is not redundant.
411  */
412 static char *
413 regpiece( int *flagp )
414 {
415         register char *ret;
416         register char op;
417         register char *next;
418         int flags;
419
420         ret = regatom(&flags);
421         if (ret == NULL)
422                 return(NULL);
423
424         op = *regparse;
425         if (!ISMULT(op)) {
426                 *flagp = flags;
427                 return(ret);
428         }
429
430         if (!(flags&HASWIDTH) && op != '?')
431                 FAIL("*+ operand could be empty");
432         *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
433
434         if (op == '*' && (flags&SIMPLE))
435                 reginsert(STAR, ret);
436         else if (op == '*') {
437                 /* Emit x* as (x&|), where & means "self". */
438                 reginsert(BRANCH, ret);                 /* Either x */
439                 regoptail(ret, regnode(BACK));          /* and loop */
440                 regoptail(ret, ret);                    /* back */
441                 regtail(ret, regnode(BRANCH));          /* or */
442                 regtail(ret, regnode(NOTHING));         /* null. */
443         } else if (op == '+' && (flags&SIMPLE))
444                 reginsert(PLUS, ret);
445         else if (op == '+') {
446                 /* Emit x+ as x(&|), where & means "self". */
447                 next = regnode(BRANCH);                 /* Either */
448                 regtail(ret, next);
449                 regtail(regnode(BACK), ret);            /* loop back */
450                 regtail(next, regnode(BRANCH));         /* or */
451                 regtail(ret, regnode(NOTHING));         /* null. */
452         } else if (op == '?') {
453                 /* Emit x? as (x|) */
454                 reginsert(BRANCH, ret);                 /* Either x */
455                 regtail(ret, regnode(BRANCH));          /* or */
456                 next = regnode(NOTHING);                /* null. */
457                 regtail(ret, next);
458                 regoptail(ret, next);
459         }
460         regparse++;
461         if (ISMULT(*regparse))
462                 FAIL("nested *?+");
463
464         return(ret);
465 }
466
467 /*
468  - regatom - the lowest level
469  *
470  * Optimization:  gobbles an entire sequence of ordinary characters so that
471  * it can turn them into a single node, which is smaller to store and
472  * faster to run.  Backslashed characters are exceptions, each becoming a
473  * separate node; the code is simpler that way and it's not worth fixing.
474  */
475 static char *
476 regatom( int *flagp )
477 {
478         register char *ret;
479         int flags;
480
481         *flagp = WORST;         /* Tentatively. */
482
483         switch (*regparse++) {
484         /* FIXME: these chars only have meaning at beg/end of pat? */
485         case '^':
486                 ret = regnode(BOL);
487                 break;
488         case '$':
489                 ret = regnode(EOL);
490                 break;
491         case '.':
492                 ret = regnode(ANY);
493                 *flagp |= HASWIDTH|SIMPLE;
494                 break;
495         case '[': {
496                         register int classr;
497                         register int classend;
498
499                         if (*regparse == '^') { /* Complement of range. */
500                                 ret = regnode(ANYBUT);
501                                 regparse++;
502                         } else
503                                 ret = regnode(ANYOF);
504                         if (*regparse == ']' || *regparse == '-')
505                                 regc(*regparse++);
506                         while (*regparse != '\0' && *regparse != ']') {
507                                 if (*regparse == '-') {
508                                         regparse++;
509                                         if (*regparse == ']' || *regparse == '\0')
510                                                 regc('-');
511                                         else {
512                                                 classr = UCHARAT(regparse-2)+1;
513                                                 classend = UCHARAT(regparse);
514                                                 if (classr > classend+1)
515                                                         FAIL("invalid [] range");
516                                                 for (; classr <= classend; classr++)
517                                                         regc(classr);
518                                                 regparse++;
519                                         }
520                                 } else
521                                         regc(*regparse++);
522                         }
523                         regc('\0');
524                         if (*regparse != ']')
525                                 FAIL("unmatched []");
526                         regparse++;
527                         *flagp |= HASWIDTH|SIMPLE;
528                 }
529                 break;
530         case '(':
531                 ret = reg(1, &flags);
532                 if (ret == NULL)
533                         return(NULL);
534                 *flagp |= flags&(HASWIDTH|SPSTART);
535                 break;
536         case '\0':
537         case '|':
538         case '\n':
539         case ')':
540                 FAIL("internal urp");   /* Supposed to be caught earlier. */
541                 break;
542         case '?':
543         case '+':
544         case '*':
545                 FAIL("?+* follows nothing");
546                 break;
547         case '\\':
548                 switch (*regparse++) {
549                 case '\0':
550                         FAIL("trailing \\");
551                         break;
552                 case '<':
553                         ret = regnode(WORDA);
554                         break;
555                 case '>':
556                         ret = regnode(WORDZ);
557                         break;
558                 /* FIXME: Someday handle \1, \2, ... */
559                 default:
560                         /* Handle general quoted chars in exact-match routine */
561                         goto de_fault;
562                 }
563                 break;
564         de_fault:
565         default:
566                 /*
567                  * Encode a string of characters to be matched exactly.
568                  *
569                  * This is a bit tricky due to quoted chars and due to
570                  * '*', '+', and '?' taking the SINGLE char previous
571                  * as their operand.
572                  *
573                  * On entry, the char at regparse[-1] is going to go
574                  * into the string, no matter what it is.  (It could be
575                  * following a \ if we are entered from the '\' case.)
576                  *
577                  * Basic idea is to pick up a good char in  ch  and
578                  * examine the next char.  If it's *+? then we twiddle.
579                  * If it's \ then we frozzle.  If it's other magic char
580                  * we push  ch  and terminate the string.  If none of the
581                  * above, we push  ch  on the string and go around again.
582                  *
583                  *  regprev  is used to remember where "the current char"
584                  * starts in the string, if due to a *+? we need to back
585                  * up and put the current char in a separate, 1-char, string.
586                  * When  regprev  is NULL,  ch  is the only char in the
587                  * string; this is used in *+? handling, and in setting
588                  * flags |= SIMPLE at the end.
589                  */
590                 {
591                         char *regprev;
592                         register char ch;
593
594                         regparse--;                     /* Look at cur char */
595                         ret = regnode(EXACTLY);
596                         for ( regprev = 0 ; ; ) {
597                                 ch = *regparse++;       /* Get current char */
598                                 switch (*regparse) {    /* look at next one */
599
600                                 default:
601                                         regc(ch);       /* Add cur to string */
602                                         break;
603
604                                 case '.': case '[': case '(':
605                                 case ')': case '|': case '\n':
606                                 case '$': case '^':
607                                 case '\0':
608                                 /* FIXME, $ and ^ should not always be magic */
609                                 magic:
610                                         regc(ch);       /* dump cur char */
611