YugabyteDB (2.13.0.0-b42, bfc6a6643e7399ac8a0e81d06a3ee6d6571b33ab)

Coverage Report

Created: 2022-03-09 17:30

/Users/deen/code/yugabyte-db/src/postgres/src/backend/regex/regcomp.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * re_*comp and friends - compile REs
3
 * This file #includes several others (see the bottom).
4
 *
5
 * Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
6
 *
7
 * Development of this software was funded, in part, by Cray Research Inc.,
8
 * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
9
 * Corporation, none of whom are responsible for the results.  The author
10
 * thanks all of them.
11
 *
12
 * Redistribution and use in source and binary forms -- with or without
13
 * modification -- are permitted for any purpose, provided that
14
 * redistributions in source form retain this entire copyright notice and
15
 * indicate the origin and nature of any modifications.
16
 *
17
 * I'd appreciate being given credit for this package in the documentation
18
 * of software which uses it, but that is not a requirement.
19
 *
20
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
21
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
22
 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
23
 * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
 *
31
 * src/backend/regex/regcomp.c
32
 *
33
 */
34
35
#include "regex/regguts.h"
36
37
/*
38
 * forward declarations, up here so forward datatypes etc. are defined early
39
 */
40
/* === regcomp.c === */
41
static void moresubs(struct vars *, int);
42
static int  freev(struct vars *, int);
43
static void makesearch(struct vars *, struct nfa *);
44
static struct subre *parse(struct vars *, int, int, struct state *, struct state *);
45
static struct subre *parsebranch(struct vars *, int, int, struct state *, struct state *, int);
46
static void parseqatom(struct vars *, int, int, struct state *, struct state *, struct subre *);
47
static void nonword(struct vars *, int, struct state *, struct state *);
48
static void word(struct vars *, int, struct state *, struct state *);
49
static int  scannum(struct vars *);
50
static void repeat(struct vars *, struct state *, struct state *, int, int);
51
static void bracket(struct vars *, struct state *, struct state *);
52
static void cbracket(struct vars *, struct state *, struct state *);
53
static void brackpart(struct vars *, struct state *, struct state *);
54
static const chr *scanplain(struct vars *);
55
static void onechr(struct vars *, chr, struct state *, struct state *);
56
static void wordchrs(struct vars *);
57
static void processlacon(struct vars *, struct state *, struct state *, int,
58
       struct state *, struct state *);
59
static struct subre *subre(struct vars *, int, int, struct state *, struct state *);
60
static void freesubre(struct vars *, struct subre *);
61
static void freesrnode(struct vars *, struct subre *);
62
static void optst(struct vars *, struct subre *);
63
static int  numst(struct subre *, int);
64
static void markst(struct subre *);
65
static void cleanst(struct vars *);
66
static long nfatree(struct vars *, struct subre *, FILE *);
67
static long nfanode(struct vars *, struct subre *, int, FILE *);
68
static int  newlacon(struct vars *, struct state *, struct state *, int);
69
static void freelacons(struct subre *, int);
70
static void rfree(regex_t *);
71
static int  rcancelrequested(void);
72
static int  rstacktoodeep(void);
73
74
#ifdef REG_DEBUG
75
static void dump(regex_t *, FILE *);
76
static void dumpst(struct subre *, FILE *, int);
77
static void stdump(struct subre *, FILE *, int);
78
static const char *stid(struct subre *, char *, size_t);
79
#endif
80
/* === regc_lex.c === */
81
static void lexstart(struct vars *);
82
static void prefixes(struct vars *);
83
static void lexnest(struct vars *, const chr *, const chr *);
84
static void lexword(struct vars *);
85
static int  next(struct vars *);
86
static int  lexescape(struct vars *);
87
static chr  lexdigits(struct vars *, int, int, int);
88
static int  brenext(struct vars *, chr);
89
static void skip(struct vars *);
90
static chr  newline(void);
91
static chr  chrnamed(struct vars *, const chr *, const chr *, chr);
92
93
/* === regc_color.c === */
94
static void initcm(struct vars *, struct colormap *);
95
static void freecm(struct colormap *);
96
static color maxcolor(struct colormap *);
97
static color newcolor(struct colormap *);
98
static void freecolor(struct colormap *, color);
99
static color pseudocolor(struct colormap *);
100
static color subcolor(struct colormap *, chr);
101
static color subcolorhi(struct colormap *, color *);
102
static color newsub(struct colormap *, color);
103
static int  newhicolorrow(struct colormap *, int);
104
static void newhicolorcols(struct colormap *);
105
static void subcolorcvec(struct vars *, struct cvec *, struct state *, struct state *);
106
static void subcoloronechr(struct vars *, chr, struct state *, struct state *, color *);
107
static void subcoloronerange(struct vars *, chr, chr, struct state *, struct state *, color *);
108
static void subcoloronerow(struct vars *, int, struct state *, struct state *, color *);
109
static void okcolors(struct nfa *, struct colormap *);
110
static void colorchain(struct colormap *, struct arc *);
111
static void uncolorchain(struct colormap *, struct arc *);
112
static void rainbow(struct nfa *, struct colormap *, int, color, struct state *, struct state *);
113
static void colorcomplement(struct nfa *, struct colormap *, int, struct state *, struct state *, struct state *);
114
115
#ifdef REG_DEBUG
116
static void dumpcolors(struct colormap *, FILE *);
117
static void dumpchr(chr, FILE *);
118
#endif
119
/* === regc_nfa.c === */
120
static struct nfa *newnfa(struct vars *, struct colormap *, struct nfa *);
121
static void freenfa(struct nfa *);
122
static struct state *newstate(struct nfa *);
123
static struct state *newfstate(struct nfa *, int flag);
124
static void dropstate(struct nfa *, struct state *);
125
static void freestate(struct nfa *, struct state *);
126
static void destroystate(struct nfa *, struct state *);
127
static void newarc(struct nfa *, int, color, struct state *, struct state *);
128
static void createarc(struct nfa *, int, color, struct state *, struct state *);
129
static struct arc *allocarc(struct nfa *, struct state *);
130
static void freearc(struct nfa *, struct arc *);
131
static void changearctarget(struct arc *, struct state *);
132
static int  hasnonemptyout(struct state *);
133
static struct arc *findarc(struct state *, int, color);
134
static void cparc(struct nfa *, struct arc *, struct state *, struct state *);
135
static void sortins(struct nfa *, struct state *);
136
static int  sortins_cmp(const void *, const void *);
137
static void sortouts(struct nfa *, struct state *);
138
static int  sortouts_cmp(const void *, const void *);
139
static void moveins(struct nfa *, struct state *, struct state *);
140
static void copyins(struct nfa *, struct state *, struct state *);
141
static void mergeins(struct nfa *, struct state *, struct arc **, int);
142
static void moveouts(struct nfa *, struct state *, struct state *);
143
static void copyouts(struct nfa *, struct state *, struct state *);
144
static void cloneouts(struct nfa *, struct state *, struct state *, struct state *, int);
145
static void delsub(struct nfa *, struct state *, struct state *);
146
static void deltraverse(struct nfa *, struct state *, struct state *);
147
static void dupnfa(struct nfa *, struct state *, struct state *, struct state *, struct state *);
148
static void duptraverse(struct nfa *, struct state *, struct state *);
149
static void cleartraverse(struct nfa *, struct state *);
150
static struct state *single_color_transition(struct state *, struct state *);
151
static void specialcolors(struct nfa *);
152
static long optimize(struct nfa *, FILE *);
153
static void pullback(struct nfa *, FILE *);
154
static int  pull(struct nfa *, struct arc *, struct state **);
155
static void pushfwd(struct nfa *, FILE *);
156
static int  push(struct nfa *, struct arc *, struct state **);
157
158
5.41k
#define INCOMPATIBLE  1    /* destroys arc */
159
578
#define SATISFIED 2      /* constraint satisfied */
160
0
#define COMPATIBLE  3      /* compatible but not satisfied yet */
161
static int  combine(struct arc *, struct arc *);
162
static void fixempties(struct nfa *, FILE *);
163
static struct state *emptyreachable(struct nfa *, struct state *,
164
         struct state *, struct arc **);
165
static int  isconstraintarc(struct arc *);
166
static int  hasconstraintout(struct state *);
167
static void fixconstraintloops(struct nfa *, FILE *);
168
static int  findconstraintloop(struct nfa *, struct state *);
169
static void breakconstraintloop(struct nfa *, struct state *);
170
static void clonesuccessorstates(struct nfa *, struct state *, struct state *,
171
           struct state *, struct arc *,
172
           char *, char *, int);
173
static void cleanup(struct nfa *);
174
static void markreachable(struct nfa *, struct state *, struct state *, struct state *);
175
static void markcanreach(struct nfa *, struct state *, struct state *, struct state *);
176
static long analyze(struct nfa *);
177
static void compact(struct nfa *, struct cnfa *);
178
static void carcsort(struct carc *, size_t);
179
static int  carc_cmp(const void *, const void *);
180
static void freecnfa(struct cnfa *);
181
static void dumpnfa(struct nfa *, FILE *);
182
183
#ifdef REG_DEBUG
184
static void dumpstate(struct state *, FILE *);
185
static void dumparcs(struct state *, FILE *);
186
static void dumparc(struct arc *, struct state *, FILE *);
187
static void dumpcnfa(struct cnfa *, FILE *);
188
static void dumpcstate(int, struct cnfa *, FILE *);
189
#endif
190
/* === regc_cvec.c === */
191
static struct cvec *newcvec(int, int);
192
static struct cvec *clearcvec(struct cvec *);
193
static void addchr(struct cvec *, chr);
194
static void addrange(struct cvec *, chr, chr);
195
static struct cvec *getcvec(struct vars *, int, int);
196
static void freecvec(struct cvec *);
197
198
/* === regc_pg_locale.c === */
199
static int  pg_wc_isdigit(pg_wchar c);
200
static int  pg_wc_isalpha(pg_wchar c);
201
static int  pg_wc_isalnum(pg_wchar c);
202
static int  pg_wc_isupper(pg_wchar c);
203
static int  pg_wc_islower(pg_wchar c);
204
static int  pg_wc_isgraph(pg_wchar c);
205
static int  pg_wc_isprint(pg_wchar c);
206
static int  pg_wc_ispunct(pg_wchar c);
207
static int  pg_wc_isspace(pg_wchar c);
208
static pg_wchar pg_wc_toupper(pg_wchar c);
209
static pg_wchar pg_wc_tolower(pg_wchar c);
210
211
/* === regc_locale.c === */
212
static chr  element(struct vars *, const chr *, const chr *);
213
static struct cvec *range(struct vars *, chr, chr, int);
214
static int  before(chr, chr);
215
static struct cvec *eclass(struct vars *, chr, int);
216
static struct cvec *cclass(struct vars *, const chr *, const chr *, int);
217
static int  cclass_column_index(struct colormap *, chr);
218
static struct cvec *allcases(struct vars *, chr);
219
static int  cmp(const chr *, const chr *, size_t);
220
static int  casecmp(const chr *, const chr *, size_t);
221
222
223
/* internal variables, bundled for easy passing around */
224
struct vars
225
{
226
  regex_t    *re;
227
  const chr  *now;      /* scan pointer into string */
228
  const chr  *stop;     /* end of string */
229
  const chr  *savenow;    /* saved now and stop for "subroutine call" */
230
  const chr  *savestop;
231
  int     err;      /* error code (0 if none) */
232
  int     cflags;     /* copy of compile flags */
233
  int     lasttype;   /* type of previous token */
234
  int     nexttype;   /* type of next token */
235
  chr     nextvalue;    /* value (if any) of next token */
236
  int     lexcon;     /* lexical context type (see lex.c) */
237
  int     nsubexp;    /* subexpression count */
238
  struct subre **subs;    /* subRE pointer vector */
239
  size_t    nsubs;      /* length of vector */
240
  struct subre *sub10[10];  /* initial vector, enough for most */
241
  struct nfa *nfa;      /* the NFA */
242
  struct colormap *cm;    /* character color map */
243
  color   nlcolor;    /* color of newline */
244
  struct state *wordchrs;   /* state in nfa holding word-char outarcs */
245
  struct subre *tree;     /* subexpression tree */
246
  struct subre *treechain;  /* all tree nodes allocated */
247
  struct subre *treefree;   /* any free tree nodes */
248
  int     ntree;      /* number of tree nodes, plus one */
249
  struct cvec *cv;      /* interface cvec */
250
  struct cvec *cv2;     /* utility cvec */
251
  struct subre *lacons;   /* lookaround-constraint vector */
252
  int     nlacons;    /* size of lacons[]; note that only slots
253
                 * numbered 1 .. nlacons-1 are used */
254
  size_t    spaceused;    /* approx. space used for compilation */
255
};
256
257
/* parsing macros; most know that `v' is the struct vars pointer */
258
1.16k
#define NEXT()  (next(v))    /* advance by one token */
259
5.83k
#define SEE(t)  (v->nexttype == (t))  /* is next token this? */
260
180
#define EAT(t)  (SEE(t) && next(v)) /* if next is this, swallow it */
261
202k
#define VISERR(vv)  ((vv)->err != 0)  /* have we seen an error yet? */
262
9.59k
#define ISERR() VISERR(v)
263
100
#define VERR(vv,e)  ((vv)->nexttype = EOS, \
264
100
           (vv)->err = ((vv)->err ? (vv)->err : (e)))
265
100
#define ERR(e)  VERR(v, e)    /* record an error */
266
3.44k
#define NOERR() {if (ISERR()) return;}  /* if error seen, return */
267
2.07k
#define NOERRN()  {if (ISERR()) return NULL;} /* NOERR with retval */
268
282
#define NOERRZ()  {if (ISERR()) return 0;}  /* NOERR with retval */
269
7
#define INSIST(c, e) do { if (!(c)) ERR(e); } while (0) /* error if c false */
270
76
#define NOTE(b) (v->re->re_info |= (b)) /* note visible condition */
271
499
#define EMPTYARC(x, y)  newarc(v->nfa, EMPTY, 0, x, y)
272
273
/* token type codes, some also used as NFA arc types */
274
72.7k
#define EMPTY 'n'        /* no token present */
275
200
#define EOS 'e'          /* end of string */
276
105k
#define PLAIN 'p'        /* ordinary character */
277
#define DIGIT 'd'       /* digit (in bound) */
278
1.34k
#define BACKREF 'b'        /* back reference */
279
0
#define COLLEL  'I'        /* start of [. */
280
0
#define ECLASS  'E'        /* start of [= */
281
29
#define CCLASS  'C'        /* start of [: */
282
#define END 'X'         /* end of [. [= [: */
283
15
#define RANGE 'R'        /* - within [] which might be range delim. */
284
3.69k
#define LACON 'L'        /* lookaround constraint subRE */
285
76.6k
#define AHEAD 'a'        /* color-lookahead arc */
286
33.8k
#define BEHIND  'r'        /* color-lookbehind arc */
287
0
#define WBDRY 'w'        /* word boundary constraint */
288
0
#define NWBDRY  'W'        /* non-word-boundary constraint */
289
0
#define SBEGIN  'A'        /* beginning of string (even if not BOL) */
290
0
#define SEND  'Z'        /* end of string (even if not EOL) */
291
#define PREFER  'P'       /* length preference */
292
293
/* is an arc colored, and hence on a color chain? */
294
#define COLORED(a) \
295
88.1k
  ((a)->type == PLAIN || (a)->type == AHEAD || (a)->type == BEHIND)
296
297
298
/* static function list */
299
static const struct fns functions = {
300
  rfree,            /* regfree insides */
301
  rcancelrequested,     /* check for cancel request */
302
  rstacktoodeep       /* check for stack getting dangerously deep */
303
};
304
305
306
307
/*
308
 * pg_regcomp - compile regular expression
309
 *
310
 * Note: on failure, no resources remain allocated, so pg_regfree()
311
 * need not be applied to re.
312
 */
313
int
314
pg_regcomp(regex_t *re,
315
       const chr *string,
316
       size_t len,
317
       int flags,
318
       Oid collation)
319
100
{
320
100
  struct vars var;
321
100
  struct vars *v = &var;
322
100
  struct guts *g;
323
100
  int     i;
324
100
  size_t    j;
325
326
#ifdef REG_DEBUG
327
  FILE     *debug = (flags & REG_PROGRESS) ? stdout : (FILE *) NULL;
328
#else
329
100
  FILE     *debug = (FILE *) NULL;
330
100
#endif
331
332
900
#define  CNOERR()  { if (ISERR()) return freev(v, v->err); }
333
334
  /* sanity checks */
335
336
100
  if (re == NULL || string == NULL)
337
0
    return REG_INVARG;
338
100
  if ((flags & REG_QUOTE) &&
339
0
    (flags & (REG_ADVANCED | REG_EXPANDED | REG_NEWLINE)))
340
0
    return REG_INVARG;
341
100
  if (!(flags & REG_EXTENDED) && (flags & REG_ADVF))
342
0
    return REG_INVARG;
343
344
  /* Initialize locale-dependent support */
345
100
  pg_set_regex_collation(collation);
346
347
  /* initial setup (after which freev() is callable) */
348
100
  v->re = re;
349
100
  v->now = string;
350
100
  v->stop = v->now + len;
351
100
  v->savenow = v->savestop = NULL;
352
100
  v->err = 0;
353
100
  v->cflags = flags;
354
100
  v->nsubexp = 0;
355
100
  v->subs = v->sub10;
356
100
  v->nsubs = 10;
357
1.10k
  for (j = 0; j < v->nsubs; j++)
358
1.00k
    v->subs[j] = NULL;
359
100
  v->nfa = NULL;
360
100
  v->cm = NULL;
361
100
  v->nlcolor = COLORLESS;
362
100
  v->wordchrs = NULL;
363
100
  v->tree = NULL;
364
100
  v->treechain = NULL;
365
100
  v->treefree = NULL;
366
100
  v->cv = NULL;
367
100
  v->cv2 = NULL;
368
100
  v->lacons = NULL;
369
100
  v->nlacons = 0;
370
100
  v->spaceused = 0;
371
100
  re->re_magic = REMAGIC;
372
100
  re->re_info = 0;      /* bits get set during parse */
373
100
  re->re_csize = sizeof(chr);
374
100
  re->re_collation = collation;
375
100
  re->re_guts = NULL;
376
100
  re->re_fns = VS(&functions);
377
378
  /* more complex setup, malloced things */
379
100
  re->re_guts = VS(MALLOC(sizeof(struct guts)));
380
100
  if (re->re_guts == NULL)
381
0
    return freev(v, REG_ESPACE);
382
100
  g = (struct guts *) re->re_guts;
383
100
  g->tree = NULL;
384
100
  initcm(v, &g->cmap);
385
100
  v->cm = &g->cmap;
386
100
  g->lacons = NULL;
387
100
  g->nlacons = 0;
388
100
  ZAPCNFA(g->search);
389
100
  v->nfa = newnfa(v, v->cm, (struct nfa *) NULL);
390
100
  CNOERR();
391
  /* set up a reasonably-sized transient cvec for getcvec usage */
392
100
  v->cv = newcvec(100, 20);
393
100
  if (v->cv == NULL)
394
0
    return freev(v, REG_ESPACE);
395
396
  /* parsing */
397
100
  lexstart(v);        /* also handles prefixes */
398
100
  if ((v->cflags & REG_NLSTOP) || (v->cflags & REG_NLANCH))
399
13
  {
400
    /* assign newline a unique color */
401
13
    v->nlcolor = subcolor(v->cm, newline());
402
13
    okcolors(v->nfa, v->cm);
403
13
  }
404
100
  CNOERR();
405
100
  v->tree = parse(v, EOS, PLAIN, v->nfa->init, v->nfa->final);
406
100
  assert(SEE(EOS));     /* even if error; ISERR() => SEE(EOS) */
407
100
  CNOERR();
408
100
  assert(v->tree != NULL);
409
410
  /* finish setup of nfa and its subre tree */
411
100
  specialcolors(v->nfa);
412
100
  CNOERR();
413
#ifdef REG_DEBUG
414
  if (debug != NULL)
415
  {
416
    fprintf(debug, "\n\n\n========= RAW ==========\n");
417
    dumpnfa(v->nfa, debug);
418
    dumpst(v->tree, debug, 1);
419
  }
420
#endif
421
100
  optst(v, v->tree);
422
100
  v->ntree = numst(v->tree, 1);
423
100
  markst(v->tree);
424
100
  cleanst(v);
425
#ifdef REG_DEBUG
426
  if (debug != NULL)
427
  {
428
    fprintf(debug, "\n\n\n========= TREE FIXED ==========\n");
429
    dumpst(v->tree, debug, 1);
430
  }
431
#endif
432
433
  /* build compacted NFAs for tree and lacons */
434
100
  re->re_info |= nfatree(v, v->tree, debug);
435
100
  CNOERR();
436
100
  assert(v->nlacons == 0 || v->lacons != NULL);
437
103
  for (i = 1; i < v->nlacons; i++)
438
3
  {
439
3
    struct subre *lasub = &v->lacons[i];
440
441
#ifdef REG_DEBUG
442
    if (debug != NULL)
443
      fprintf(debug, "\n\n\n========= LA%d ==========\n", i);
444
#endif
445
446
    /* Prepend .* to pattern if it's a lookbehind LACON */
447
3
    nfanode(v, lasub, !LATYPE_IS_AHEAD(lasub->subno), debug);
448
3
  }
449
100
  CNOERR();
450
100
  if (v->tree->flags & SHORTER)
451
0
    NOTE(REG_USHORTEST);
452
453
  /* build compacted NFAs for tree, lacons, fast search */
454
#ifdef REG_DEBUG
455
  if (debug != NULL)
456
    fprintf(debug, "\n\n\n========= SEARCH ==========\n");
457
#endif
458
  /* can sacrifice main NFA now, so use it as work area */
459
100
  (DISCARD) optimize(v->nfa, debug);
460
100
  CNOERR();
461
100
  makesearch(v, v->nfa);
462
100
  CNOERR();
463
100
  compact(v->nfa, &g->search);
464
100
  CNOERR();
465
466
  /* looks okay, package it up */
467
100
  re->re_nsub = v->nsubexp;
468
100
  v->re = NULL;       /* freev no longer frees re */
469
100
  g->magic = GUTSMAGIC;
470
100
  g->cflags = v->cflags;
471
100
  g->info = re->re_info;
472
100
  g->nsub = re->re_nsub;
473
100
  g->tree = v->tree;
474
100
  v->tree = NULL;
475
100
  g->ntree = v->ntree;
476
100
  g->compare = (v->cflags & REG_ICASE) ? casecmp : cmp;
477
100
  g->lacons = v->lacons;
478
100
  v->lacons = NULL;
479
100
  g->nlacons = v->nlacons;
480
481
#ifdef REG_DEBUG
482
  if (flags & REG_DUMP)
483
    dump(re, stdout);
484
#endif
485
486
100
  assert(v->err == 0);
487
100
  return freev(v, 0);
488
100
}
489
490
/*
491
 * moresubs - enlarge subRE vector
492
 */
493
static void
494
moresubs(struct vars *v,
495
     int wanted)      /* want enough room for this one */
496
0
{
497
0
  struct subre **p;
498
0
  size_t    n;
499
500
0
  assert(wanted > 0 && (size_t) wanted >= v->nsubs);
501
0
  n = (size_t) wanted * 3 / 2 + 1;
502
503
0
  if (v->subs == v->sub10)
504
0
  {
505
0
    p = (struct subre **) MALLOC(n * sizeof(struct subre *));
506
0
    if (p != NULL)
507
0
      memcpy(VS(p), VS(v->subs),
508
0
           v->nsubs * sizeof(struct subre *));
509
0
  }
510
0
  else
511
0
    p = (struct subre **) REALLOC(v->subs, n * sizeof(struct subre *));
512
0
  if (p == NULL)
513
0
  {
514
0
    ERR(REG_ESPACE);
515
0
    return;
516
0
  }
517
0
  v->subs = p;
518
0
  for (p = &v->subs[v->nsubs]; v->nsubs < n; p++, v->nsubs++)
519
0
    *p = NULL;
520
0
  assert(v->nsubs == n);
521
0
  assert((size_t) wanted < v->nsubs);
522
0
}
523
524
/*
525
 * freev - free vars struct's substructures where necessary
526
 *
527
 * Optionally does error-number setting, and always returns error code
528
 * (if any), to make error-handling code terser.
529
 */
530
static int
531
freev(struct vars *v,
532
    int err)
533
100
{
534
100
  if (v->re != NULL)
535
0
    rfree(v->re);
536
100
  if (v->subs != v->sub10)
537
0
    FREE(v->subs);
538
100
  if (v->nfa != NULL)
539
100
    freenfa(v->nfa);
540
100
  if (v->tree != NULL)
541
0
    freesubre(v, v->tree);
542
100
  if (v->treechain != NULL)
543
0
    cleanst(v);
544
100
  if (v->cv != NULL)
545
100
    freecvec(v->cv);
546
100
  if (v->cv2 != NULL)
547
0
    freecvec(v->cv2);
548
100
  if (v->lacons != NULL)
549
0
    freelacons(v->lacons, v->nlacons);
550
100
  ERR(err);          /* nop if err==0 */
551
552
100
  return v->err;
553
100
}
554
555
/*
556
 * makesearch - turn an NFA into a search NFA (implicit prepend of .*?)
557
 * NFA must have been optimize()d already.
558
 */
559
static void
560
makesearch(struct vars *v,
561
       struct nfa *nfa)
562
100
{
563
100
  struct arc *a;
564
100
  struct arc *b;
565
100
  struct state *pre = nfa->pre;
566
100
  struct state *s;
567
100
  struct state *s2;
568
100
  struct state *slist;
569
570
  /* no loops are needed if it's anchored */
571
264
  for (a = pre->outs; a != NULL; a = a->outchain)
572
238
  {
573
238
    assert(a->type == PLAIN);
574
238
    if (a->co != nfa->bos[0] && a->co != nfa->bos[1])
575
74
      break;
576
238
  }
577
100
  if (a != NULL)
578
74
  {
579
    /* add implicit .* in front */
580
74
    rainbow(nfa, v->cm, PLAIN, COLORLESS, pre, pre);
581
582
    /* and ^* and \A* too -- not always necessary, but harmless */
583
74
    newarc(nfa, PLAIN, nfa->bos[0], pre, pre);
584
74
    newarc(nfa, PLAIN, nfa->bos[1], pre, pre);
585
74
  }
586
587
  /*
588
   * Now here's the subtle part.  Because many REs have no lookback
589
   * constraints, often knowing when you were in the pre state tells you
590
   * little; it's the next state(s) that are informative.  But some of them
591
   * may have other inarcs, i.e. it may be possible to make actual progress
592
   * and then return to one of them.  We must de-optimize such cases,
593
   * splitting each such state into progress and no-progress states.
594
   */
595
596
  /* first, make a list of the states reachable from pre and elsewhere */
597
100
  slist = NULL;
598
1.07k
  for (a = pre->outs; a != NULL; a = a->outchain)
599
972
  {
600
972
    s = a->to;
601
7.94k
    for (b = s->ins; b != NULL; b = b->inchain)
602
6.99k
    {
603
6.99k
      if (b->from != pre)
604
21
        break;
605
6.99k
    }
606
607
    /*
608
     * We want to mark states as being in the list already by having non
609
     * NULL tmp fields, but we can't just store the old slist value in tmp
610
     * because that doesn't work for the first such state.  Instead, the
611
     * first list entry gets its own address in tmp.
612
     */
613
972
    if (b != NULL && s->tmp == NULL)
614
6
    {
615
6
      s->tmp = (slist != NULL) ? slist : s;
616
6
      slist = s;
617
6
    }
618
972
  }
619
620
  /* do the splits */
621
106
  for (s = slist; s != NULL; s = s2)
622
6
  {
623
6
    s2 = newstate(nfa);
624
6
    NOERR();
625
6
    copyouts(nfa, s, s2);
626
6
    NOERR();
627
33
    for (a = s->ins; a != NULL; a = b)
628
27
    {
629
27
      b = a->inchain;
630
27
      if (a->from != pre)
631
6
      {
632
6
        cparc(nfa, a, a->from, s2);
633
6
        freearc(nfa, a);
634
6
      }
635
27
    }
636
0
    s2 = (s->tmp != s) ? s->tmp : NULL;
637
6
    s->tmp = NULL;      /* clean up while we're at it */
638
6
  }
639
100
}
640
641
/*
642
 * parse - parse an RE
643
 *
644
 * This is actually just the top level, which parses a bunch of branches
645
 * tied together with '|'.  They appear in the tree as the left children
646
 * of a chain of '|' subres.
647
 */
648
static struct subre *
649
parse(struct vars *v,
650
    int stopper,        /* EOS or ')' */
651
    int type,         /* LACON (lookaround subRE) or PLAIN */
652
    struct state *init,   /* initial state */
653
    struct state *final)    /* final state */
654
137
{
655
137
  struct state *left;     /* scaffolding for branch */
656
137
  struct state *right;
657
137
  struct subre *branches;   /* top level */
658
137
  struct subre *branch;   /* current branch */
659
137
  struct subre *t;      /* temporary */
660
137
  int     firstbranch;  /* is this the first branch? */
661
662
137
  assert(stopper == ')' || stopper == EOS);
663
664
137
  branches = subre(v, '|', LONGER, init, final);
665
137
  NOERRN();
666
137
  branch = branches;
667
137
  firstbranch = 1;
668
137
  do
669
146
  {             /* a branch */
670
146
    if (!firstbranch)
671
9
    {
672
      /* need a place to hang it */
673
9
      branch->right = subre(v, '|', LONGER, init, final);
674
9
      NOERRN();
675
9
      branch = branch->right;
676
9
    }
677
146
    firstbranch = 0;
678
146
    left = newstate(v->nfa);
679
146
    right = newstate(v->nfa);
680
146
    NOERRN();
681
146
    EMPTYARC(init, left);
682
146
    EMPTYARC(right, final);
683
146
    NOERRN();
684
146
    branch->left = parsebranch(v, stopper, type, left, right, 0);
685
146
    NOERRN();
686
146
    branch->flags |= UP(branch->flags | branch->left->flags);
687
146
    if ((branch->flags & ~branches->flags) != 0)  /* new flags */
688
0
      for (t = branches; t != branch; t = t->right)
689
0
        t->flags |= branch->flags;
690
146
  } while (EAT('|'));
691
137
  assert(SEE(stopper) || SEE(EOS));
692
693
137
  if (!SEE(stopper))
694
0
  {
695
0
    assert(stopper == ')' && SEE(EOS));
696
0
    ERR(REG_EPAREN);
697
0
  }
698
699
  /* optimize out simple cases */
700
137
  if (branch == branches)
701
131
  {             /* only one branch */
702
131
    assert(branch->right == NULL);
703
131
    t = branch->left;
704
131
    branch->left = NULL;
705
131
    freesubre(v, branches);
706
131
    branches = t;
707
131
  }
708
6
  else if (!MESSY(branches->flags))
709
6
  {             /* no interesting innards */
710
6
    freesubre(v, branches->left);
711
6
    branches->left = NULL;
712
6
    freesubre(v, branches->right);
713
6
    branches->right = NULL;
714
6
    branches->op = '=';
715
6
  }
716
717
137
  return branches;
718
137
}
719
720
/*
721
 * parsebranch - parse one branch of an RE
722
 *
723
 * This mostly manages concatenation, working closely with parseqatom().
724
 * Concatenated things are bundled up as much as possible, with separate
725
 * ',' nodes introduced only when necessary due to substructure.
726
 */
727
static struct subre *
728
parsebranch(struct vars *v,
729
      int stopper,    /* EOS or ')' */
730
      int type,     /* LACON (lookaround subRE) or PLAIN */
731
      struct state *left, /* leftmost state */
732
      struct state *right,  /* rightmost state */
733
      int partial)    /* is this only part of a branch? */
734
176
{
735
176
  struct state *lp;     /* left end of current construct */
736
176
  int     seencontent;  /* is there anything in this branch yet? */
737
176
  struct subre *t;
738
739
176
  lp = left;
740
176
  seencontent = 0;
741
176
  t = subre(v, '=', 0, left, right);  /* op '=' is tentative */
742
176
  NOERRN();
743
913
  while (!SEE('|') && !SEE(stopper) && !SEE(EOS))
744
737
  {
745
737
    if (seencontent)
746
561
    {           /* implicit concat operator */
747
561
      lp = newstate(v->nfa);
748
561
      NOERRN();
749
561
      moveins(v->nfa, right, lp);
750
561
    }
751
737
    seencontent = 1;
752
753
    /* NB, recursion in parseqatom() may swallow rest of branch */
754
737
    parseqatom(v, stopper, type, lp, right, t);
755
737
    NOERRN();
756
737
  }
757
758
176
  if (!seencontent)
759
0
  {             /* empty branch */
760
0
    if (!partial)
761
0
      NOTE(REG_UUNSPEC);
762
0
    assert(lp == left);
763
0
    EMPTYARC(left, right);
764
0
  }
765
766
176
  return t;
767
176
}
768
769
/*
770
 * parseqatom - parse one quantified atom or constraint of an RE
771
 *
772
 * The bookkeeping near the end cooperates very closely with parsebranch();
773
 * in particular, it contains a recursion that can involve parsing the rest
774
 * of the branch, making this function's name somewhat inaccurate.
775
 */
776
static void
777
parseqatom(struct vars *v,
778
       int stopper,     /* EOS or ')' */
779
       int type,      /* LACON (lookaround subRE) or PLAIN */
780
       struct state *lp,  /* left state to hang it on */
781
       struct state *rp,  /* right state to hang it on */
782
       struct subre *top) /* subtree top */
783
737
{
784
737
  struct state *s;      /* temporaries for new states */
785
737
  struct state *s2;
786
787
107
#define  ARCV(t, val)  newarc(v->nfa, t, val, lp, rp)
788
737
  int     m,
789
737
        n;
790
737
  struct subre *atom;     /* atom's subtree */
791
737
  struct subre *t;
792
737
  int     cap;      /* capturing parens? */
793
737
  int     latype;     /* lookaround constraint type */
794
737
  int     subno;      /* capturing-parens or backref number */
795
737
  int     atomtype;
796
737
  int     qprefer;    /* quantifier short/long preference */
797
737
  int     f;
798
737
  struct subre **atomp;   /* where the pointer to atom is */
799
800
  /* initial bookkeeping */
801
737
  atom = NULL;
802
737
  assert(lp->nouts == 0);   /* must string new code */
803
737
  assert(rp->nins == 0);    /* between lp and rp */
804
737
  subno = 0;          /* just to shut lint up */
805
806
  /* an atom or constraint... */
807
737
  atomtype = v->nexttype;
808
737
  switch (atomtype)
809
737
  {
810
      /* first, constraints, which end by returning */
811
42
    case '^':
812
42
      ARCV('^', 1);
813
42
      if (v->cflags & REG_NLANCH)
814
13
        ARCV(BEHIND, v->nlcolor);
815
42
      NEXT();
816
42
      return;
817
0
      break;
818
39
    case '$':
819
39
      ARCV('$', 1);
820
39
      if (v->cflags & REG_NLANCH)
821
13
        ARCV(AHEAD, v->nlcolor);
822
39
      NEXT();
823
39
      return;
824
0
      break;
825
0
    case SBEGIN:
826
0
      ARCV('^', 1);   /* BOL */
827
0
      ARCV('^', 0);   /* or BOS */
828
0
      NEXT();
829
0
      return;
830
0
      break;
831
0
    case SEND:
832
0
      ARCV('$', 1);   /* EOL */
833
0
      ARCV('$', 0);   /* or EOS */
834
0
      NEXT();
835
0
      return;
836
0
      break;
837
0
    case '<':
838
0
      wordchrs(v);    /* does NEXT() */
839
0
      s = newstate(v->nfa);
840
0
      NOERR();
841
0
      nonword(v, BEHIND, lp, s);
842
0
      word(v, AHEAD, s, rp);
843
0
      return;
844
0
      break;
845
0
    case '>':
846
0
      wordchrs(v);    /* does NEXT() */
847
0
      s = newstate(v->nfa);
848
0
      NOERR();
849
0
      word(v, BEHIND, lp, s);
850
0
      nonword(v, AHEAD, s, rp);
851
0
      return;
852
0
      break;
853
0
    case WBDRY:
854
0
      wordchrs(v);    /* does NEXT() */
855
0
      s = newstate(v->nfa);
856
0
      NOERR();
857
0
      nonword(v, BEHIND, lp, s);
858
0
      word(v, AHEAD, s, rp);
859
0
      s = newstate(v->nfa);
860
0
      NOERR();
861
0
      word(v, BEHIND, lp, s);
862
0
      nonword(v, AHEAD, s, rp);
863
0
      return;
864
0
      break;
865
0
    case NWBDRY:
866
0
      wordchrs(v);    /* does NEXT() */
867
0
      s = newstate(v->nfa);
868
0
      NOERR();
869
0
      word(v, BEHIND, lp, s);
870
0
      word(v, AHEAD, s, rp);
871
0
      s = newstate(v->nfa);
872
0
      NOERR();
873
0
      nonword(v, BEHIND, lp, s);
874
0
      nonword(v, AHEAD, s, rp);
875
0
      return;
876
0
      break;
877
3
    case LACON:       /* lookaround constraint */
878
3
      latype = v->nextvalue;
879
3
      NEXT();
880
3
      s = newstate(v->nfa);
881
3
      s2 = newstate(v->nfa);
882
3
      NOERR();
883
3
      t = parse(v, ')', LACON, s, s2);
884
3
      freesubre(v, t);  /* internal structure irrelevant */
885
3
      NOERR();
886
3
      assert(SEE(')'));
887
3
      NEXT();
888
3
      processlacon(v, s, s2, latype, lp, rp);
889
3
      return;
890
0
      break;
891
      /* then errors, to get them out of the way */
892
0
    case '*':
893
0
    case '+':
894
0
    case '?':
895
0
    case '{':
896
0
      ERR(REG_BADRPT);
897
0
      return;
898
0
      break;
899
0
    default:
900
0
      ERR(REG_ASSERT);
901
0
      return;
902
0
      break;
903
      /* then plain characters, and minor variants on that theme */
904
0
    case ')':       /* unbalanced paren */
905
0
      if ((v->cflags & REG_ADVANCED) != REG_EXTENDED)
906
0
      {
907
0
        ERR(REG_EPAREN);
908
0
        return;
909
0
      }
910
      /* legal in EREs due to specification botch */
911
0
      NOTE(REG_UPBOTCH);
912
      /* fall through into case PLAIN */
913
0
      switch_fallthrough();
914
567
    case PLAIN:
915
567
      onechr(v, v->nextvalue, lp, rp);
916
567
      okcolors(v->nfa, v->cm);
917
567
      NOERR();
918
567
      NEXT();
919
567
      break;
920
35
    case '[':
921
35
      if (v->nextvalue == 1)
922
35
        bracket(v, lp, rp);
923
0
      else
924
0
        cbracket(v, lp, rp);
925
35
      assert(SEE(']') || ISERR());
926
35
      NEXT();
927
35
      break;
928
17
    case '.':
929
17
      rainbow(v->nfa, v->cm, PLAIN,
930
17
          (v->cflags & REG_NLSTOP) ? v->nlcolor : COLORLESS,
931
17
          lp, rp);
932
17
      NEXT();
933
17
      break;
934
      /* and finally the ugly stuff */
935
34
    case '(':       /* value flags as capturing or non */
936
34
      cap = (type == LACON) ? 0 : v->nextvalue;
937
34
      if (cap)
938
34
      {
939
34
        v->nsubexp++;
940
34
        subno = v->nsubexp;
941
34
        if ((size_t) subno >= v->nsubs)
942
0
          moresubs(v, subno);
943
34
        assert((size_t) subno < v->nsubs);
944
34
      }
945
0
      else
946
0
        atomtype = PLAIN; /* something that's not '(' */
947
34
      NEXT();
948
      /* need new endpoints because tree will contain pointers */
949
34
      s = newstate(v->nfa);
950
34
      s2 = newstate(v->nfa);
951
34
      NOERR();
952
34
      EMPTYARC(lp, s);
953
34
      EMPTYARC(s2, rp);
954
34
      NOERR();
955
34
      atom = parse(v, ')', type, s, s2);
956
34
      assert(SEE(')') || ISERR());
957
34
      NEXT();
958
34
      NOERR();
959
34
      if (cap)
960
34
      {
961
34
        v->subs[subno] = atom;
962
34
        t = subre(v, '(', atom->flags | CAP, lp, rp);
963
34
        NOERR();
964
34
        t->subno = subno;
965
34
        t->left = atom;
966
34
        atom = t;
967
34
      }
968
      /* postpone everything else pending possible {0} */
969
34
      break;
970
0
    case BACKREF:     /* the Feature From The Black Lagoon */
971
0
      INSIST(type != LACON, REG_ESUBREG);
972
0
      INSIST(v->nextvalue < v->nsubs, REG_ESUBREG);
973
0
      INSIST(v->subs[v->nextvalue] != NULL, REG_ESUBREG);
974
0
      NOERR();
975
0
      assert(v->nextvalue > 0);
976
0
      atom = subre(v, 'b', BACKR, lp, rp);
977
0
      NOERR();
978
0
      subno = v->nextvalue;
979
0
      atom->subno = subno;
980
0
      EMPTYARC(lp, rp); /* temporarily, so there's something */
981
0
      NEXT();
982
0
      break;
983
653
  }
984
985
  /* ...and an atom may be followed by a quantifier */
986
653
  switch (v->nexttype)
987
653
  {
988
20
    case '*':
989
20
      m = 0;
990
20
      n = DUPINF;
991
20
      qprefer = (v->nextvalue) ? LONGER : SHORTER;
992
20
      NEXT();
993
20
      break;
994
10
    case '+':
995
10
      m = 1;
996
10
      n = DUPINF;
997
10
      qprefer = (v->nextvalue) ? LONGER : SHORTER;
998
10
      NEXT();
999
10
      break;
1000
3
    case '?':
1001
3
      m = 0;
1002
3
      n = 1;
1003
3
      qprefer = (v->nextvalue) ? LONGER : SHORTER;
1004
3
      NEXT();
1005
3
      break;
1006
34
    case '{':
1007
34
      NEXT();
1008
34
      m = scannum(v);
1009
34
      if (EAT(','))
1010
0
      {
1011
0
        if (SEE(DIGIT))
1012
0
          n = scannum(v);
1013
0
        else
1014
0
          n = DUPINF;
1015
0
        if (m > n)
1016
0
        {
1017
0
          ERR(REG_BADBR);
1018
0
          return;
1019
0
        }
1020
        /* {m,n} exercises preference, even if it's {m,m} */
1021
0
        qprefer = (v->nextvalue) ? LONGER : SHORTER;
1022
0
      }
1023
34
      else
1024
34
      {
1025
34
        n = m;
1026
        /* {m} passes operand's preference through */
1027
34
        qprefer = 0;
1028
34
      }
1029
34
      if (!SEE('}'))
1030
0
      {         /* catches errors too */
1031
0
        ERR(REG_BADBR);
1032
0
        return;
1033
0
      }
1034
34
      NEXT();
1035
34
      break;
1036
586
    default:        /* no quantifier */
1037
586
      m = n = 1;
1038
586
      qprefer = 0;
1039
586
      break;
1040
653
  }
1041
1042
  /* annoying special case:  {0} or {0,0} cancels everything */
1043
653
  if (m == 0 && n == 0)
1044
0
  {
1045
0
    if (atom != NULL)
1046
0
      freesubre(v, atom);
1047
0
    if (atomtype == '(')
1048
0
      v->subs[subno] = NULL;
1049
0
    delsub(v->nfa, lp, rp);
1050
0
    EMPTYARC(lp, rp);
1051
0
    return;
1052
0
  }
1053
1054
  /* if not a messy case, avoid hard part */
1055
653
  assert(!MESSY(top->flags));
1056
653
  f = top->flags | qprefer | ((atom != NULL) ? atom->flags : 0);
1057
653
  if (atomtype != '(' && atomtype != BACKREF && !MESSY(UP(f)))
1058
619
  {
1059
619
    if (!(m == 1 && n == 1))
1060
61
      repeat(v, lp, rp, m, n);
1061
619
    if (atom != NULL)
1062
0
      freesubre(v, atom);
1063
619
    top->flags = f;
1064
619
    return;
1065
619
  }
1066
1067
  /*
1068
   * hard part:  something messy
1069
   *
1070
   * That is, capturing parens, back reference, short/long clash, or an atom
1071
   * with substructure containing one of those.
1072
   */
1073
1074
  /* now we'll need a subre for the contents even if they're boring */
1075
34
  if (atom == NULL)
1076
0
  {
1077
0
    atom = subre(v, '=', 0, lp, rp);
1078
0
    NOERR();
1079
0
  }
1080
1081
  /*----------
1082
   * Prepare a general-purpose state skeleton.
1083
   *
1084
   * In the no-backrefs case, we want this:
1085
   *
1086
   * [lp] ---> [s] ---prefix---> [begin] ---atom---> [end] ---rest---> [rp]
1087
   *
1088
   * where prefix is some repetitions of atom.  In the general case we need
1089
   *
1090
   * [lp] ---> [s] ---iterator---> [s2] ---rest---> [rp]
1091
   *
1092
   * where the iterator wraps around [begin] ---atom---> [end]
1093
   *
1094
   * We make the s state here for both cases; s2 is made below if needed
1095
   *----------
1096
   */
1097
34
  s = newstate(v->nfa);   /* first, new endpoints for the atom */
1098
34
  s2 = newstate(v->nfa);
1099
34
  NOERR();
1100
34
  moveouts(v->nfa, lp, s);
1101
34
  moveins(v->nfa, rp, s2);
1102
34
  NOERR();
1103
34
  atom->begin = s;
1104
34
  atom->end = s2;
1105
34
  s = newstate(v->nfa);   /* set up starting state */
1106
34
  NOERR();
1107
34
  EMPTYARC(lp, s);
1108
34
  NOERR();
1109
1110
  /* break remaining subRE into x{...} and what follows */
1111
34
  t = subre(v, '.', COMBINE(qprefer, atom->flags), lp, rp);
1112
34
  NOERR();
1113
34
  t->left = atom;
1114
34
  atomp = &t->left;
1115
1116
  /* here we should recurse... but we must postpone that to the end */
1117
1118
  /* split top into prefix and remaining */
1119
34
  assert(top->op == '=' && top->left == NULL && top->right == NULL);
1120
34
  top->left = subre(v, '=', top->flags, top->begin, lp);
1121
34
  NOERR();
1122
34
  top->op = '.';
1123
34
  top->right = t;
1124
1125
  /* if it's a backref, now is the time to replicate the subNFA */
1126
34
  if (atomtype == BACKREF)
1127
0
  {
1128
0
    assert(atom->begin->nouts == 1);  /* just the EMPTY */
1129
0
    delsub(v->nfa, atom->begin, atom->end);
1130
0
    assert(v->subs[subno] != NULL);
1131
1132
    /*
1133
     * And here's why the recursion got postponed: it must wait until the
1134
     * skeleton is filled in, because it may hit a backref that wants to
1135
     * copy the filled-in skeleton.
1136
     */
1137
0
    dupnfa(v->nfa, v->subs[subno]->begin, v->subs[subno]->end,
1138
0
         atom->begin, atom->end);
1139
0
    NOERR();
1140
0
  }
1141
1142
  /*
1143
   * It's quantifier time.  If the atom is just a backref, we'll let it deal
1144
   * with quantifiers internally.
1145
   */
1146
34
  if (atomtype == BACKREF)
1147
0
  {
1148
    /* special case:  backrefs have internal quantifiers */
1149
0
    EMPTYARC(s, atom->begin); /* empty prefix */
1150
    /* just stuff everything into atom */
1151
0
    repeat(v, atom->begin, atom->end, m, n);
1152
0
    atom->min = (short) m;
1153
0
    atom->max = (short) n;
1154
0
    atom->flags |= COMBINE(qprefer, atom->flags);
1155
    /* rest of branch can be strung starting from atom->end */
1156
0
    s2 = atom->end;
1157
0
  }
1158
34
  else if (m == 1 && n == 1)
1159
28
  {
1160
    /* no/vacuous quantifier:  done */
1161
28
    EMPTYARC(s, atom->begin); /* empty prefix */
1162
    /* rest of branch can be strung starting from atom->end */
1163
28
    s2 = atom->end;
1164
28
  }
1165
6
  else if (m > 0 && !(atom->flags & BACKR))
1166
3
  {
1167
    /*
1168
     * If there's no backrefs involved, we can turn x{m,n} into
1169
     * x{m-1,n-1}x, with capturing parens in only the second x.  This is
1170
     * valid because we only care about capturing matches from the final
1171
     * iteration of the quantifier.  It's a win because we can implement
1172
     * the backref-free left side as a plain DFA node, since we don't
1173
     * really care where its submatches are.
1174
     */
1175
3
    dupnfa(v->nfa, atom->begin, atom->end, s, atom->begin);
1176
3
    assert(m >= 1 && m != DUPINF && n >= 1);
1177
3
    repeat(v, s, atom->begin, m - 1, (n == DUPINF) ? n : n - 1);
1178
3
    f = COMBINE(qprefer, atom->flags);
1179
3
    t = subre(v, '.', f, s, atom->end); /* prefix and atom */
1180
3
    NOERR();
1181
3
    t->left = subre(v, '=', PREF(f), s, atom->begin);
1182
3
    NOERR();
1183
3
    t->right = atom;
1184
3
    *atomp = t;
1185
    /* rest of branch can be strung starting from atom->end */
1186
3
    s2 = atom->end;
1187
3
  }
1188
3
  else
1189
3
  {
1190
    /* general case: need an iteration node */
1191
3
    s2 = newstate(v->nfa);
1192
3
    NOERR();
1193
3
    moveouts(v->nfa, atom->end, s2);
1194
3
    NOERR();
1195
3
    dupnfa(v->nfa, atom->begin, atom->end, s, s2);
1196
3
    repeat(v, s, s2, m, n);
1197
3
    f = COMBINE(qprefer, atom->flags);
1198
3
    t = subre(v, '*', f, s, s2);
1199
3
    NOERR();
1200
3
    t->min = (short) m;
1201
3
    t->max = (short) n;
1202
3
    t->left = atom;
1203
3
    *atomp = t;
1204
    /* rest of branch is to be strung from iteration's end state */
1205
3
  }
1206
1207
  /* and finally, look after that postponed recursion */
1208
34
  t = top->right;
1209
34
  if (!(SEE('|') || SEE(stopper) || SEE(EOS)))
1210
30
    t->right = parsebranch(v, stopper, type, s2, rp, 1);
1211
4
  else
1212
4
  {
1213
4
    EMPTYARC(s2, rp);
1214
4
    t->right = subre(v, '=', 0, s2, rp);
1215
4
  }
1216
34
  NOERR();
1217
34
  assert(SEE('|') || SEE(stopper) || SEE(EOS));
1218
34
  t->flags |= COMBINE(t->flags, t->right->flags);
1219
34
  top->flags |= COMBINE(top->flags, t->flags);
1220
34
}
1221
1222
/*
1223
 * nonword - generate arcs for non-word-character ahead or behind
1224
 */
1225
static void
1226
nonword(struct vars *v,
1227
    int dir,        /* AHEAD or BEHIND */
1228
    struct state *lp,
1229
    struct state *rp)
1230
0
{
1231
0
  int     anchor = (dir == AHEAD) ? '$' : '^';
1232
1233
0
  assert(dir == AHEAD || dir == BEHIND);
1234
0
  newarc(v->nfa, anchor, 1, lp, rp);
1235
0
  newarc(v->nfa, anchor, 0, lp, rp);
1236
0
  colorcomplement(v->nfa, v->cm, dir, v->wordchrs, lp, rp);
1237
  /* (no need for special attention to \n) */
1238
0
}
1239
1240
/*
1241
 * word - generate arcs for word character ahead or behind
1242
 */
1243
static void
1244
word(struct vars *v,
1245
   int dir,         /* AHEAD or BEHIND */
1246
   struct state *lp,
1247
   struct state *rp)
1248
0
{
1249
0
  assert(dir == AHEAD || dir == BEHIND);
1250
0
  cloneouts(v->nfa, v->wordchrs, lp, rp, dir);
1251
  /* (no need for special attention to \n) */
1252
0
}
1253
1254
/*
1255
 * scannum - scan a number
1256
 */
1257
static int            /* value, <= DUPMAX */
1258
scannum(struct vars *v)
1259
34
{
1260
34
  int     n = 0;
1261
1262
80
  while (SEE(DIGIT) && n < DUPMAX)
1263
46
  {
1264
46
    n = n * 10 + v->nextvalue;
1265
46
    NEXT();
1266
46
  }
1267
34
  if (SEE(DIGIT) || n > DUPMAX)
1268
0
  {
1269
0
    ERR(REG_BADBR);
1270
0
    return 0;
1271
0
  }
1272
34
  return n;
1273
34
}
1274
1275
/*
1276
 * repeat - replicate subNFA for quantifiers
1277
 *
1278
 * The sub-NFA strung from lp to rp is modified to represent m to n
1279
 * repetitions of its initial contents.
1280
 *
1281
 * The duplication sequences used here are chosen carefully so that any
1282
 * pointers starting out pointing into the subexpression end up pointing into
1283
 * the last occurrence.  (Note that it may not be strung between the same
1284
 * left and right end states, however!)  This used to be important for the
1285
 * subRE tree, although the important bits are now handled by the in-line
1286
 * code in parse(), and when this is called, it doesn't matter any more.
1287
 */
1288
static void
1289
repeat(struct vars *v,
1290
     struct state *lp,
1291
     struct state *rp,
1292
     int m,
1293
     int n)
1294
210
{
1295
286
#define  SOME  2
1296
30
#define  INF   3
1297
420
#define  PAIR(x, y)  ((x)*4 + (y))
1298
420
#define  REDUCE(x)   ( ((x) == DUPINF) ? INF : (((x) > 1) ? SOME : (x)) )
1299
210
  const int rm = REDUCE(m);
1300
210
  const int rn = REDUCE(n);
1301
210
  struct state *s;
1302
210
  struct state *s2;
1303
1304
210
  switch (PAIR(rm, rn))
1305
210
  {
1306
0
    case PAIR(0, 0):   /* empty string */
1307
0
      delsub(v->nfa, lp, rp);
1308
0
      EMPTYARC(lp, rp);
1309
0
      break;
1310
3
    case PAIR(0, 1):   /* do as x| */
1311
3
      EMPTYARC(lp, rp);
1312
3
      break;
1313
0
    case PAIR(0, SOME):    /* do as x{1,n}| */
1314
0
      repeat(v, lp, rp, 1, n);
1315
0
      NOERR();
1316
0
      EMPTYARC(lp, rp);
1317
0
      break;
1318
20
    case PAIR(0, INF):   /* loop x around */
1319
20
      s = newstate(v->nfa);
1320
20
      NOERR();
1321
20
      moveouts(v->nfa, lp, s);
1322
20
      moveins(v->nfa, rp, s);
1323
20
      EMPTYARC(lp, s);
1324
20
      EMPTYARC(s, rp);
1325
20
      break;
1326
34
    case PAIR(1, 1):   /* no action required */
1327
34
      break;
1328
0
    case PAIR(1, SOME):    /* do as x{0,n-1}x = (x{1,n-1}|)x */
1329
0
      s = newstate(v->nfa);
1330
0
      NOERR();
1331
0
      moveouts(v->nfa, lp, s);
1332
0
      dupnfa(v->nfa, s, rp, lp, s);
1333
0
      NOERR();
1334
0
      repeat(v, lp, s, 1, n - 1);
1335
0
      NOERR();
1336
0
      EMPTYARC(lp, s);
1337
0
      break;
1338
10
    case PAIR(1, INF):   /* add loopback arc */
1339
10
      s = newstate(v->nfa);
1340
10
      s2 = newstate(v->nfa);
1341
10
      NOERR();
1342
10
      moveouts(v->nfa, lp, s);
1343
10
      moveins(v->nfa, rp, s2);
1344
10
      EMPTYARC(lp, s);
1345
10
      EMPTYARC(s2, rp);
1346
10
      EMPTYARC(s2, s);
1347
10
      break;
1348
143
    case PAIR(SOME, SOME): /* do as x{m-1,n-1}x */
1349
143
      s = newstate(v->nfa);
1350
143
      NOERR();
1351
143
      moveouts(v->nfa, lp, s);
1352
143
      dupnfa(v->nfa, s, rp, lp, s);
1353
143
      NOERR();
1354
143
      repeat(v, lp, s, m - 1, n - 1);
1355
143
      break;
1356
0
    case PAIR(SOME, INF):  /* do as x{m-1,}x */
1357
0
      s = newstate(v->nfa);
1358
0
      NOERR();
1359
0
      moveouts(v->nfa, lp, s);
1360
0
      dupnfa(v->nfa, s, rp, lp, s);
1361
0
      NOERR();
1362
0
      repeat(v, lp, s, m - 1, n);
1363
0
      break;
1364
0
    default:
1365
0
      ERR(REG_ASSERT);
1366
0
      break;
1367
210
  }
1368
210
}
1369
1370
/*
1371
 * bracket - handle non-complemented bracket expression
1372
 * Also called from cbracket for complemented bracket expressions.
1373
 */
1374
static void
1375
bracket(struct vars *v,
1376
    struct state *lp,
1377
    struct state *rp)
1378
35
{
1379
35
  assert(SEE('['));
1380
35
  NEXT();
1381
138
  while (!SEE(']') && !SEE(EOS))
1382
103
    brackpart(v, lp, rp);
1383
35
  assert(SEE(']') || ISERR());
1384
35
  okcolors(v->nfa, v->cm);
1385
35
}
1386
1387
/*
1388
 * cbracket - handle complemented bracket expression
1389
 * We do it by calling bracket() with dummy endpoints, and then complementing
1390
 * the result.  The alternative would be to invoke rainbow(), and then delete
1391
 * arcs as the b.e. is seen... but that gets messy.
1392
 */
1393
static void
1394
cbracket(struct vars *v,
1395
     struct state *lp,
1396
     struct state *rp)
1397
0
{
1398
0
  struct state *left = newstate(v->nfa);
1399
0
  struct state *right = newstate(v->nfa);
1400
1401
0
  NOERR();
1402
0
  bracket(v, left, right);
1403
0
  if (v->cflags & REG_NLSTOP)
1404
0
    newarc(v->nfa, PLAIN, v->nlcolor, left, right);
1405
0
  NOERR();
1406
1407
0
  assert(lp->nouts == 0);   /* all outarcs will be ours */
1408
1409
  /*
1410
   * Easy part of complementing, and all there is to do since the MCCE code
1411
   * was removed.
1412
   */
1413
0
  colorcomplement(v->nfa, v->cm, PLAIN, left, lp, rp);
1414
0
  NOERR();
1415
0
  dropstate(v->nfa, left);
1416
0
  assert(right->nins == 0);
1417
0
  freestate(v->nfa, right);
1418
0
}
1419
1420
/*
1421
 * brackpart - handle one item (or range) within a bracket expression
1422
 */
1423
static void
1424
brackpart(struct vars *v,
1425
      struct state *lp,
1426
      struct state *rp)
1427
103
{
1428
103
  chr     startc;
1429
103
  chr     endc;
1430
103
  struct cvec *cv;
1431
103
  const chr  *startp;
1432
103
  const chr  *endp;
1433
103
  chr     c[1];
1434
1435
  /* parse something, get rid of special cases, take shortcuts */
1436
103
  switch (v->nexttype)
1437
103
  {
1438
0
    case RANGE:       /* a-b-c or other botch */
1439
0
      ERR(REG_ERANGE);
1440
0
      return;
1441
0
      break;
1442
96
    case PLAIN:
1443
96
      c[0] = v->nextvalue;
1444
96
      NEXT();
1445
      /* shortcut for ordinary chr (not range) */
1446
96
      if (!SEE(RANGE))
1447
81
      {
1448
81
        onechr(v, c[0], lp, rp);
1449
81
        return;
1450
81
      }
1451
15
      startc = element(v, c, c + 1);
1452
15
      NOERR();
1453
15
      break;
1454
0
    case COLLEL:
1455
0
      startp = v->now;
1456
0
      endp = scanplain(v);
1457
0
      INSIST(startp < endp, REG_ECOLLATE);
1458
0
      NOERR();
1459
0
      startc = element(v, startp, endp);
1460
0
      NOERR();
1461
0
      break;
1462
0
    case ECLASS:
1463
0
      startp = v->now;
1464
0
      endp = scanplain(v);
1465
0
      INSIST(startp < endp, REG_ECOLLATE);
1466
0
      NOERR();
1467
0
      startc = element(v, startp, endp);
1468
0
      NOERR();
1469
0
      cv = eclass(v, startc, (v->cflags & REG_ICASE));
1470
0
      NOERR();
1471
0
      subcolorcvec(v, cv, lp, rp);
1472
0
      return;
1473
0
      break;
1474
7
    case CCLASS:
1475
7
      startp = v->now;
1476
7
      endp = scanplain(v);
1477
7
      INSIST(startp < endp, REG_ECTYPE);
1478
7
      NOERR();
1479
7
      cv = cclass(v, startp, endp, (v->cflags & REG_ICASE));
1480
7
      NOERR();
1481
7
      subcolorcvec(v, cv, lp, rp);
1482
7
      return;
1483
0
      break;
1484
0
    default:
1485
0
      ERR(REG_ASSERT);
1486
0
      return;
1487
0
      break;
1488
15
  }
1489
1490
15
  if (SEE(RANGE))
1491
15
  {
1492
15
    NEXT();
1493
15
    switch (v->nexttype)
1494
15
    {
1495
15
      case PLAIN:
1496
15
      case RANGE:
1497
15
        c[0] = v->nextvalue;
1498
15
        NEXT();
1499
15
        endc = element(v, c, c + 1);
1500
15
        NOERR();
1501
15
        break;
1502
0
      case COLLEL:
1503
0
        startp = v->now;
1504
0
        endp = scanplain(v);
1505
0
        INSIST(startp < endp, REG_ECOLLATE);
1506
0
        NOERR();
1507
0
        endc = element(v, startp, endp);
1508
0
        NOERR();
1509
0
        break;
1510
0
      default:
1511
0
        ERR(REG_ERANGE);
1512
0
        return;
1513
0
        break;
1514
0
    }
1515
0
  }
1516
0
  else
1517
0
    endc = startc;
1518
1519
  /*
1520
   * Ranges are unportable.  Actually, standard C does guarantee that digits
1521
   * are contiguous, but making that an exception is just too complicated.
1522
   */
1523
15
  if (startc != endc)
1524
15
    NOTE(REG_UUNPORT);
1525
15
  cv = range(v, startc, endc, (v->cflags & REG_ICASE));
1526
15
  NOERR();
1527
15
  subcolorcvec(v, cv, lp, rp);
1528
15
}
1529
1530
/*
1531
 * scanplain - scan PLAIN contents of [. etc.
1532
 *
1533
 * Certain bits of trickery in lex.c know that this code does not try
1534
 * to look past the final bracket of the [. etc.
1535
 */
1536
static const chr *        /* just after end of sequence */
1537
scanplain(struct vars *v)
1538
7
{
1539
7
  const chr  *endp;
1540
1541
7
  assert(SEE(COLLEL) || SEE(ECLASS) || SEE(CCLASS));
1542
7
  NEXT();
1543
1544
7
  endp = v->now;
1545
42
  while (SEE(PLAIN))
1546
35
  {
1547
35
    endp = v->now;
1548
35
    NEXT();
1549
35
  }
1550
1551
7
  assert(SEE(END) || ISERR());
1552
7
  NEXT();
1553
1554
7
  return endp;
1555
7
}
1556
1557
/*
1558
 * onechr - fill in arcs for a plain character, and possible case complements
1559
 * This is mostly a shortcut for efficient handling of the common case.
1560
 */
1561
static void
1562
onechr(struct vars *v,
1563
     chr c,
1564
     struct state *lp,
1565
     struct state *rp)
1566
648
{
1567
648
  if (!(v->cflags & REG_ICASE))
1568
433
  {
1569
433
    color   lastsubcolor = COLORLESS;
1570
1571
433
    subcoloronechr(v, c, lp, rp, &lastsubcolor);
1572
433
    return;
1573
433
  }
1574
1575
  /* rats, need general case anyway... */
1576
215
  subcolorcvec(v, allcases(v, c), lp, rp);
1577
215
}
1578
1579
/*
1580
 * wordchrs - set up word-chr list for word-boundary stuff, if needed
1581
 *
1582
 * The list is kept as a bunch of arcs between two dummy states; it's
1583
 * disposed of by the unreachable-states sweep in NFA optimization.
1584
 * Does NEXT().  Must not be called from any unusual lexical context.
1585
 * This should be reconciled with the \w etc. handling in lex.c, and
1586
 * should be cleaned up to reduce dependencies on input scanning.
1587
 */
1588
static void
1589
wordchrs(struct vars *v)
1590
0
{
1591
0
  struct state *left;
1592
0
  struct state *right;
1593
1594
0
  if (v->wordchrs != NULL)
1595
0
  {
1596
0
    NEXT();         /* for consistency */
1597
0
    return;
1598
0
  }
1599
1600
0
  left = newstate(v->nfa);
1601
0
  right = newstate(v->nfa);
1602
0
  NOERR();
1603
  /* fine point:  implemented with [::], and lexer will set REG_ULOCALE */
1604
0
  lexword(v);
1605
0
  NEXT();
1606
0
  assert(v->savenow != NULL && SEE('['));
1607
0
  bracket(v, left, right);
1608
0
  assert((v->savenow != NULL && SEE(']')) || ISERR());
1609
0
  NEXT();
1610
0
  NOERR();
1611
0
  v->wordchrs = left;
1612
0
}
1613
1614
/*
1615
 * processlacon - generate the NFA representation of a LACON
1616
 *
1617
 * In the general case this is just newlacon() + newarc(), but some cases
1618
 * can be optimized.
1619
 */
1620
static void
1621
processlacon(struct vars *v,
1622
       struct state *begin, /* start of parsed LACON sub-re */
1623
       struct state *end, /* end of parsed LACON sub-re */
1624
       int latype,
1625
       struct state *lp,  /* left state to hang it on */
1626
       struct state *rp)  /* right state to hang it on */
1627
3
{
1628
3
  struct state *s1;
1629
3
  int     n;
1630
1631
  /*
1632
   * Check for lookaround RE consisting of a single plain color arc (or set
1633
   * of arcs); this would typically be a simple chr or a bracket expression.
1634
   */
1635
3
  s1 = single_color_transition(begin, end);
1636
3
  switch (latype)
1637
3
  {
1638
0
    case LATYPE_AHEAD_POS:
1639
      /* If lookahead RE is just colorset C, convert to AHEAD(C) */
1640
0
      if (s1 != NULL)
1641
0
      {
1642
0
        cloneouts(v->nfa, s1, lp, rp, AHEAD);
1643
0
        return;
1644
0
      }
1645
0
      break;
1646
3
    case LATYPE_AHEAD_NEG:
1647
      /* If lookahead RE is just colorset C, convert to AHEAD(^C)|$ */
1648
3
      if (s1 != NULL)
1649
0
      {
1650
0
        colorcomplement(v->nfa, v->cm, AHEAD, s1, lp, rp);
1651
0
        newarc(v->nfa, '$', 1, lp, rp);
1652
0
        newarc(v->nfa, '$', 0, lp, rp);
1653
0
        return;
1654
0
      }
1655
3
      break;
1656
0
    case LATYPE_BEHIND_POS:
1657
      /* If lookbehind RE is just colorset C, convert to BEHIND(C) */
1658
0
      if (s1 != NULL)
1659
0
      {
1660
0
        cloneouts(v->nfa, s1, lp, rp, BEHIND);
1661
0
        return;
1662
0
      }
1663
0
      break;
1664
0
    case LATYPE_BEHIND_NEG:
1665
      /* If lookbehind RE is just colorset C, convert to BEHIND(^C)|^ */
1666
0
      if (s1 != NULL)
1667
0
      {
1668
0
        colorcomplement(v->nfa, v->cm, BEHIND, s1, lp, rp);
1669
0
        newarc(v->nfa, '^', 1, lp, rp);
1670
0
        newarc(v->nfa, '^', 0, lp, rp);
1671
0
        return;
1672
0
      }
1673
0
      break;
1674
0
    default:
1675
0
      assert(NOTREACHED);
1676
3
  }
1677
1678
  /* General case: we need a LACON subre and arc */
1679
3
  n = newlacon(v, begin, end, latype);
1680
3
  newarc(v->nfa, LACON, n, lp, rp);
1681
3
}
1682
1683
/*
1684
 * subre - allocate a subre
1685
 */
1686
static struct subre *
1687
subre(struct vars *v,
1688
    int op,
1689
    int flags,
1690
    struct state *begin,
1691
    struct state *end)
1692
437
{
1693
437
  struct subre *ret = v->treefree;
1694
1695
  /*
1696
   * Checking for stack overflow here is sufficient to protect parse() and
1697
   * its recursive subroutines.
1698
   */
1699
437
  if (STACK_TOO_DEEP(v->re))
1700
0
  {
1701
0
    ERR(REG_ETOOBIG);
1702
0
    return NULL;
1703
0
  }
1704
1705
437
  if (ret != NULL)
1706
52
    v->treefree = ret->left;
1707
385
  else
1708
385
  {
1709
385
    ret = (struct subre *) MALLOC(sizeof(struct subre));
1710
385
    if (ret == NULL)
1711
0
    {
1712
0
      ERR(REG_ESPACE);
1713
0
      return NULL;
1714
0
    }
1715
385
    ret->chain = v->treechain;
1716
385
    v->treechain = ret;
1717
385
  }
1718
1719
437
  assert(strchr("=b|.*(", op) != NULL);
1720
1721
437
  ret->op = op;
1722
437
  ret->flags = flags;
1723
437
  ret->id = 0;        /* will be assigned later */
1724
437
  ret->subno = 0;
1725
437
  ret->min = ret->max = 1;
1726
437
  ret->left = NULL;
1727
437
  ret->right = NULL;
1728
437
  ret->begin = begin;
1729
437
  ret->end = end;
1730
437
  ZAPCNFA(ret->cnfa);
1731
1732
437
  return ret;
1733
437
}
1734
1735
/*
1736
 * freesubre - free a subRE subtree
1737
 */
1738
static void
1739
freesubre(struct vars *v,   /* might be NULL */
1740
      struct subre *sr)
1741
236
{
1742
236
  if (sr == NULL)
1743
0
    return;
1744
1745
236
  if (sr->left != NULL)
1746
31
    freesubre(v, sr->left);
1747
236
  if (sr->right != NULL)
1748
17
    freesubre(v, sr->right);
1749
1750
236
  freesrnode(v, sr);
1751
236
}
1752
1753
/*
1754
 * freesrnode - free one node in a subRE subtree
1755
 */
1756
static void
1757
freesrnode(struct vars *v,    /* might be NULL */
1758
       struct subre *sr)
1759
236
{
1760
236
  if (sr == NULL)
1761
0
    return;
1762
1763
236
  if (!NULLCNFA(sr->cnfa))
1764
78
    freecnfa(&sr->cnfa);
1765
236
  sr->flags = 0;
1766
1767
236
  if (v != NULL && v->treechain != NULL)
1768
158
  {
1769
    /* we're still parsing, maybe we can reuse the subre */
1770
158
    sr->left = v->treefree;
1771
158
    v->treefree = sr;
1772
158
  }
1773
236
  else
1774
78
    FREE(sr);
1775
236
}
1776
1777
/*
1778
 * optst - optimize a subRE subtree
1779
 */
1780
static void
1781
optst(struct vars *v,
1782
    struct subre *t)
1783
100
{
1784
  /*
1785
   * DGP (2007-11-13): I assume it was the programmer's intent to eventually
1786
   * come back and add code to optimize subRE trees, but the routine coded
1787
   * just spends effort traversing the tree and doing nothing. We can do
1788
   * nothing with less effort.
1789
   */
1790
100
  return;
1791
100
}
1792
1793
/*
1794
 * numst - number tree nodes (assigning "id" indexes)
1795
 */
1796
static int            /* next number */
1797
numst(struct subre *t,
1798
    int start)        /* starting point for subtree numbers */
1799
279
{
1800
279
  int     i;
1801
1802
279
  assert(t != NULL);
1803
1804
279
  i = start;
1805
279
  t->id = (short) i++;
1806
279
  if (t->left != NULL)
1807
108
    i = numst(t->left, i);
1808
279
  if (t->right != NULL)
1809
71
    i = numst(t->right, i);
1810
279
  return i;
1811
279
}
1812
1813
/*
1814
 * markst - mark tree nodes as INUSE
1815
 *
1816
 * Note: this is a great deal more subtle than it looks.  During initial
1817
 * parsing of a regex, all subres are linked into the treechain list;
1818
 * discarded ones are also linked into the treefree list for possible reuse.
1819
 * After we are done creating all subres required for a regex, we run markst()
1820
 * then cleanst(), which results in discarding all subres not reachable from
1821
 * v->tree.  We then clear v->treechain, indicating that subres must be found
1822
 * by descending from v->tree.  This changes the behavior of freesubre(): it
1823
 * will henceforth FREE() unwanted subres rather than sticking them into the
1824
 * treefree list.  (Doing that any earlier would result in dangling links in
1825
 * the treechain list.)  This all means that freev() will clean up correctly
1826
 * if invoked before or after markst()+cleanst(); but it would not work if
1827
 * called partway through this state conversion, so we mustn't error out
1828
 * in or between these two functions.
1829
 */
1830
static void
1831
markst(struct subre *t)
1832
279
{
1833
279
  assert(t != NULL);
1834
1835
279
  t->flags |= INUSE;
1836
279
  if (t->left != NULL)
1837
108
    markst(t->left);
1838
279
  if (t->right != NULL)
1839
71
    markst(t->right);
1840
279
}
1841
1842
/*
1843
 * cleanst - free any tree nodes not marked INUSE
1844
 */
1845
static void
1846
cleanst(struct vars *v)
1847
100
{
1848
100
  struct subre *t;
1849
100
  struct subre *next;
1850
1851
485
  for (t = v->treechain; t != NULL; t = next)
1852
385
  {
1853
385
    next = t->chain;
1854
385
    if (!(t->flags & INUSE))
1855
106
      FREE(t);
1856
385
  }
1857
100
  v->treechain = NULL;
1858
100
  v->treefree = NULL;     /* just on general principles */
1859
100
}
1860
1861
/*
1862
 * nfatree - turn a subRE subtree into a tree of compacted NFAs
1863
 */
1864
static long           /* optimize results from top node */
1865
nfatree(struct vars *v,
1866
    struct subre *t,
1867
    FILE *f)        /* for debug output */
1868
279
{
1869
279
  assert(t != NULL && t->begin != NULL);
1870
1871
279
  if (t->left != NULL)
1872
108
    (DISCARD) nfatree(v, t->left, f);
1873
279
  if (t->right != NULL)
1874
71
    (DISCARD) nfatree(v, t->right, f);
1875
1876
279
  return nfanode(v, t, 0, f);
1877
279
}
1878
1879
/*
1880
 * nfanode - do one NFA for nfatree or lacons
1881
 *
1882
 * If converttosearch is true, apply makesearch() to the NFA.
1883
 */
1884
static long           /* optimize results */
1885
nfanode(struct vars *v,
1886
    struct subre *t,
1887
    int converttosearch,
1888
    FILE *f)        /* for debug output */
1889
282
{
1890
282
  struct nfa *nfa;
1891
282
  long    ret = 0;
1892
1893
282
  assert(t->begin != NULL);
1894
1895
#ifdef REG_DEBUG
1896
  if (f != NULL)
1897
  {
1898
    char    idbuf[50];
1899
1900
    fprintf(f, "\n\n\n========= TREE NODE %s ==========\n",
1901
        stid(t, idbuf, sizeof(idbuf)));
1902
  }
1903
#endif
1904
282
  nfa = newnfa(v, v->cm, v->nfa);
1905
282
  NOERRZ();
1906
282
  dupnfa(nfa, t->begin, t->end, nfa->init, nfa->final);
1907
282
  if (!ISERR())
1908
282
    specialcolors(nfa);
1909
282
  if (!ISERR())
1910
282
    ret = optimize(nfa, f);
1911
282
  if (converttosearch && !ISERR())
1912
0
    makesearch(v, nfa);
1913
282
  if (!ISERR())
1914
282
    compact(nfa, &t->cnfa);
1915
1916
282
  freenfa(nfa);
1917
282
  return ret;
1918
282
}
1919
1920
/*
1921
 * newlacon - allocate a lookaround-constraint subRE
1922
 */
1923
static int            /* lacon number */
1924
newlacon(struct vars *v,
1925
     struct state *begin,
1926
     struct state *end,
1927
     int latype)
1928
3
{
1929
3
  int     n;
1930
3
  struct subre *newlacons;
1931
3
  struct subre *sub;
1932
1933
3
  if (v->nlacons == 0)
1934
3
  {
1935
3
    n = 1;          /* skip 0th */
1936
3
    newlacons = (struct subre *) MALLOC(2 * sizeof(struct subre));
1937
3
  }
1938
0
  else
1939
0
  {
1940
0
    n = v->nlacons;
1941
0
    newlacons = (struct subre *) REALLOC(v->lacons,
1942
0
                       (n + 1) * sizeof(struct subre));
1943
0
  }
1944
3
  if (newlacons == NULL)
1945
0
  {
1946
0
    ERR(REG_ESPACE);
1947
0
    return 0;
1948
0
  }
1949
3
  v->lacons = newlacons;
1950
3
  v->nlacons = n + 1;
1951
3
  sub = &v->lacons[n];
1952
3
  sub->begin = begin;
1953
3
  sub->end = end;
1954
3
  sub->subno = latype;
1955
3
  ZAPCNFA(sub->cnfa);
1956
3
  return n;
1957
3
}
1958
1959
/*
1960
 * freelacons - free lookaround-constraint subRE vector
1961
 */
1962
static void
1963
freelacons(struct subre *subs,
1964
       int n)
1965
2
{
1966
2
  struct subre *sub;
1967
2
  int     i;
1968
1969
2
  assert(n > 0);
1970
4
  for (sub = subs + 1, i = n - 1; i > 0; sub++, i--) /* no 0th */
1971
2
    if (!NULLCNFA(sub->cnfa))
1972
2
      freecnfa(&sub->cnfa);
1973
2
  FREE(subs);
1974
2
}
1975
1976
/*
1977
 * rfree - free a whole RE (insides of regfree)
1978
 */
1979
static void
1980
rfree(regex_t *re)
1981
42
{
1982
42
  struct guts *g;
1983
1984
42
  if (re == NULL || re->re_magic != REMAGIC)
1985
0
    return;
1986
1987
42
  re->re_magic = 0;     /* invalidate RE */
1988
42
  g = (struct guts *) re->re_guts;
1989
42
  re->re_guts = NULL;
1990
42
  re->re_fns = NULL;
1991
42
  if (g != NULL)
1992
42
  {
1993
42
    g->magic = 0;
1994
42
    freecm(&g->cmap);
1995
42
    if (g->tree != NULL)
1996
42
      freesubre((struct vars *) NULL, g->tree);
1997
42
    if (g->lacons != NULL)
1998
2
      freelacons(g->lacons, g->nlacons);
1999
42
    if (!NULLCNFA(g->search))
2000
42
      freecnfa(&g->search);
2001
42
    FREE(g);
2002
42
  }
2003
42
}
2004
2005
/*
2006
 * rcancelrequested - check for external request to cancel regex operation
2007
 *
2008
 * Return nonzero to fail the operation with error code REG_CANCEL,
2009
 * zero to keep going
2010
 *
2011
 * The current implementation is Postgres-specific.  If we ever get around
2012
 * to splitting the regex code out as a standalone library, there will need
2013
 * to be some API to let applications define a callback function for this.
2014
 */
2015
static int
2016
rcancelrequested(void)
2017
51.8k
{
2018
51.8k
  return InterruptPending && (QueryCancelPending || ProcDiePending);
2019
51.8k
}
2020
2021
/*
2022
 * rstacktoodeep - check for stack getting dangerously deep
2023
 *
2024
 * Return nonzero to fail the operation with error code REG_ETOOBIG,
2025
 * zero to keep going
2026
 *
2027
 * The current implementation is Postgres-specific.  If we ever get around
2028
 * to splitting the regex code out as a standalone library, there will need
2029
 * to be some API to let applications define a callback function for this.
2030
 */
2031
static int
2032
rstacktoodeep(void)
2033
130k
{
2034
130k
  return stack_is_too_deep();
2035
130k
}
2036
2037
#ifdef REG_DEBUG
2038
2039
/*
2040
 * dump - dump an RE in human-readable form
2041
 */
2042
static void
2043
dump(regex_t *re,
2044
   FILE *f)
2045
{
2046
  struct guts *g;
2047
  int     i;
2048
2049
  if (re->re_magic != REMAGIC)
2050
    fprintf(f, "bad magic number (0x%x not 0x%x)\n", re->re_magic,
2051
        REMAGIC);
2052
  if (re->re_guts == NULL)
2053
  {
2054
    fprintf(f, "NULL guts!!!\n");
2055
    return;
2056
  }
2057
  g = (struct guts *) re->re_guts;
2058
  if (g->magic != GUTSMAGIC)
2059
    fprintf(f, "bad guts magic number (0x%x not 0x%x)\n", g->magic,
2060
        GUTSMAGIC);
2061
2062
  fprintf(f, "\n\n\n========= DUMP ==========\n");
2063
  fprintf(f, "nsub %d, info 0%lo, csize %d, ntree %d\n",
2064
      (int) re->re_nsub, re->re_info, re->re_csize, g->ntree);
2065
2066
  dumpcolors(&g->cmap, f);
2067
  if (!NULLCNFA(g->search))
2068
  {
2069
    fprintf(f, "\nsearch:\n");
2070
    dumpcnfa(&g->search, f);
2071
  }
2072
  for (i = 1; i < g->nlacons; i++)
2073
  {
2074
    struct subre *lasub = &g->lacons[i];
2075
    const char *latype;
2076
2077
    switch (lasub->subno)
2078
    {
2079
      case LATYPE_AHEAD_POS:
2080
        latype = "positive lookahead";
2081
        break;
2082
      case LATYPE_AHEAD_NEG:
2083
        latype = "negative lookahead";
2084
        break;
2085
      case LATYPE_BEHIND_POS:
2086
        latype = "positive lookbehind";
2087
        break;
2088
      case LATYPE_BEHIND_NEG:
2089
        latype = "negative lookbehind";
2090
        break;
2091
      default:
2092
        latype = "???";
2093
        break;
2094
    }
2095
    fprintf(f, "\nla%d (%s):\n", i, latype);
2096
    dumpcnfa(&lasub->cnfa, f);
2097
  }
2098
  fprintf(f, "\n");
2099
  dumpst(g->tree, f, 0);
2100
}
2101
2102
/*
2103
 * dumpst - dump a subRE tree
2104
 */
2105
static void
2106
dumpst(struct subre *t,
2107
     FILE *f,
2108
     int nfapresent)      /* is the original NFA still around? */
2109
{
2110
  if (t == NULL)
2111
    fprintf(f, "null tree\n");
2112
  else
2113
    stdump(t, f, nfapresent);
2114
  fflush(f);
2115
}
2116
2117
/*
2118
 * stdump - recursive guts of dumpst
2119
 */
2120
static void
2121
stdump(struct subre *t,
2122
     FILE *f,
2123
     int nfapresent)      /* is the original NFA still around? */
2124
{
2125
  char    idbuf[50];
2126
2127
  fprintf(f, "%s. `%c'", stid(t, idbuf, sizeof(idbuf)), t->op);
2128
  if (t->flags & LONGER)
2129
    fprintf(f, " longest");
2130
  if (t->flags & SHORTER)
2131
    fprintf(f, " shortest");
2132
  if (t->flags & MIXED)
2133
    fprintf(f, " hasmixed");
2134
  if (t->flags & CAP)
2135
    fprintf(f, " hascapture");
2136
  if (t->flags & BACKR)
2137
    fprintf(f, " hasbackref");
2138
  if (!(t->flags & INUSE))
2139
    fprintf(f, " UNUSED");
2140
  if (t->subno != 0)
2141
    fprintf(f, " (#%d)", t->subno);
2142
  if (t->min != 1 || t->max != 1)
2143
  {
2144
    fprintf(f, " {%d,", t->min);
2145
    if (t->max != DUPINF)
2146
      fprintf(f, "%d", t->max);
2147
    fprintf(f, "}");
2148
  }
2149
  if (nfapresent)
2150
    fprintf(f, " %ld-%ld", (long) t->begin->no, (long) t->end->no);
2151
  if (t->left != NULL)
2152
    fprintf(f, " L:%s", stid(t->left, idbuf, sizeof(idbuf)));
2153
  if (t->right != NULL)
2154
    fprintf(f, " R:%s", stid(t->right, idbuf, sizeof(idbuf)));
2155
  if (!NULLCNFA(t->cnfa))
2156
  {
2157
    fprintf(f, "\n");
2158
    dumpcnfa(&t->cnfa, f);
2159
  }
2160
  fprintf(f, "\n");
2161
  if (t->left != NULL)
2162
    stdump(t->left, f, nfapresent);
2163
  if (t->right != NULL)
2164
    stdump(t->right, f, nfapresent);
2165
}
2166
2167
/*
2168
 * stid - identify a subtree node for dumping
2169
 */
2170
static const char *       /* points to buf or constant string */
2171
stid(struct subre *t,
2172
   char *buf,
2173
   size_t bufsize)
2174
{
2175
  /* big enough for hex int or decimal t->id? */
2176
  if (bufsize < sizeof(void *) * 2 + 3 || bufsize < sizeof(t->id) * 3 + 1)
2177
    return "unable";
2178
  if (t->id != 0)
2179
    sprintf(buf, "%d", t->id);
2180
  else
2181
    sprintf(buf, "%p", t);
2182
  return buf;
2183
}
2184
#endif              /* REG_DEBUG */
2185
2186
2187
#include "regc_lex.c"
2188
#include "regc_color.c"
2189
#include "regc_nfa.c"
2190
#include "regc_cvec.c"
2191
#include "regc_pg_locale.c"
2192
#include "regc_locale.c"