VirtualBox

source: kBuild/vendor/sed/current/lib/regcomp.c

Last change on this file was 599, checked in by bird, 18 years ago

GNU sed 4.1.5.

File size: 107.5 KB
Line 
1/* Extended regular expression matching and search library.
2 Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3 This file is part of the GNU C Library.
4 Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6 The GNU C Library is free software; you can redistribute it and/or
7 modify it under the terms of the GNU Lesser General Public
8 License as published by the Free Software Foundation; either
9 version 2.1 of the License, or (at your option) any later version.
10
11 The GNU C Library is distributed in the hope that it will be useful,
12 but WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public
17 License along with the GNU C Library; if not, write to the Free
18 Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19 02111-1307 USA. */
20
21static reg_errcode_t re_compile_internal (regex_t *preg, const char * pattern,
22 size_t length, reg_syntax_t syntax);
23static void re_compile_fastmap_iter (regex_t *bufp,
24 const re_dfastate_t *init_state,
25 char *fastmap);
26static reg_errcode_t init_dfa (re_dfa_t *dfa, size_t pat_len);
27#ifdef RE_ENABLE_I18N
28static void free_charset (re_charset_t *cset);
29#endif /* RE_ENABLE_I18N */
30static void free_workarea_compile (regex_t *preg);
31static reg_errcode_t create_initial_state (re_dfa_t *dfa);
32#ifdef RE_ENABLE_I18N
33static void optimize_utf8 (re_dfa_t *dfa);
34#endif
35static reg_errcode_t analyze (regex_t *preg);
36static reg_errcode_t preorder (bin_tree_t *root,
37 reg_errcode_t (fn (void *, bin_tree_t *)),
38 void *extra);
39static reg_errcode_t postorder (bin_tree_t *root,
40 reg_errcode_t (fn (void *, bin_tree_t *)),
41 void *extra);
42static reg_errcode_t optimize_subexps (void *extra, bin_tree_t *node);
43static reg_errcode_t lower_subexps (void *extra, bin_tree_t *node);
44static bin_tree_t *lower_subexp (reg_errcode_t *err, regex_t *preg,
45 bin_tree_t *node);
46static reg_errcode_t calc_first (void *extra, bin_tree_t *node);
47static reg_errcode_t calc_next (void *extra, bin_tree_t *node);
48static reg_errcode_t link_nfa_nodes (void *extra, bin_tree_t *node);
49static int duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint);
50static int search_duplicated_node (const re_dfa_t *dfa, int org_node,
51 unsigned int constraint);
52static reg_errcode_t calc_eclosure (re_dfa_t *dfa);
53static reg_errcode_t calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa,
54 int node, int root);
55static reg_errcode_t calc_inveclosure (re_dfa_t *dfa);
56static int fetch_number (re_string_t *input, re_token_t *token,
57 reg_syntax_t syntax);
58static int peek_token (re_token_t *token, re_string_t *input,
59 reg_syntax_t syntax) internal_function;
60static bin_tree_t *parse (re_string_t *regexp, regex_t *preg,
61 reg_syntax_t syntax, reg_errcode_t *err);
62static bin_tree_t *parse_reg_exp (re_string_t *regexp, regex_t *preg,
63 re_token_t *token, reg_syntax_t syntax,
64 int nest, reg_errcode_t *err);
65static bin_tree_t *parse_branch (re_string_t *regexp, regex_t *preg,
66 re_token_t *token, reg_syntax_t syntax,
67 int nest, reg_errcode_t *err);
68static bin_tree_t *parse_expression (re_string_t *regexp, regex_t *preg,
69 re_token_t *token, reg_syntax_t syntax,
70 int nest, reg_errcode_t *err);
71static bin_tree_t *parse_sub_exp (re_string_t *regexp, regex_t *preg,
72 re_token_t *token, reg_syntax_t syntax,
73 int nest, reg_errcode_t *err);
74static bin_tree_t *parse_dup_op (bin_tree_t *dup_elem, re_string_t *regexp,
75 re_dfa_t *dfa, re_token_t *token,
76 reg_syntax_t syntax, reg_errcode_t *err);
77static bin_tree_t *parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa,
78 re_token_t *token, reg_syntax_t syntax,
79 reg_errcode_t *err);
80static reg_errcode_t parse_bracket_element (bracket_elem_t *elem,
81 re_string_t *regexp,
82 re_token_t *token, int token_len,
83 re_dfa_t *dfa,
84 reg_syntax_t syntax,
85 int accept_hyphen);
86static reg_errcode_t parse_bracket_symbol (bracket_elem_t *elem,
87 re_string_t *regexp,
88 re_token_t *token);
89#ifdef RE_ENABLE_I18N
90static reg_errcode_t build_equiv_class (bitset_t sbcset,
91 re_charset_t *mbcset,
92 int *equiv_class_alloc,
93 const unsigned char *name);
94static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
95 bitset_t sbcset,
96 re_charset_t *mbcset,
97 int *char_class_alloc,
98 const unsigned char *class_name,
99 reg_syntax_t syntax);
100#else /* not RE_ENABLE_I18N */
101static reg_errcode_t build_equiv_class (bitset_t sbcset,
102 const unsigned char *name);
103static reg_errcode_t build_charclass (RE_TRANSLATE_TYPE trans,
104 bitset_t sbcset,
105 const unsigned char *class_name,
106 reg_syntax_t syntax);
107#endif /* not RE_ENABLE_I18N */
108static bin_tree_t *build_charclass_op (re_dfa_t *dfa,
109 RE_TRANSLATE_TYPE trans,
110 const unsigned char *class_name,
111 const unsigned char *extra,
112 int non_match, reg_errcode_t *err);
113static bin_tree_t *create_tree (re_dfa_t *dfa,
114 bin_tree_t *left, bin_tree_t *right,
115 re_token_type_t type);
116static bin_tree_t *create_token_tree (re_dfa_t *dfa,
117 bin_tree_t *left, bin_tree_t *right,
118 const re_token_t *token);
119static bin_tree_t *duplicate_tree (const bin_tree_t *src, re_dfa_t *dfa);
120static void free_token (re_token_t *node);
121static reg_errcode_t free_tree (void *extra, bin_tree_t *node);
122static reg_errcode_t mark_opt_subexp (void *extra, bin_tree_t *node);
123
124
125/* This table gives an error message for each of the error codes listed
126 in regex.h. Obviously the order here has to be same as there.
127 POSIX doesn't require that we do anything for REG_NOERROR,
128 but why not be nice? */
129
130const char __re_error_msgid[] attribute_hidden =
131 {
132#define REG_NOERROR_IDX 0
133 gettext_noop ("Success") /* REG_NOERROR */
134 "\0"
135#define REG_NOMATCH_IDX (REG_NOERROR_IDX + sizeof "Success")
136 gettext_noop ("No match") /* REG_NOMATCH */
137 "\0"
138#define REG_BADPAT_IDX (REG_NOMATCH_IDX + sizeof "No match")
139 gettext_noop ("Invalid regular expression") /* REG_BADPAT */
140 "\0"
141#define REG_ECOLLATE_IDX (REG_BADPAT_IDX + sizeof "Invalid regular expression")
142 gettext_noop ("Invalid collation character") /* REG_ECOLLATE */
143 "\0"
144#define REG_ECTYPE_IDX (REG_ECOLLATE_IDX + sizeof "Invalid collation character")
145 gettext_noop ("Invalid character class name") /* REG_ECTYPE */
146 "\0"
147#define REG_EESCAPE_IDX (REG_ECTYPE_IDX + sizeof "Invalid character class name")
148 gettext_noop ("Trailing backslash") /* REG_EESCAPE */
149 "\0"
150#define REG_ESUBREG_IDX (REG_EESCAPE_IDX + sizeof "Trailing backslash")
151 gettext_noop ("Invalid back reference") /* REG_ESUBREG */
152 "\0"
153#define REG_EBRACK_IDX (REG_ESUBREG_IDX + sizeof "Invalid back reference")
154 gettext_noop ("Unmatched [ or [^") /* REG_EBRACK */
155 "\0"
156#define REG_EPAREN_IDX (REG_EBRACK_IDX + sizeof "Unmatched [ or [^")
157 gettext_noop ("Unmatched ( or \\(") /* REG_EPAREN */
158 "\0"
159#define REG_EBRACE_IDX (REG_EPAREN_IDX + sizeof "Unmatched ( or \\(")
160 gettext_noop ("Unmatched \\{") /* REG_EBRACE */
161 "\0"
162#define REG_BADBR_IDX (REG_EBRACE_IDX + sizeof "Unmatched \\{")
163 gettext_noop ("Invalid content of \\{\\}") /* REG_BADBR */
164 "\0"
165#define REG_ERANGE_IDX (REG_BADBR_IDX + sizeof "Invalid content of \\{\\}")
166 gettext_noop ("Invalid range end") /* REG_ERANGE */
167 "\0"
168#define REG_ESPACE_IDX (REG_ERANGE_IDX + sizeof "Invalid range end")
169 gettext_noop ("Memory exhausted") /* REG_ESPACE */
170 "\0"
171#define REG_BADRPT_IDX (REG_ESPACE_IDX + sizeof "Memory exhausted")
172 gettext_noop ("Invalid preceding regular expression") /* REG_BADRPT */
173 "\0"
174#define REG_EEND_IDX (REG_BADRPT_IDX + sizeof "Invalid preceding regular expression")
175 gettext_noop ("Premature end of regular expression") /* REG_EEND */
176 "\0"
177#define REG_ESIZE_IDX (REG_EEND_IDX + sizeof "Premature end of regular expression")
178 gettext_noop ("Regular expression too big") /* REG_ESIZE */
179 "\0"
180#define REG_ERPAREN_IDX (REG_ESIZE_IDX + sizeof "Regular expression too big")
181 gettext_noop ("Unmatched ) or \\)") /* REG_ERPAREN */
182 };
183
184const size_t __re_error_msgid_idx[] attribute_hidden =
185 {
186 REG_NOERROR_IDX,
187 REG_NOMATCH_IDX,
188 REG_BADPAT_IDX,
189 REG_ECOLLATE_IDX,
190 REG_ECTYPE_IDX,
191 REG_EESCAPE_IDX,
192 REG_ESUBREG_IDX,
193 REG_EBRACK_IDX,
194 REG_EPAREN_IDX,
195 REG_EBRACE_IDX,
196 REG_BADBR_IDX,
197 REG_ERANGE_IDX,
198 REG_ESPACE_IDX,
199 REG_BADRPT_IDX,
200 REG_EEND_IDX,
201 REG_ESIZE_IDX,
202 REG_ERPAREN_IDX
203 };
204
205
206/* Entry points for GNU code. */
207
208/* re_compile_pattern is the GNU regular expression compiler: it
209 compiles PATTERN (of length LENGTH) and puts the result in BUFP.
210 Returns 0 if the pattern was valid, otherwise an error string.
211
212 Assumes the `allocated' (and perhaps `buffer') and `translate' fields
213 are set in BUFP on entry. */
214
215const char *
216re_compile_pattern (pattern, length, bufp)
217 const char *pattern;
218 size_t length;
219 struct re_pattern_buffer *bufp;
220{
221 reg_errcode_t ret;
222
223 /* And GNU code determines whether or not to get register information
224 by passing null for the REGS argument to re_match, etc., not by
225 setting no_sub, unless RE_NO_SUB is set. */
226 bufp->no_sub = !!(re_syntax_options & RE_NO_SUB);
227
228 /* Match anchors at newline. */
229 bufp->newline_anchor = 1;
230
231 ret = re_compile_internal (bufp, pattern, length, re_syntax_options);
232
233 if (!ret)
234 return NULL;
235 return gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
236}
237#ifdef _LIBC
238weak_alias (__re_compile_pattern, re_compile_pattern)
239#endif
240
241/* Set by `re_set_syntax' to the current regexp syntax to recognize. Can
242 also be assigned to arbitrarily: each pattern buffer stores its own
243 syntax, so it can be changed between regex compilations. */
244/* This has no initializer because initialized variables in Emacs
245 become read-only after dumping. */
246reg_syntax_t re_syntax_options;
247
248
249/* Specify the precise syntax of regexps for compilation. This provides
250 for compatibility for various utilities which historically have
251 different, incompatible syntaxes.
252
253 The argument SYNTAX is a bit mask comprised of the various bits
254 defined in regex.h. We return the old syntax. */
255
256reg_syntax_t
257re_set_syntax (syntax)
258 reg_syntax_t syntax;
259{
260 reg_syntax_t ret = re_syntax_options;
261
262 re_syntax_options = syntax;
263 return ret;
264}
265#ifdef _LIBC
266weak_alias (__re_set_syntax, re_set_syntax)
267#endif
268
269int
270re_compile_fastmap (bufp)
271 struct re_pattern_buffer *bufp;
272{
273 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
274 char *fastmap = bufp->fastmap;
275
276 memset (fastmap, '\0', sizeof (char) * SBC_MAX);
277 re_compile_fastmap_iter (bufp, dfa->init_state, fastmap);
278 if (dfa->init_state != dfa->init_state_word)
279 re_compile_fastmap_iter (bufp, dfa->init_state_word, fastmap);
280 if (dfa->init_state != dfa->init_state_nl)
281 re_compile_fastmap_iter (bufp, dfa->init_state_nl, fastmap);
282 if (dfa->init_state != dfa->init_state_begbuf)
283 re_compile_fastmap_iter (bufp, dfa->init_state_begbuf, fastmap);
284 bufp->fastmap_accurate = 1;
285 return 0;
286}
287#ifdef _LIBC
288weak_alias (__re_compile_fastmap, re_compile_fastmap)
289#endif
290
291static inline void
292__attribute ((always_inline))
293re_set_fastmap (char *fastmap, int icase, int ch)
294{
295 fastmap[ch] = 1;
296 if (icase)
297 fastmap[tolower (ch)] = 1;
298}
299
300/* Helper function for re_compile_fastmap.
301 Compile fastmap for the initial_state INIT_STATE. */
302
303static void
304re_compile_fastmap_iter (regex_t *bufp, const re_dfastate_t *init_state,
305 char *fastmap)
306{
307 re_dfa_t *dfa = (re_dfa_t *) bufp->buffer;
308 int node_cnt;
309 int icase = (dfa->mb_cur_max == 1 && (bufp->syntax & RE_ICASE));
310 for (node_cnt = 0; node_cnt < init_state->nodes.nelem; ++node_cnt)
311 {
312 int node = init_state->nodes.elems[node_cnt];
313 re_token_type_t type = dfa->nodes[node].type;
314
315 if (type == CHARACTER)
316 {
317 re_set_fastmap (fastmap, icase, dfa->nodes[node].opr.c);
318#ifdef RE_ENABLE_I18N
319 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
320 {
321 unsigned char *buf = alloca (dfa->mb_cur_max), *p;
322 wchar_t wc;
323 mbstate_t state;
324
325 p = buf;
326 *p++ = dfa->nodes[node].opr.c;
327 while (++node < dfa->nodes_len
328 && dfa->nodes[node].type == CHARACTER
329 && dfa->nodes[node].mb_partial)
330 *p++ = dfa->nodes[node].opr.c;
331 memset (&state, '\0', sizeof (state));
332 if (mbrtowc (&wc, (const char *) buf, p - buf,
333 &state) == p - buf
334 && (__wcrtomb ((char *) buf, towlower (wc), &state)
335 != (size_t) -1))
336 re_set_fastmap (fastmap, 0, buf[0]);
337 }
338#endif
339 }
340 else if (type == SIMPLE_BRACKET)
341 {
342 int i, ch;
343 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
344 {
345 int j;
346 bitset_word_t w = dfa->nodes[node].opr.sbcset[i];
347 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
348 if (w & ((bitset_word_t) 1 << j))
349 re_set_fastmap (fastmap, icase, ch);
350 }
351 }
352#ifdef RE_ENABLE_I18N
353 else if (type == COMPLEX_BRACKET)
354 {
355 int i;
356 re_charset_t *cset = dfa->nodes[node].opr.mbcset;
357 if (cset->non_match || cset->ncoll_syms || cset->nequiv_classes
358 || cset->nranges || cset->nchar_classes)
359 {
360# ifdef _LIBC
361 if (_NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES) != 0)
362 {
363 /* In this case we want to catch the bytes which are
364 the first byte of any collation elements.
365 e.g. In da_DK, we want to catch 'a' since "aa"
366 is a valid collation element, and don't catch
367 'b' since 'b' is the only collation element
368 which starts from 'b'. */
369 const int32_t *table = (const int32_t *)
370 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
371 for (i = 0; i < SBC_MAX; ++i)
372 if (table[i] < 0)
373 re_set_fastmap (fastmap, icase, i);
374 }
375# else
376 if (dfa->mb_cur_max > 1)
377 for (i = 0; i < SBC_MAX; ++i)
378 if (__btowc (i) == WEOF)
379 re_set_fastmap (fastmap, icase, i);
380# endif /* not _LIBC */
381 }
382 for (i = 0; i < cset->nmbchars; ++i)
383 {
384 char buf[256];
385 mbstate_t state;
386 memset (&state, '\0', sizeof (state));
387 if (__wcrtomb (buf, cset->mbchars[i], &state) != (size_t) -1)
388 re_set_fastmap (fastmap, icase, *(unsigned char *) buf);
389 if ((bufp->syntax & RE_ICASE) && dfa->mb_cur_max > 1)
390 {
391 if (__wcrtomb (buf, towlower (cset->mbchars[i]), &state)
392 != (size_t) -1)
393 re_set_fastmap (fastmap, 0, *(unsigned char *) buf);
394 }
395 }
396 }
397#endif /* RE_ENABLE_I18N */
398 else if (type == OP_PERIOD
399#ifdef RE_ENABLE_I18N
400 || type == OP_UTF8_PERIOD
401#endif /* RE_ENABLE_I18N */
402 || type == END_OF_RE)
403 {
404 memset (fastmap, '\1', sizeof (char) * SBC_MAX);
405 if (type == END_OF_RE)
406 bufp->can_be_null = 1;
407 return;
408 }
409 }
410}
411
412
413/* Entry point for POSIX code. */
414/* regcomp takes a regular expression as a string and compiles it.
415
416 PREG is a regex_t *. We do not expect any fields to be initialized,
417 since POSIX says we shouldn't. Thus, we set
418
419 `buffer' to the compiled pattern;
420 `used' to the length of the compiled pattern;
421 `syntax' to RE_SYNTAX_POSIX_EXTENDED if the
422 REG_EXTENDED bit in CFLAGS is set; otherwise, to
423 RE_SYNTAX_POSIX_BASIC;
424 `newline_anchor' to REG_NEWLINE being set in CFLAGS;
425 `fastmap' to an allocated space for the fastmap;
426 `fastmap_accurate' to zero;
427 `re_nsub' to the number of subexpressions in PATTERN.
428
429 PATTERN is the address of the pattern string.
430
431 CFLAGS is a series of bits which affect compilation.
432
433 If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
434 use POSIX basic syntax.
435
436 If REG_NEWLINE is set, then . and [^...] don't match newline.
437 Also, regexec will try a match beginning after every newline.
438
439 If REG_ICASE is set, then we considers upper- and lowercase
440 versions of letters to be equivalent when matching.
441
442 If REG_NOSUB is set, then when PREG is passed to regexec, that
443 routine will report only success or failure, and nothing about the
444 registers.
445
446 It returns 0 if it succeeds, nonzero if it doesn't. (See regex.h for
447 the return codes and their meanings.) */
448
449int
450regcomp (preg, pattern, cflags)
451 regex_t *__restrict preg;
452 const char *__restrict pattern;
453 int cflags;
454{
455 reg_errcode_t ret;
456 reg_syntax_t syntax = ((cflags & REG_EXTENDED) ? RE_SYNTAX_POSIX_EXTENDED
457 : RE_SYNTAX_POSIX_BASIC);
458
459 preg->buffer = NULL;
460 preg->allocated = 0;
461 preg->used = 0;
462
463 /* Try to allocate space for the fastmap. */
464 preg->fastmap = re_malloc (char, SBC_MAX);
465 if (BE (preg->fastmap == NULL, 0))
466 return REG_ESPACE;
467
468 syntax |= (cflags & REG_ICASE) ? RE_ICASE : 0;
469
470 /* If REG_NEWLINE is set, newlines are treated differently. */
471 if (cflags & REG_NEWLINE)
472 { /* REG_NEWLINE implies neither . nor [^...] match newline. */
473 syntax &= ~RE_DOT_NEWLINE;
474 syntax |= RE_HAT_LISTS_NOT_NEWLINE;
475 /* It also changes the matching behavior. */
476 preg->newline_anchor = 1;
477 }
478 else
479 preg->newline_anchor = 0;
480 preg->no_sub = !!(cflags & REG_NOSUB);
481 preg->translate = NULL;
482
483 ret = re_compile_internal (preg, pattern, strlen (pattern), syntax);
484
485 /* POSIX doesn't distinguish between an unmatched open-group and an
486 unmatched close-group: both are REG_EPAREN. */
487 if (ret == REG_ERPAREN)
488 ret = REG_EPAREN;
489
490 /* We have already checked preg->fastmap != NULL. */
491 if (BE (ret == REG_NOERROR, 1))
492 /* Compute the fastmap now, since regexec cannot modify the pattern
493 buffer. This function never fails in this implementation. */
494 (void) re_compile_fastmap (preg);
495 else
496 {
497 /* Some error occurred while compiling the expression. */
498 re_free (preg->fastmap);
499 preg->fastmap = NULL;
500 }
501
502 return (int) ret;
503}
504#ifdef _LIBC
505weak_alias (__regcomp, regcomp)
506#endif
507
508/* Returns a message corresponding to an error code, ERRCODE, returned
509 from either regcomp or regexec. We don't use PREG here. */
510
511size_t
512regerror (errcode, preg, errbuf, errbuf_size)
513 int errcode;
514 const regex_t *__restrict preg;
515 char *__restrict errbuf;
516 size_t errbuf_size;
517{
518 const char *msg;
519 size_t msg_size;
520
521 if (BE (errcode < 0
522 || errcode >= (int) (sizeof (__re_error_msgid_idx)
523 / sizeof (__re_error_msgid_idx[0])), 0))
524 /* Only error codes returned by the rest of the code should be passed
525 to this routine. If we are given anything else, or if other regex
526 code generates an invalid error code, then the program has a bug.
527 Dump core so we can fix it. */
528 abort ();
529
530 msg = gettext (__re_error_msgid + __re_error_msgid_idx[errcode]);
531
532 msg_size = strlen (msg) + 1; /* Includes the null. */
533
534 if (BE (errbuf_size != 0, 1))
535 {
536 if (BE (msg_size > errbuf_size, 0))
537 {
538#if defined HAVE_MEMPCPY || defined _LIBC
539 *((char *) __mempcpy (errbuf, msg, errbuf_size - 1)) = '\0';
540#else
541 memcpy (errbuf, msg, errbuf_size - 1);
542 errbuf[errbuf_size - 1] = 0;
543#endif
544 }
545 else
546 memcpy (errbuf, msg, msg_size);
547 }
548
549 return msg_size;
550}
551#ifdef _LIBC
552weak_alias (__regerror, regerror)
553#endif
554
555
556#ifdef RE_ENABLE_I18N
557/* This static array is used for the map to single-byte characters when
558 UTF-8 is used. Otherwise we would allocate memory just to initialize
559 it the same all the time. UTF-8 is the preferred encoding so this is
560 a worthwhile optimization. */
561static const bitset_t utf8_sb_map =
562{
563 /* Set the first 128 bits. */
564 [0 ... 0x80 / BITSET_WORD_BITS - 1] = BITSET_WORD_MAX
565};
566#endif
567
568
569static void
570free_dfa_content (re_dfa_t *dfa)
571{
572 int i, j;
573
574 if (dfa->nodes)
575 for (i = 0; i < dfa->nodes_len; ++i)
576 free_token (dfa->nodes + i);
577 re_free (dfa->nexts);
578 for (i = 0; i < dfa->nodes_len; ++i)
579 {
580 if (dfa->eclosures != NULL)
581 re_node_set_free (dfa->eclosures + i);
582 if (dfa->inveclosures != NULL)
583 re_node_set_free (dfa->inveclosures + i);
584 if (dfa->edests != NULL)
585 re_node_set_free (dfa->edests + i);
586 }
587 re_free (dfa->edests);
588 re_free (dfa->eclosures);
589 re_free (dfa->inveclosures);
590 re_free (dfa->nodes);
591
592 if (dfa->state_table)
593 for (i = 0; i <= dfa->state_hash_mask; ++i)
594 {
595 struct re_state_table_entry *entry = dfa->state_table + i;
596 for (j = 0; j < entry->num; ++j)
597 {
598 re_dfastate_t *state = entry->array[j];
599 free_state (state);
600 }
601 re_free (entry->array);
602 }
603 re_free (dfa->state_table);
604#ifdef RE_ENABLE_I18N
605 if (dfa->sb_char != utf8_sb_map)
606 re_free (dfa->sb_char);
607#endif
608 re_free (dfa->subexp_map);
609#ifdef DEBUG
610 re_free (dfa->re_str);
611#endif
612
613 re_free (dfa);
614}
615
616
617/* Free dynamically allocated space used by PREG. */
618
619void
620regfree (preg)
621 regex_t *preg;
622{
623 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
624 if (BE (dfa != NULL, 1))
625 free_dfa_content (dfa);
626 preg->buffer = NULL;
627 preg->allocated = 0;
628
629 re_free (preg->fastmap);
630 preg->fastmap = NULL;
631
632 re_free (preg->translate);
633 preg->translate = NULL;
634}
635#ifdef _LIBC
636weak_alias (__regfree, regfree)
637#endif
638
639
640/* Entry points compatible with 4.2 BSD regex library. We don't define
641 them unless specifically requested. */
642
643#if defined _REGEX_RE_COMP || defined _LIBC
644
645/* BSD has one and only one pattern buffer. */
646static struct re_pattern_buffer re_comp_buf;
647
648char *
649# ifdef _LIBC
650/* Make these definitions weak in libc, so POSIX programs can redefine
651 these names if they don't use our functions, and still use
652 regcomp/regexec above without link errors. */
653weak_function
654# endif
655re_comp (s)
656 const char *s;
657{
658 reg_errcode_t ret;
659 char *fastmap;
660
661 if (!s)
662 {
663 if (!re_comp_buf.buffer)
664 return gettext ("No previous regular expression");
665 return 0;
666 }
667
668 if (re_comp_buf.buffer)
669 {
670 fastmap = re_comp_buf.fastmap;
671 re_comp_buf.fastmap = NULL;
672 __regfree (&re_comp_buf);
673 memset (&re_comp_buf, '\0', sizeof (re_comp_buf));
674 re_comp_buf.fastmap = fastmap;
675 }
676
677 if (re_comp_buf.fastmap == NULL)
678 {
679 re_comp_buf.fastmap = (char *) malloc (SBC_MAX);
680 if (re_comp_buf.fastmap == NULL)
681 return (char *) gettext (__re_error_msgid
682 + __re_error_msgid_idx[(int) REG_ESPACE]);
683 }
684
685 /* Since `re_exec' always passes NULL for the `regs' argument, we
686 don't need to initialize the pattern buffer fields which affect it. */
687
688 /* Match anchors at newlines. */
689 re_comp_buf.newline_anchor = 1;
690
691 ret = re_compile_internal (&re_comp_buf, s, strlen (s), re_syntax_options);
692
693 if (!ret)
694 return NULL;
695
696 /* Yes, we're discarding `const' here if !HAVE_LIBINTL. */
697 return (char *) gettext (__re_error_msgid + __re_error_msgid_idx[(int) ret]);
698}
699
700#ifdef _LIBC
701libc_freeres_fn (free_mem)
702{
703 __regfree (&re_comp_buf);
704}
705#endif
706
707#endif /* _REGEX_RE_COMP */
708
709
710/* Internal entry point.
711 Compile the regular expression PATTERN, whose length is LENGTH.
712 SYNTAX indicate regular expression's syntax. */
713
714static reg_errcode_t
715re_compile_internal (regex_t *preg, const char * pattern, size_t length,
716 reg_syntax_t syntax)
717{
718 reg_errcode_t err = REG_NOERROR;
719 re_dfa_t *dfa;
720 re_string_t regexp;
721
722 /* Initialize the pattern buffer. */
723 preg->fastmap_accurate = 0;
724 preg->syntax = syntax;
725 preg->not_bol = preg->not_eol = 0;
726 preg->used = 0;
727 preg->re_nsub = 0;
728 preg->can_be_null = 0;
729 preg->regs_allocated = REGS_UNALLOCATED;
730
731 /* Initialize the dfa. */
732 dfa = (re_dfa_t *) preg->buffer;
733 if (BE (preg->allocated < sizeof (re_dfa_t), 0))
734 {
735 /* If zero allocated, but buffer is non-null, try to realloc
736 enough space. This loses if buffer's address is bogus, but
737 that is the user's responsibility. If ->buffer is NULL this
738 is a simple allocation. */
739 dfa = re_realloc (preg->buffer, re_dfa_t, 1);
740 if (dfa == NULL)
741 return REG_ESPACE;
742 preg->allocated = sizeof (re_dfa_t);
743 preg->buffer = (unsigned char *) dfa;
744 }
745 preg->used = sizeof (re_dfa_t);
746
747 err = init_dfa (dfa, length);
748 if (BE (err != REG_NOERROR, 0))
749 {
750 free_dfa_content (dfa);
751 preg->buffer = NULL;
752 preg->allocated = 0;
753 return err;
754 }
755#ifdef DEBUG
756 /* Note: length+1 will not overflow since it is checked in init_dfa. */
757 dfa->re_str = re_malloc (char, length + 1);
758 strncpy (dfa->re_str, pattern, length + 1);
759#endif
760
761 __libc_lock_init (dfa->lock);
762
763 err = re_string_construct (&regexp, pattern, length, preg->translate,
764 syntax & RE_ICASE, dfa);
765 if (BE (err != REG_NOERROR, 0))
766 {
767 re_compile_internal_free_return:
768 free_workarea_compile (preg);
769 re_string_destruct (&regexp);
770 free_dfa_content (dfa);
771 preg->buffer = NULL;
772 preg->allocated = 0;
773 return err;
774 }
775
776 /* Parse the regular expression, and build a structure tree. */
777 preg->re_nsub = 0;
778 dfa->str_tree = parse (&regexp, preg, syntax, &err);
779 if (BE (dfa->str_tree == NULL, 0))
780 goto re_compile_internal_free_return;
781
782 /* Analyze the tree and create the nfa. */
783 err = analyze (preg);
784 if (BE (err != REG_NOERROR, 0))
785 goto re_compile_internal_free_return;
786
787#ifdef RE_ENABLE_I18N
788 /* If possible, do searching in single byte encoding to speed things up. */
789 if (dfa->is_utf8 && !(syntax & RE_ICASE) && preg->translate == NULL)
790 optimize_utf8 (dfa);
791#endif
792
793 /* Then create the initial state of the dfa. */
794 err = create_initial_state (dfa);
795
796 /* Release work areas. */
797 free_workarea_compile (preg);
798 re_string_destruct (&regexp);
799
800 if (BE (err != REG_NOERROR, 0))
801 {
802 free_dfa_content (dfa);
803 preg->buffer = NULL;
804 preg->allocated = 0;
805 }
806
807 return err;
808}
809
810/* Initialize DFA. We use the length of the regular expression PAT_LEN
811 as the initial length of some arrays. */
812
813static reg_errcode_t
814init_dfa (re_dfa_t *dfa, size_t pat_len)
815{
816 unsigned int table_size;
817#ifndef _LIBC
818 char *codeset_name;
819#endif
820
821 memset (dfa, '\0', sizeof (re_dfa_t));
822
823 /* Force allocation of str_tree_storage the first time. */
824 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
825
826 /* Avoid overflows. */
827 if (pat_len == SIZE_MAX)
828 return REG_ESPACE;
829
830 dfa->nodes_alloc = pat_len + 1;
831 dfa->nodes = re_malloc (re_token_t, dfa->nodes_alloc);
832
833 /* table_size = 2 ^ ceil(log pat_len) */
834 for (table_size = 1; ; table_size <<= 1)
835 if (table_size > pat_len)
836 break;
837
838 dfa->state_table = calloc (sizeof (struct re_state_table_entry), table_size);
839 dfa->state_hash_mask = table_size - 1;
840
841 dfa->mb_cur_max = MB_CUR_MAX;
842#ifdef _LIBC
843 if (dfa->mb_cur_max == 6
844 && strcmp (_NL_CURRENT (LC_CTYPE, _NL_CTYPE_CODESET_NAME), "UTF-8") == 0)
845 dfa->is_utf8 = 1;
846 dfa->map_notascii = (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_MAP_TO_NONASCII)
847 != 0);
848#else
849# ifdef HAVE_LANGINFO_CODESET
850 codeset_name = nl_langinfo (CODESET);
851# else
852 codeset_name = getenv ("LC_ALL");
853 if (codeset_name == NULL || codeset_name[0] == '\0')
854 codeset_name = getenv ("LC_CTYPE");
855 if (codeset_name == NULL || codeset_name[0] == '\0')
856 codeset_name = getenv ("LANG");
857 if (codeset_name == NULL)
858 codeset_name = "";
859 else if (strchr (codeset_name, '.') != NULL)
860 codeset_name = strchr (codeset_name, '.') + 1;
861# endif
862
863 if (strcasecmp (codeset_name, "UTF-8") == 0
864 || strcasecmp (codeset_name, "UTF8") == 0)
865 dfa->is_utf8 = 1;
866
867 /* We check exhaustively in the loop below if this charset is a
868 superset of ASCII. */
869 dfa->map_notascii = 0;
870#endif
871
872#ifdef RE_ENABLE_I18N
873 if (dfa->mb_cur_max > 1)
874 {
875 if (dfa->is_utf8)
876 dfa->sb_char = (re_bitset_ptr_t) utf8_sb_map;
877 else
878 {
879 int i, j, ch;
880
881 dfa->sb_char = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
882 if (BE (dfa->sb_char == NULL, 0))
883 return REG_ESPACE;
884
885 /* Set the bits corresponding to single byte chars. */
886 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
887 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
888 {
889 wint_t wch = __btowc (ch);
890 if (wch != WEOF)
891 dfa->sb_char[i] |= (bitset_word_t) 1 << j;
892# ifndef _LIBC
893 if (isascii (ch) && wch != ch)
894 dfa->map_notascii = 1;
895# endif
896 }
897 }
898 }
899#endif
900
901 if (BE (dfa->nodes == NULL || dfa->state_table == NULL, 0))
902 return REG_ESPACE;
903 return REG_NOERROR;
904}
905
906/* Initialize WORD_CHAR table, which indicate which character is
907 "word". In this case "word" means that it is the word construction
908 character used by some operators like "\<", "\>", etc. */
909
910static void
911internal_function
912init_word_char (re_dfa_t *dfa)
913{
914 int i, j, ch;
915 dfa->word_ops_used = 1;
916 for (i = 0, ch = 0; i < BITSET_WORDS; ++i)
917 for (j = 0; j < BITSET_WORD_BITS; ++j, ++ch)
918 if (isalnum (ch) || ch == '_')
919 dfa->word_char[i] |= (bitset_word_t) 1 << j;
920}
921
922/* Free the work area which are only used while compiling. */
923
924static void
925free_workarea_compile (regex_t *preg)
926{
927 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
928 bin_tree_storage_t *storage, *next;
929 for (storage = dfa->str_tree_storage; storage; storage = next)
930 {
931 next = storage->next;
932 re_free (storage);
933 }
934 dfa->str_tree_storage = NULL;
935 dfa->str_tree_storage_idx = BIN_TREE_STORAGE_SIZE;
936 dfa->str_tree = NULL;
937 re_free (dfa->org_indices);
938 dfa->org_indices = NULL;
939}
940
941/* Create initial states for all contexts. */
942
943static reg_errcode_t
944create_initial_state (re_dfa_t *dfa)
945{
946 int first, i;
947 reg_errcode_t err;
948 re_node_set init_nodes;
949
950 /* Initial states have the epsilon closure of the node which is
951 the first node of the regular expression. */
952 first = dfa->str_tree->first->node_idx;
953 dfa->init_node = first;
954 err = re_node_set_init_copy (&init_nodes, dfa->eclosures + first);
955 if (BE (err != REG_NOERROR, 0))
956 return err;
957
958 /* The back-references which are in initial states can epsilon transit,
959 since in this case all of the subexpressions can be null.
960 Then we add epsilon closures of the nodes which are the next nodes of
961 the back-references. */
962 if (dfa->nbackref > 0)
963 for (i = 0; i < init_nodes.nelem; ++i)
964 {
965 int node_idx = init_nodes.elems[i];
966 re_token_type_t type = dfa->nodes[node_idx].type;
967
968 int clexp_idx;
969 if (type != OP_BACK_REF)
970 continue;
971 for (clexp_idx = 0; clexp_idx < init_nodes.nelem; ++clexp_idx)
972 {
973 re_token_t *clexp_node;
974 clexp_node = dfa->nodes + init_nodes.elems[clexp_idx];
975 if (clexp_node->type == OP_CLOSE_SUBEXP
976 && clexp_node->opr.idx == dfa->nodes[node_idx].opr.idx)
977 break;
978 }
979 if (clexp_idx == init_nodes.nelem)
980 continue;
981
982 if (type == OP_BACK_REF)
983 {
984 int dest_idx = dfa->edests[node_idx].elems[0];
985 if (!re_node_set_contains (&init_nodes, dest_idx))
986 {
987 re_node_set_merge (&init_nodes, dfa->eclosures + dest_idx);
988 i = 0;
989 }
990 }
991 }
992
993 /* It must be the first time to invoke acquire_state. */
994 dfa->init_state = re_acquire_state_context (&err, dfa, &init_nodes, 0);
995 /* We don't check ERR here, since the initial state must not be NULL. */
996 if (BE (dfa->init_state == NULL, 0))
997 return err;
998 if (dfa->init_state->has_constraint)
999 {
1000 dfa->init_state_word = re_acquire_state_context (&err, dfa, &init_nodes,
1001 CONTEXT_WORD);
1002 dfa->init_state_nl = re_acquire_state_context (&err, dfa, &init_nodes,
1003 CONTEXT_NEWLINE);
1004 dfa->init_state_begbuf = re_acquire_state_context (&err, dfa,
1005 &init_nodes,
1006 CONTEXT_NEWLINE
1007 | CONTEXT_BEGBUF);
1008 if (BE (dfa->init_state_word == NULL || dfa->init_state_nl == NULL
1009 || dfa->init_state_begbuf == NULL, 0))
1010 return err;
1011 }
1012 else
1013 dfa->init_state_word = dfa->init_state_nl
1014 = dfa->init_state_begbuf = dfa->init_state;
1015
1016 re_node_set_free (&init_nodes);
1017 return REG_NOERROR;
1018}
1019
1020
1021#ifdef RE_ENABLE_I18N
1022/* If it is possible to do searching in single byte encoding instead of UTF-8
1023 to speed things up, set dfa->mb_cur_max to 1, clear is_utf8 and change
1024 DFA nodes where needed. */
1025
1026static void
1027optimize_utf8 (re_dfa_t *dfa)
1028{
1029 int node, i, mb_chars = 0, has_period = 0;
1030
1031 for (node = 0; node < dfa->nodes_len; ++node)
1032 switch (dfa->nodes[node].type)
1033 {
1034 case CHARACTER:
1035 if (dfa->nodes[node].opr.c >= 0x80)
1036 mb_chars = 1;
1037 break;
1038 case ANCHOR:
1039 switch (dfa->nodes[node].opr.idx)
1040 {
1041 case LINE_FIRST:
1042 case LINE_LAST:
1043 case BUF_FIRST:
1044 case BUF_LAST:
1045 break;
1046 default:
1047 /* Word anchors etc. cannot be handled. */
1048 return;
1049 }
1050 break;
1051 case OP_PERIOD:
1052 has_period = 1;
1053 break;
1054 case OP_BACK_REF:
1055 case OP_ALT:
1056 case END_OF_RE:
1057 case OP_DUP_ASTERISK:
1058 case OP_OPEN_SUBEXP:
1059 case OP_CLOSE_SUBEXP:
1060 break;
1061 case COMPLEX_BRACKET:
1062 return;
1063 case SIMPLE_BRACKET:
1064 /* Just double check. The non-ASCII range starts at 0x80. */
1065 assert (0x80 % BITSET_WORD_BITS == 0);
1066 for (i = 0x80 / BITSET_WORD_BITS; i < BITSET_WORDS; ++i)
1067 if (dfa->nodes[node].opr.sbcset[i])
1068 return;
1069 break;
1070 default:
1071 abort ();
1072 }
1073
1074 if (mb_chars || has_period)
1075 for (node = 0; node < dfa->nodes_len; ++node)
1076 {
1077 if (dfa->nodes[node].type == CHARACTER
1078 && dfa->nodes[node].opr.c >= 0x80)
1079 dfa->nodes[node].mb_partial = 0;
1080 else if (dfa->nodes[node].type == OP_PERIOD)
1081 dfa->nodes[node].type = OP_UTF8_PERIOD;
1082 }
1083
1084 /* The search can be in single byte locale. */
1085 dfa->mb_cur_max = 1;
1086 dfa->is_utf8 = 0;
1087 dfa->has_mb_node = dfa->nbackref > 0 || has_period;
1088}
1089#endif
1090
1091
1092/* Analyze the structure tree, and calculate "first", "next", "edest",
1093 "eclosure", and "inveclosure". */
1094
1095static reg_errcode_t
1096analyze (regex_t *preg)
1097{
1098 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1099 reg_errcode_t ret;
1100
1101 /* Allocate arrays. */
1102 dfa->nexts = re_malloc (int, dfa->nodes_alloc);
1103 dfa->org_indices = re_malloc (int, dfa->nodes_alloc);
1104 dfa->edests = re_malloc (re_node_set, dfa->nodes_alloc);
1105 dfa->eclosures = re_malloc (re_node_set, dfa->nodes_alloc);
1106 if (BE (dfa->nexts == NULL || dfa->org_indices == NULL || dfa->edests == NULL
1107 || dfa->eclosures == NULL, 0))
1108 return REG_ESPACE;
1109
1110 dfa->subexp_map = re_malloc (int, preg->re_nsub);
1111 if (dfa->subexp_map != NULL)
1112 {
1113 int i;
1114 for (i = 0; i < preg->re_nsub; i++)
1115 dfa->subexp_map[i] = i;
1116 preorder (dfa->str_tree, optimize_subexps, dfa);
1117 for (i = 0; i < preg->re_nsub; i++)
1118 if (dfa->subexp_map[i] != i)
1119 break;
1120 if (i == preg->re_nsub)
1121 {
1122 free (dfa->subexp_map);
1123 dfa->subexp_map = NULL;
1124 }
1125 }
1126
1127 ret = postorder (dfa->str_tree, lower_subexps, preg);
1128 if (BE (ret != REG_NOERROR, 0))
1129 return ret;
1130 ret = postorder (dfa->str_tree, calc_first, dfa);
1131 if (BE (ret != REG_NOERROR, 0))
1132 return ret;
1133 preorder (dfa->str_tree, calc_next, dfa);
1134 ret = preorder (dfa->str_tree, link_nfa_nodes, dfa);
1135 if (BE (ret != REG_NOERROR, 0))
1136 return ret;
1137 ret = calc_eclosure (dfa);
1138 if (BE (ret != REG_NOERROR, 0))
1139 return ret;
1140
1141 /* We only need this during the prune_impossible_nodes pass in regexec.c;
1142 skip it if p_i_n will not run, as calc_inveclosure can be quadratic. */
1143 if ((!preg->no_sub && preg->re_nsub > 0 && dfa->has_plural_match)
1144 || dfa->nbackref)
1145 {
1146 dfa->inveclosures = re_malloc (re_node_set, dfa->nodes_len);
1147 if (BE (dfa->inveclosures == NULL, 0))
1148 return REG_ESPACE;
1149 ret = calc_inveclosure (dfa);
1150 }
1151
1152 return ret;
1153}
1154
1155/* Our parse trees are very unbalanced, so we cannot use a stack to
1156 implement parse tree visits. Instead, we use parent pointers and
1157 some hairy code in these two functions. */
1158static reg_errcode_t
1159postorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1160 void *extra)
1161{
1162 bin_tree_t *node, *prev;
1163
1164 for (node = root; ; )
1165 {
1166 /* Descend down the tree, preferably to the left (or to the right
1167 if that's the only child). */
1168 while (node->left || node->right)
1169 if (node->left)
1170 node = node->left;
1171 else
1172 node = node->right;
1173
1174 do
1175 {
1176 reg_errcode_t err = fn (extra, node);
1177 if (BE (err != REG_NOERROR, 0))
1178 return err;
1179 if (node->parent == NULL)
1180 return REG_NOERROR;
1181 prev = node;
1182 node = node->parent;
1183 }
1184 /* Go up while we have a node that is reached from the right. */
1185 while (node->right == prev || node->right == NULL);
1186 node = node->right;
1187 }
1188}
1189
1190static reg_errcode_t
1191preorder (bin_tree_t *root, reg_errcode_t (fn (void *, bin_tree_t *)),
1192 void *extra)
1193{
1194 bin_tree_t *node;
1195
1196 for (node = root; ; )
1197 {
1198 reg_errcode_t err = fn (extra, node);
1199 if (BE (err != REG_NOERROR, 0))
1200 return err;
1201
1202 /* Go to the left node, or up and to the right. */
1203 if (node->left)
1204 node = node->left;
1205 else
1206 {
1207 bin_tree_t *prev = NULL;
1208 while (node->right == prev || node->right == NULL)
1209 {
1210 prev = node;
1211 node = node->parent;
1212 if (!node)
1213 return REG_NOERROR;
1214 }
1215 node = node->right;
1216 }
1217 }
1218}
1219
1220/* Optimization pass: if a SUBEXP is entirely contained, strip it and tell
1221 re_search_internal to map the inner one's opr.idx to this one's. Adjust
1222 backreferences as well. Requires a preorder visit. */
1223static reg_errcode_t
1224optimize_subexps (void *extra, bin_tree_t *node)
1225{
1226 re_dfa_t *dfa = (re_dfa_t *) extra;
1227
1228 if (node->token.type == OP_BACK_REF && dfa->subexp_map)
1229 {
1230 int idx = node->token.opr.idx;
1231 node->token.opr.idx = dfa->subexp_map[idx];
1232 dfa->used_bkref_map |= 1 << node->token.opr.idx;
1233 }
1234
1235 else if (node->token.type == SUBEXP
1236 && node->left && node->left->token.type == SUBEXP)
1237 {
1238 int other_idx = node->left->token.opr.idx;
1239
1240 node->left = node->left->left;
1241 if (node->left)
1242 node->left->parent = node;
1243
1244 dfa->subexp_map[other_idx] = dfa->subexp_map[node->token.opr.idx];
1245 if (other_idx < BITSET_WORD_BITS)
1246 dfa->used_bkref_map &= ~((bitset_word_t) 1 << other_idx);
1247 }
1248
1249 return REG_NOERROR;
1250}
1251
1252/* Lowering pass: Turn each SUBEXP node into the appropriate concatenation
1253 of OP_OPEN_SUBEXP, the body of the SUBEXP (if any) and OP_CLOSE_SUBEXP. */
1254static reg_errcode_t
1255lower_subexps (void *extra, bin_tree_t *node)
1256{
1257 regex_t *preg = (regex_t *) extra;
1258 reg_errcode_t err = REG_NOERROR;
1259
1260 if (node->left && node->left->token.type == SUBEXP)
1261 {
1262 node->left = lower_subexp (&err, preg, node->left);
1263 if (node->left)
1264 node->left->parent = node;
1265 }
1266 if (node->right && node->right->token.type == SUBEXP)
1267 {
1268 node->right = lower_subexp (&err, preg, node->right);
1269 if (node->right)
1270 node->right->parent = node;
1271 }
1272
1273 return err;
1274}
1275
1276static bin_tree_t *
1277lower_subexp (reg_errcode_t *err, regex_t *preg, bin_tree_t *node)
1278{
1279 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
1280 bin_tree_t *body = node->left;
1281 bin_tree_t *op, *cls, *tree1, *tree;
1282
1283 if (preg->no_sub
1284 /* We do not optimize empty subexpressions, because otherwise we may
1285 have bad CONCAT nodes with NULL children. This is obviously not
1286 very common, so we do not lose much. An example that triggers
1287 this case is the sed "script" /\(\)/x. */
1288 && node->left != NULL
1289 && (node->token.opr.idx >= BITSET_WORD_BITS
1290 || !(dfa->used_bkref_map
1291 & ((bitset_word_t) 1 << node->token.opr.idx))))
1292 return node->left;
1293
1294 /* Convert the SUBEXP node to the concatenation of an
1295 OP_OPEN_SUBEXP, the contents, and an OP_CLOSE_SUBEXP. */
1296 op = create_tree (dfa, NULL, NULL, OP_OPEN_SUBEXP);
1297 cls = create_tree (dfa, NULL, NULL, OP_CLOSE_SUBEXP);
1298 tree1 = body ? create_tree (dfa, body, cls, CONCAT) : cls;
1299 tree = create_tree (dfa, op, tree1, CONCAT);
1300 if (BE (tree == NULL || tree1 == NULL || op == NULL || cls == NULL, 0))
1301 {
1302 *err = REG_ESPACE;
1303 return NULL;
1304 }
1305
1306 op->token.opr.idx = cls->token.opr.idx = node->token.opr.idx;
1307 op->token.opt_subexp = cls->token.opt_subexp = node->token.opt_subexp;
1308 return tree;
1309}
1310
1311/* Pass 1 in building the NFA: compute FIRST and create unlinked automaton
1312 nodes. Requires a postorder visit. */
1313static reg_errcode_t
1314calc_first (void *extra, bin_tree_t *node)
1315{
1316 re_dfa_t *dfa = (re_dfa_t *) extra;
1317 if (node->token.type == CONCAT)
1318 {
1319 node->first = node->left->first;
1320 node->node_idx = node->left->node_idx;
1321 }
1322 else
1323 {
1324 node->first = node;
1325 node->node_idx = re_dfa_add_node (dfa, node->token);
1326 if (BE (node->node_idx == -1, 0))
1327 return REG_ESPACE;
1328 }
1329 return REG_NOERROR;
1330}
1331
1332/* Pass 2: compute NEXT on the tree. Preorder visit. */
1333static reg_errcode_t
1334calc_next (void *extra, bin_tree_t *node)
1335{
1336 switch (node->token.type)
1337 {
1338 case OP_DUP_ASTERISK:
1339 node->left->next = node;
1340 break;
1341 case CONCAT:
1342 node->left->next = node->right->first;
1343 node->right->next = node->next;
1344 break;
1345 default:
1346 if (node->left)
1347 node->left->next = node->next;
1348 if (node->right)
1349 node->right->next = node->next;
1350 break;
1351 }
1352 return REG_NOERROR;
1353}
1354
1355/* Pass 3: link all DFA nodes to their NEXT node (any order will do). */
1356static reg_errcode_t
1357link_nfa_nodes (void *extra, bin_tree_t *node)
1358{
1359 re_dfa_t *dfa = (re_dfa_t *) extra;
1360 int idx = node->node_idx;
1361 reg_errcode_t err = REG_NOERROR;
1362
1363 switch (node->token.type)
1364 {
1365 case CONCAT:
1366 break;
1367
1368 case END_OF_RE:
1369 assert (node->next == NULL);
1370 break;
1371
1372 case OP_DUP_ASTERISK:
1373 case OP_ALT:
1374 {
1375 int left, right;
1376 dfa->has_plural_match = 1;
1377 if (node->left != NULL)
1378 left = node->left->first->node_idx;
1379 else
1380 left = node->next->node_idx;
1381 if (node->right != NULL)
1382 right = node->right->first->node_idx;
1383 else
1384 right = node->next->node_idx;
1385 assert (left > -1);
1386 assert (right > -1);
1387 err = re_node_set_init_2 (dfa->edests + idx, left, right);
1388 }
1389 break;
1390
1391 case ANCHOR:
1392 case OP_OPEN_SUBEXP:
1393 case OP_CLOSE_SUBEXP:
1394 err = re_node_set_init_1 (dfa->edests + idx, node->next->node_idx);
1395 break;
1396
1397 case OP_BACK_REF:
1398 dfa->nexts[idx] = node->next->node_idx;
1399 if (node->token.type == OP_BACK_REF)
1400 re_node_set_init_1 (dfa->edests + idx, dfa->nexts[idx]);
1401 break;
1402
1403 default:
1404 assert (!IS_EPSILON_NODE (node->token.type));
1405 dfa->nexts[idx] = node->next->node_idx;
1406 break;
1407 }
1408
1409 return err;
1410}
1411
1412/* Duplicate the epsilon closure of the node ROOT_NODE.
1413 Note that duplicated nodes have constraint INIT_CONSTRAINT in addition
1414 to their own constraint. */
1415
1416static reg_errcode_t
1417internal_function
1418duplicate_node_closure (re_dfa_t *dfa, int top_org_node, int top_clone_node,
1419 int root_node, unsigned int init_constraint)
1420{
1421 int org_node, clone_node, ret;
1422 unsigned int constraint = init_constraint;
1423 for (org_node = top_org_node, clone_node = top_clone_node;;)
1424 {
1425 int org_dest, clone_dest;
1426 if (dfa->nodes[org_node].type == OP_BACK_REF)
1427 {
1428 /* If the back reference epsilon-transit, its destination must
1429 also have the constraint. Then duplicate the epsilon closure
1430 of the destination of the back reference, and store it in
1431 edests of the back reference. */
1432 org_dest = dfa->nexts[org_node];
1433 re_node_set_empty (dfa->edests + clone_node);
1434 clone_dest = duplicate_node (dfa, org_dest, constraint);
1435 if (BE (clone_dest == -1, 0))
1436 return REG_ESPACE;
1437 dfa->nexts[clone_node] = dfa->nexts[org_node];
1438 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1439 if (BE (ret < 0, 0))
1440 return REG_ESPACE;
1441 }
1442 else if (dfa->edests[org_node].nelem == 0)
1443 {
1444 /* In case of the node can't epsilon-transit, don't duplicate the
1445 destination and store the original destination as the
1446 destination of the node. */
1447 dfa->nexts[clone_node] = dfa->nexts[org_node];
1448 break;
1449 }
1450 else if (dfa->edests[org_node].nelem == 1)
1451 {
1452 /* In case of the node can epsilon-transit, and it has only one
1453 destination. */
1454 org_dest = dfa->edests[org_node].elems[0];
1455 re_node_set_empty (dfa->edests + clone_node);
1456 if (dfa->nodes[org_node].type == ANCHOR)
1457 {
1458 /* In case of the node has another constraint, append it. */
1459 if (org_node == root_node && clone_node != org_node)
1460 {
1461 /* ...but if the node is root_node itself, it means the
1462 epsilon closure have a loop, then tie it to the
1463 destination of the root_node. */
1464 ret = re_node_set_insert (dfa->edests + clone_node,
1465 org_dest);
1466 if (BE (ret < 0, 0))
1467 return REG_ESPACE;
1468 break;
1469 }
1470 constraint |= dfa->nodes[org_node].opr.ctx_type;
1471 }
1472 clone_dest = duplicate_node (dfa, org_dest, constraint);
1473 if (BE (clone_dest == -1, 0))
1474 return REG_ESPACE;
1475 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1476 if (BE (ret < 0, 0))
1477 return REG_ESPACE;
1478 }
1479 else /* dfa->edests[org_node].nelem == 2 */
1480 {
1481 /* In case of the node can epsilon-transit, and it has two
1482 destinations. In the bin_tree_t and DFA, that's '|' and '*'. */
1483 org_dest = dfa->edests[org_node].elems[0];
1484 re_node_set_empty (dfa->edests + clone_node);
1485 /* Search for a duplicated node which satisfies the constraint. */
1486 clone_dest = search_duplicated_node (dfa, org_dest, constraint);
1487 if (clone_dest == -1)
1488 {
1489 /* There are no such a duplicated node, create a new one. */
1490 reg_errcode_t err;
1491 clone_dest = duplicate_node (dfa, org_dest, constraint);
1492 if (BE (clone_dest == -1, 0))
1493 return REG_ESPACE;
1494 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1495 if (BE (ret < 0, 0))
1496 return REG_ESPACE;
1497 err = duplicate_node_closure (dfa, org_dest, clone_dest,
1498 root_node, constraint);
1499 if (BE (err != REG_NOERROR, 0))
1500 return err;
1501 }
1502 else
1503 {
1504 /* There are a duplicated node which satisfy the constraint,
1505 use it to avoid infinite loop. */
1506 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1507 if (BE (ret < 0, 0))
1508 return REG_ESPACE;
1509 }
1510
1511 org_dest = dfa->edests[org_node].elems[1];
1512 clone_dest = duplicate_node (dfa, org_dest, constraint);
1513 if (BE (clone_dest == -1, 0))
1514 return REG_ESPACE;
1515 ret = re_node_set_insert (dfa->edests + clone_node, clone_dest);
1516 if (BE (ret < 0, 0))
1517 return REG_ESPACE;
1518 }
1519 org_node = org_dest;
1520 clone_node = clone_dest;
1521 }
1522 return REG_NOERROR;
1523}
1524
1525/* Search for a node which is duplicated from the node ORG_NODE, and
1526 satisfies the constraint CONSTRAINT. */
1527
1528static int
1529search_duplicated_node (const re_dfa_t *dfa, int org_node,
1530 unsigned int constraint)
1531{
1532 int idx;
1533 for (idx = dfa->nodes_len - 1; dfa->nodes[idx].duplicated && idx > 0; --idx)
1534 {
1535 if (org_node == dfa->org_indices[idx]
1536 && constraint == dfa->nodes[idx].constraint)
1537 return idx; /* Found. */
1538 }
1539 return -1; /* Not found. */
1540}
1541
1542/* Duplicate the node whose index is ORG_IDX and set the constraint CONSTRAINT.
1543 Return the index of the new node, or -1 if insufficient storage is
1544 available. */
1545
1546static int
1547duplicate_node (re_dfa_t *dfa, int org_idx, unsigned int constraint)
1548{
1549 int dup_idx = re_dfa_add_node (dfa, dfa->nodes[org_idx]);
1550 if (BE (dup_idx != -1, 1))
1551 {
1552 dfa->nodes[dup_idx].constraint = constraint;
1553 if (dfa->nodes[org_idx].type == ANCHOR)
1554 dfa->nodes[dup_idx].constraint |= dfa->nodes[org_idx].opr.ctx_type;
1555 dfa->nodes[dup_idx].duplicated = 1;
1556
1557 /* Store the index of the original node. */
1558 dfa->org_indices[dup_idx] = org_idx;
1559 }
1560 return dup_idx;
1561}
1562
1563static reg_errcode_t
1564calc_inveclosure (re_dfa_t *dfa)
1565{
1566 int src, idx, ret;
1567 for (idx = 0; idx < dfa->nodes_len; ++idx)
1568 re_node_set_init_empty (dfa->inveclosures + idx);
1569
1570 for (src = 0; src < dfa->nodes_len; ++src)
1571 {
1572 int *elems = dfa->eclosures[src].elems;
1573 for (idx = 0; idx < dfa->eclosures[src].nelem; ++idx)
1574 {
1575 ret = re_node_set_insert_last (dfa->inveclosures + elems[idx], src);
1576 if (BE (ret == -1, 0))
1577 return REG_ESPACE;
1578 }
1579 }
1580
1581 return REG_NOERROR;
1582}
1583
1584/* Calculate "eclosure" for all the node in DFA. */
1585
1586static reg_errcode_t
1587calc_eclosure (re_dfa_t *dfa)
1588{
1589 int node_idx, incomplete;
1590#ifdef DEBUG
1591 assert (dfa->nodes_len > 0);
1592#endif
1593 incomplete = 0;
1594 /* For each nodes, calculate epsilon closure. */
1595 for (node_idx = 0; ; ++node_idx)
1596 {
1597 reg_errcode_t err;
1598 re_node_set eclosure_elem;
1599 if (node_idx == dfa->nodes_len)
1600 {
1601 if (!incomplete)
1602 break;
1603 incomplete = 0;
1604 node_idx = 0;
1605 }
1606
1607#ifdef DEBUG
1608 assert (dfa->eclosures[node_idx].nelem != -1);
1609#endif
1610
1611 /* If we have already calculated, skip it. */
1612 if (dfa->eclosures[node_idx].nelem != 0)
1613 continue;
1614 /* Calculate epsilon closure of `node_idx'. */
1615 err = calc_eclosure_iter (&eclosure_elem, dfa, node_idx, 1);
1616 if (BE (err != REG_NOERROR, 0))
1617 return err;
1618
1619 if (dfa->eclosures[node_idx].nelem == 0)
1620 {
1621 incomplete = 1;
1622 re_node_set_free (&eclosure_elem);
1623 }
1624 }
1625 return REG_NOERROR;
1626}
1627
1628/* Calculate epsilon closure of NODE. */
1629
1630static reg_errcode_t
1631calc_eclosure_iter (re_node_set *new_set, re_dfa_t *dfa, int node, int root)
1632{
1633 reg_errcode_t err;
1634 unsigned int constraint;
1635 int i, incomplete;
1636 re_node_set eclosure;
1637 incomplete = 0;
1638 err = re_node_set_alloc (&eclosure, dfa->edests[node].nelem + 1);
1639 if (BE (err != REG_NOERROR, 0))
1640 return err;
1641
1642 /* This indicates that we are calculating this node now.
1643 We reference this value to avoid infinite loop. */
1644 dfa->eclosures[node].nelem = -1;
1645
1646 constraint = ((dfa->nodes[node].type == ANCHOR)
1647 ? dfa->nodes[node].opr.ctx_type : 0);
1648 /* If the current node has constraints, duplicate all nodes.
1649 Since they must inherit the constraints. */
1650 if (constraint
1651 && dfa->edests[node].nelem
1652 && !dfa->nodes[dfa->edests[node].elems[0]].duplicated)
1653 {
1654 int org_node, cur_node;
1655 org_node = cur_node = node;
1656 err = duplicate_node_closure (dfa, node, node, node, constraint);
1657 if (BE (err != REG_NOERROR, 0))
1658 return err;
1659 }
1660
1661 /* Expand each epsilon destination nodes. */
1662 if (IS_EPSILON_NODE(dfa->nodes[node].type))
1663 for (i = 0; i < dfa->edests[node].nelem; ++i)
1664 {
1665 re_node_set eclosure_elem;
1666 int edest = dfa->edests[node].elems[i];
1667 /* If calculating the epsilon closure of `edest' is in progress,
1668 return intermediate result. */
1669 if (dfa->eclosures[edest].nelem == -1)
1670 {
1671 incomplete = 1;
1672 continue;
1673 }
1674 /* If we haven't calculated the epsilon closure of `edest' yet,
1675 calculate now. Otherwise use calculated epsilon closure. */
1676 if (dfa->eclosures[edest].nelem == 0)
1677 {
1678 err = calc_eclosure_iter (&eclosure_elem, dfa, edest, 0);
1679 if (BE (err != REG_NOERROR, 0))
1680 return err;
1681 }
1682 else
1683 eclosure_elem = dfa->eclosures[edest];
1684 /* Merge the epsilon closure of `edest'. */
1685 re_node_set_merge (&eclosure, &eclosure_elem);
1686 /* If the epsilon closure of `edest' is incomplete,
1687 the epsilon closure of this node is also incomplete. */
1688 if (dfa->eclosures[edest].nelem == 0)
1689 {
1690 incomplete = 1;
1691 re_node_set_free (&eclosure_elem);
1692 }
1693 }
1694
1695 /* Epsilon closures include itself. */
1696 re_node_set_insert (&eclosure, node);
1697 if (incomplete && !root)
1698 dfa->eclosures[node].nelem = 0;
1699 else
1700 dfa->eclosures[node] = eclosure;
1701 *new_set = eclosure;
1702 return REG_NOERROR;
1703}
1704
1705
1706/* Functions for token which are used in the parser. */
1707
1708/* Fetch a token from INPUT.
1709 We must not use this function inside bracket expressions. */
1710
1711static void
1712internal_function
1713fetch_token (re_token_t *result, re_string_t *input, reg_syntax_t syntax)
1714{
1715 re_string_skip_bytes (input, peek_token (result, input, syntax));
1716}
1717
1718/* Peek a token from INPUT, and return the length of the token.
1719 We must not use this function inside bracket expressions. */
1720
1721static int
1722internal_function
1723peek_token (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1724{
1725 unsigned char c;
1726
1727 if (re_string_eoi (input))
1728 {
1729 token->type = END_OF_RE;
1730 return 0;
1731 }
1732
1733 c = re_string_peek_byte (input, 0);
1734 token->opr.c = c;
1735
1736 token->word_char = 0;
1737#ifdef RE_ENABLE_I18N
1738 token->mb_partial = 0;
1739 if (input->mb_cur_max > 1 &&
1740 !re_string_first_byte (input, re_string_cur_idx (input)))
1741 {
1742 token->type = CHARACTER;
1743 token->mb_partial = 1;
1744 return 1;
1745 }
1746#endif
1747 if (c == '\\')
1748 {
1749 unsigned char c2;
1750 if (re_string_cur_idx (input) + 1 >= re_string_length (input))
1751 {
1752 token->type = BACK_SLASH;
1753 return 1;
1754 }
1755
1756 c2 = re_string_peek_byte_case (input, 1);
1757 token->opr.c = c2;
1758 token->type = CHARACTER;
1759#ifdef RE_ENABLE_I18N
1760 if (input->mb_cur_max > 1)
1761 {
1762 wint_t wc = re_string_wchar_at (input,
1763 re_string_cur_idx (input) + 1);
1764 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1765 }
1766 else
1767#endif
1768 token->word_char = IS_WORD_CHAR (c2) != 0;
1769
1770 switch (c2)
1771 {
1772 case '|':
1773 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_NO_BK_VBAR))
1774 token->type = OP_ALT;
1775 break;
1776 case '1': case '2': case '3': case '4': case '5':
1777 case '6': case '7': case '8': case '9':
1778 if (!(syntax & RE_NO_BK_REFS))
1779 {
1780 token->type = OP_BACK_REF;
1781 token->opr.idx = c2 - '1';
1782 }
1783 break;
1784 case '<':
1785 if (!(syntax & RE_NO_GNU_OPS))
1786 {
1787 token->type = ANCHOR;
1788 token->opr.ctx_type = WORD_FIRST;
1789 }
1790 break;
1791 case '>':
1792 if (!(syntax & RE_NO_GNU_OPS))
1793 {
1794 token->type = ANCHOR;
1795 token->opr.ctx_type = WORD_LAST;
1796 }
1797 break;
1798 case 'b':
1799 if (!(syntax & RE_NO_GNU_OPS))
1800 {
1801 token->type = ANCHOR;
1802 token->opr.ctx_type = WORD_DELIM;
1803 }
1804 break;
1805 case 'B':
1806 if (!(syntax & RE_NO_GNU_OPS))
1807 {
1808 token->type = ANCHOR;
1809 token->opr.ctx_type = NOT_WORD_DELIM;
1810 }
1811 break;
1812 case 'w':
1813 if (!(syntax & RE_NO_GNU_OPS))
1814 token->type = OP_WORD;
1815 break;
1816 case 'W':
1817 if (!(syntax & RE_NO_GNU_OPS))
1818 token->type = OP_NOTWORD;
1819 break;
1820 case 's':
1821 if (!(syntax & RE_NO_GNU_OPS))
1822 token->type = OP_SPACE;
1823 break;
1824 case 'S':
1825 if (!(syntax & RE_NO_GNU_OPS))
1826 token->type = OP_NOTSPACE;
1827 break;
1828 case '`':
1829 if (!(syntax & RE_NO_GNU_OPS))
1830 {
1831 token->type = ANCHOR;
1832 token->opr.ctx_type = BUF_FIRST;
1833 }
1834 break;
1835 case '\'':
1836 if (!(syntax & RE_NO_GNU_OPS))
1837 {
1838 token->type = ANCHOR;
1839 token->opr.ctx_type = BUF_LAST;
1840 }
1841 break;
1842 case '(':
1843 if (!(syntax & RE_NO_BK_PARENS))
1844 token->type = OP_OPEN_SUBEXP;
1845 break;
1846 case ')':
1847 if (!(syntax & RE_NO_BK_PARENS))
1848 token->type = OP_CLOSE_SUBEXP;
1849 break;
1850 case '+':
1851 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1852 token->type = OP_DUP_PLUS;
1853 break;
1854 case '?':
1855 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_BK_PLUS_QM))
1856 token->type = OP_DUP_QUESTION;
1857 break;
1858 case '{':
1859 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1860 token->type = OP_OPEN_DUP_NUM;
1861 break;
1862 case '}':
1863 if ((syntax & RE_INTERVALS) && (!(syntax & RE_NO_BK_BRACES)))
1864 token->type = OP_CLOSE_DUP_NUM;
1865 break;
1866 default:
1867 break;
1868 }
1869 return 2;
1870 }
1871
1872 token->type = CHARACTER;
1873#ifdef RE_ENABLE_I18N
1874 if (input->mb_cur_max > 1)
1875 {
1876 wint_t wc = re_string_wchar_at (input, re_string_cur_idx (input));
1877 token->word_char = IS_WIDE_WORD_CHAR (wc) != 0;
1878 }
1879 else
1880#endif
1881 token->word_char = IS_WORD_CHAR (token->opr.c);
1882
1883 switch (c)
1884 {
1885 case '\n':
1886 if (syntax & RE_NEWLINE_ALT)
1887 token->type = OP_ALT;
1888 break;
1889 case '|':
1890 if (!(syntax & RE_LIMITED_OPS) && (syntax & RE_NO_BK_VBAR))
1891 token->type = OP_ALT;
1892 break;
1893 case '*':
1894 token->type = OP_DUP_ASTERISK;
1895 break;
1896 case '+':
1897 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1898 token->type = OP_DUP_PLUS;
1899 break;
1900 case '?':
1901 if (!(syntax & RE_LIMITED_OPS) && !(syntax & RE_BK_PLUS_QM))
1902 token->type = OP_DUP_QUESTION;
1903 break;
1904 case '{':
1905 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1906 token->type = OP_OPEN_DUP_NUM;
1907 break;
1908 case '}':
1909 if ((syntax & RE_INTERVALS) && (syntax & RE_NO_BK_BRACES))
1910 token->type = OP_CLOSE_DUP_NUM;
1911 break;
1912 case '(':
1913 if (syntax & RE_NO_BK_PARENS)
1914 token->type = OP_OPEN_SUBEXP;
1915 break;
1916 case ')':
1917 if (syntax & RE_NO_BK_PARENS)
1918 token->type = OP_CLOSE_SUBEXP;
1919 break;
1920 case '[':
1921 token->type = OP_OPEN_BRACKET;
1922 break;
1923 case '.':
1924 token->type = OP_PERIOD;
1925 break;
1926 case '^':
1927 if (!(syntax & (RE_CONTEXT_INDEP_ANCHORS | RE_CARET_ANCHORS_HERE)) &&
1928 re_string_cur_idx (input) != 0)
1929 {
1930 char prev = re_string_peek_byte (input, -1);
1931 if (!(syntax & RE_NEWLINE_ALT) || prev != '\n')
1932 break;
1933 }
1934 token->type = ANCHOR;
1935 token->opr.ctx_type = LINE_FIRST;
1936 break;
1937 case '$':
1938 if (!(syntax & RE_CONTEXT_INDEP_ANCHORS) &&
1939 re_string_cur_idx (input) + 1 != re_string_length (input))
1940 {
1941 re_token_t next;
1942 re_string_skip_bytes (input, 1);
1943 peek_token (&next, input, syntax);
1944 re_string_skip_bytes (input, -1);
1945 if (next.type != OP_ALT && next.type != OP_CLOSE_SUBEXP)
1946 break;
1947 }
1948 token->type = ANCHOR;
1949 token->opr.ctx_type = LINE_LAST;
1950 break;
1951 default:
1952 break;
1953 }
1954 return 1;
1955}
1956
1957/* Peek a token from INPUT, and return the length of the token.
1958 We must not use this function out of bracket expressions. */
1959
1960static int
1961internal_function
1962peek_token_bracket (re_token_t *token, re_string_t *input, reg_syntax_t syntax)
1963{
1964 unsigned char c;
1965 if (re_string_eoi (input))
1966 {
1967 token->type = END_OF_RE;
1968 return 0;
1969 }
1970 c = re_string_peek_byte (input, 0);
1971 token->opr.c = c;
1972
1973#ifdef RE_ENABLE_I18N
1974 if (input->mb_cur_max > 1 &&
1975 !re_string_first_byte (input, re_string_cur_idx (input)))
1976 {
1977 token->type = CHARACTER;
1978 return 1;
1979 }
1980#endif /* RE_ENABLE_I18N */
1981
1982 if (c == '\\' && (syntax & RE_BACKSLASH_ESCAPE_IN_LISTS)
1983 && re_string_cur_idx (input) + 1 < re_string_length (input))
1984 {
1985 /* In this case, '\' escape a character. */
1986 unsigned char c2;
1987 re_string_skip_bytes (input, 1);
1988 c2 = re_string_peek_byte (input, 0);
1989 token->opr.c = c2;
1990 token->type = CHARACTER;
1991 return 1;
1992 }
1993 if (c == '[') /* '[' is a special char in a bracket exps. */
1994 {
1995 unsigned char c2;
1996 int token_len;
1997 if (re_string_cur_idx (input) + 1 < re_string_length (input))
1998 c2 = re_string_peek_byte (input, 1);
1999 else
2000 c2 = 0;
2001 token->opr.c = c2;
2002 token_len = 2;
2003 switch (c2)
2004 {
2005 case '.':
2006 token->type = OP_OPEN_COLL_ELEM;
2007 break;
2008 case '=':
2009 token->type = OP_OPEN_EQUIV_CLASS;
2010 break;
2011 case ':':
2012 if (syntax & RE_CHAR_CLASSES)
2013 {
2014 token->type = OP_OPEN_CHAR_CLASS;
2015 break;
2016 }
2017 /* else fall through. */
2018 default:
2019 token->type = CHARACTER;
2020 token->opr.c = c;
2021 token_len = 1;
2022 break;
2023 }
2024 return token_len;
2025 }
2026 switch (c)
2027 {
2028 case '-':
2029 token->type = OP_CHARSET_RANGE;
2030 break;
2031 case ']':
2032 token->type = OP_CLOSE_BRACKET;
2033 break;
2034 case '^':
2035 token->type = OP_NON_MATCH_LIST;
2036 break;
2037 default:
2038 token->type = CHARACTER;
2039 }
2040 return 1;
2041}
2042
2043
2044/* Functions for parser. */
2045
2046/* Entry point of the parser.
2047 Parse the regular expression REGEXP and return the structure tree.
2048 If an error is occured, ERR is set by error code, and return NULL.
2049 This function build the following tree, from regular expression <reg_exp>:
2050 CAT
2051 / \
2052 / \
2053 <reg_exp> EOR
2054
2055 CAT means concatenation.
2056 EOR means end of regular expression. */
2057
2058static bin_tree_t *
2059parse (re_string_t *regexp, regex_t *preg, reg_syntax_t syntax,
2060 reg_errcode_t *err)
2061{
2062 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2063 bin_tree_t *tree, *eor, *root;
2064 re_token_t current_token;
2065 dfa->syntax = syntax;
2066 fetch_token (&current_token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2067 tree = parse_reg_exp (regexp, preg, &current_token, syntax, 0, err);
2068 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2069 return NULL;
2070 eor = create_tree (dfa, NULL, NULL, END_OF_RE);
2071 if (tree != NULL)
2072 root = create_tree (dfa, tree, eor, CONCAT);
2073 else
2074 root = eor;
2075 if (BE (eor == NULL || root == NULL, 0))
2076 {
2077 *err = REG_ESPACE;
2078 return NULL;
2079 }
2080 return root;
2081}
2082
2083/* This function build the following tree, from regular expression
2084 <branch1>|<branch2>:
2085 ALT
2086 / \
2087 / \
2088 <branch1> <branch2>
2089
2090 ALT means alternative, which represents the operator `|'. */
2091
2092static bin_tree_t *
2093parse_reg_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2094 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2095{
2096 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2097 bin_tree_t *tree, *branch = NULL;
2098 tree = parse_branch (regexp, preg, token, syntax, nest, err);
2099 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2100 return NULL;
2101
2102 while (token->type == OP_ALT)
2103 {
2104 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2105 if (token->type != OP_ALT && token->type != END_OF_RE
2106 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2107 {
2108 branch = parse_branch (regexp, preg, token, syntax, nest, err);
2109 if (BE (*err != REG_NOERROR && branch == NULL, 0))
2110 return NULL;
2111 }
2112 else
2113 branch = NULL;
2114 tree = create_tree (dfa, tree, branch, OP_ALT);
2115 if (BE (tree == NULL, 0))
2116 {
2117 *err = REG_ESPACE;
2118 return NULL;
2119 }
2120 }
2121 return tree;
2122}
2123
2124/* This function build the following tree, from regular expression
2125 <exp1><exp2>:
2126 CAT
2127 / \
2128 / \
2129 <exp1> <exp2>
2130
2131 CAT means concatenation. */
2132
2133static bin_tree_t *
2134parse_branch (re_string_t *regexp, regex_t *preg, re_token_t *token,
2135 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2136{
2137 bin_tree_t *tree, *exp;
2138 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2139 tree = parse_expression (regexp, preg, token, syntax, nest, err);
2140 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2141 return NULL;
2142
2143 while (token->type != OP_ALT && token->type != END_OF_RE
2144 && (nest == 0 || token->type != OP_CLOSE_SUBEXP))
2145 {
2146 exp = parse_expression (regexp, preg, token, syntax, nest, err);
2147 if (BE (*err != REG_NOERROR && exp == NULL, 0))
2148 {
2149 return NULL;
2150 }
2151 if (tree != NULL && exp != NULL)
2152 {
2153 tree = create_tree (dfa, tree, exp, CONCAT);
2154 if (tree == NULL)
2155 {
2156 *err = REG_ESPACE;
2157 return NULL;
2158 }
2159 }
2160 else if (tree == NULL)
2161 tree = exp;
2162 /* Otherwise exp == NULL, we don't need to create new tree. */
2163 }
2164 return tree;
2165}
2166
2167/* This function build the following tree, from regular expression a*:
2168 *
2169 |
2170 a
2171*/
2172
2173static bin_tree_t *
2174parse_expression (re_string_t *regexp, regex_t *preg, re_token_t *token,
2175 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2176{
2177 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2178 bin_tree_t *tree;
2179 switch (token->type)
2180 {
2181 case CHARACTER:
2182 tree = create_token_tree (dfa, NULL, NULL, token);
2183 if (BE (tree == NULL, 0))
2184 {
2185 *err = REG_ESPACE;
2186 return NULL;
2187 }
2188#ifdef RE_ENABLE_I18N
2189 if (dfa->mb_cur_max > 1)
2190 {
2191 while (!re_string_eoi (regexp)
2192 && !re_string_first_byte (regexp, re_string_cur_idx (regexp)))
2193 {
2194 bin_tree_t *mbc_remain;
2195 fetch_token (token, regexp, syntax);
2196 mbc_remain = create_token_tree (dfa, NULL, NULL, token);
2197 tree = create_tree (dfa, tree, mbc_remain, CONCAT);
2198 if (BE (mbc_remain == NULL || tree == NULL, 0))
2199 {
2200 *err = REG_ESPACE;
2201 return NULL;
2202 }
2203 }
2204 }
2205#endif
2206 break;
2207 case OP_OPEN_SUBEXP:
2208 tree = parse_sub_exp (regexp, preg, token, syntax, nest + 1, err);
2209 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2210 return NULL;
2211 break;
2212 case OP_OPEN_BRACKET:
2213 tree = parse_bracket_exp (regexp, dfa, token, syntax, err);
2214 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2215 return NULL;
2216 break;
2217 case OP_BACK_REF:
2218 if (!BE (dfa->completed_bkref_map & (1 << token->opr.idx), 1))
2219 {
2220 *err = REG_ESUBREG;
2221 return NULL;
2222 }
2223 dfa->used_bkref_map |= 1 << token->opr.idx;
2224 tree = create_token_tree (dfa, NULL, NULL, token);
2225 if (BE (tree == NULL, 0))
2226 {
2227 *err = REG_ESPACE;
2228 return NULL;
2229 }
2230 ++dfa->nbackref;
2231 dfa->has_mb_node = 1;
2232 break;
2233 case OP_OPEN_DUP_NUM:
2234 if (syntax & RE_CONTEXT_INVALID_DUP)
2235 {
2236 *err = REG_BADRPT;
2237 return NULL;
2238 }
2239 /* FALLTHROUGH */
2240 case OP_DUP_ASTERISK:
2241 case OP_DUP_PLUS:
2242 case OP_DUP_QUESTION:
2243 if (syntax & RE_CONTEXT_INVALID_OPS)
2244 {
2245 *err = REG_BADRPT;
2246 return NULL;
2247 }
2248 else if (syntax & RE_CONTEXT_INDEP_OPS)
2249 {
2250 fetch_token (token, regexp, syntax);
2251 return parse_expression (regexp, preg, token, syntax, nest, err);
2252 }
2253 /* else fall through */
2254 case OP_CLOSE_SUBEXP:
2255 if ((token->type == OP_CLOSE_SUBEXP) &&
2256 !(syntax & RE_UNMATCHED_RIGHT_PAREN_ORD))
2257 {
2258 *err = REG_ERPAREN;
2259 return NULL;
2260 }
2261 /* else fall through */
2262 case OP_CLOSE_DUP_NUM:
2263 /* We treat it as a normal character. */
2264
2265 /* Then we can these characters as normal characters. */
2266 token->type = CHARACTER;
2267 /* mb_partial and word_char bits should be initialized already
2268 by peek_token. */
2269 tree = create_token_tree (dfa, NULL, NULL, token);
2270 if (BE (tree == NULL, 0))
2271 {
2272 *err = REG_ESPACE;
2273 return NULL;
2274 }
2275 break;
2276 case ANCHOR:
2277 if ((token->opr.ctx_type
2278 & (WORD_DELIM | NOT_WORD_DELIM | WORD_FIRST | WORD_LAST))
2279 && dfa->word_ops_used == 0)
2280 init_word_char (dfa);
2281 if (token->opr.ctx_type == WORD_DELIM
2282 || token->opr.ctx_type == NOT_WORD_DELIM)
2283 {
2284 bin_tree_t *tree_first, *tree_last;
2285 if (token->opr.ctx_type == WORD_DELIM)
2286 {
2287 token->opr.ctx_type = WORD_FIRST;
2288 tree_first = create_token_tree (dfa, NULL, NULL, token);
2289 token->opr.ctx_type = WORD_LAST;
2290 }
2291 else
2292 {
2293 token->opr.ctx_type = INSIDE_WORD;
2294 tree_first = create_token_tree (dfa, NULL, NULL, token);
2295 token->opr.ctx_type = INSIDE_NOTWORD;
2296 }
2297 tree_last = create_token_tree (dfa, NULL, NULL, token);
2298 tree = create_tree (dfa, tree_first, tree_last, OP_ALT);
2299 if (BE (tree_first == NULL || tree_last == NULL || tree == NULL, 0))
2300 {
2301 *err = REG_ESPACE;
2302 return NULL;
2303 }
2304 }
2305 else
2306 {
2307 tree = create_token_tree (dfa, NULL, NULL, token);
2308 if (BE (tree == NULL, 0))
2309 {
2310 *err = REG_ESPACE;
2311 return NULL;
2312 }
2313 }
2314 /* We must return here, since ANCHORs can't be followed
2315 by repetition operators.
2316 eg. RE"^*" is invalid or "<ANCHOR(^)><CHAR(*)>",
2317 it must not be "<ANCHOR(^)><REPEAT(*)>". */
2318 fetch_token (token, regexp, syntax);
2319 return tree;
2320 case OP_PERIOD:
2321 tree = create_token_tree (dfa, NULL, NULL, token);
2322 if (BE (tree == NULL, 0))
2323 {
2324 *err = REG_ESPACE;
2325 return NULL;
2326 }
2327 if (dfa->mb_cur_max > 1)
2328 dfa->has_mb_node = 1;
2329 break;
2330 case OP_WORD:
2331 case OP_NOTWORD:
2332 tree = build_charclass_op (dfa, regexp->trans,
2333 (const unsigned char *) "alnum",
2334 (const unsigned char *) "_",
2335 token->type == OP_NOTWORD, err);
2336 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2337 return NULL;
2338 break;
2339 case OP_SPACE:
2340 case OP_NOTSPACE:
2341 tree = build_charclass_op (dfa, regexp->trans,
2342 (const unsigned char *) "space",
2343 (const unsigned char *) "",
2344 token->type == OP_NOTSPACE, err);
2345 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2346 return NULL;
2347 break;
2348 case OP_ALT:
2349 case END_OF_RE:
2350 return NULL;
2351 case BACK_SLASH:
2352 *err = REG_EESCAPE;
2353 return NULL;
2354 default:
2355 /* Must not happen? */
2356#ifdef DEBUG
2357 assert (0);
2358#endif
2359 return NULL;
2360 }
2361 fetch_token (token, regexp, syntax);
2362
2363 while (token->type == OP_DUP_ASTERISK || token->type == OP_DUP_PLUS
2364 || token->type == OP_DUP_QUESTION || token->type == OP_OPEN_DUP_NUM)
2365 {
2366 tree = parse_dup_op (tree, regexp, dfa, token, syntax, err);
2367 if (BE (*err != REG_NOERROR && tree == NULL, 0))
2368 return NULL;
2369 /* In BRE consecutive duplications are not allowed. */
2370 if ((syntax & RE_CONTEXT_INVALID_DUP)
2371 && (token->type == OP_DUP_ASTERISK
2372 || token->type == OP_OPEN_DUP_NUM))
2373 {
2374 *err = REG_BADRPT;
2375 return NULL;
2376 }
2377 }
2378
2379 return tree;
2380}
2381
2382/* This function build the following tree, from regular expression
2383 (<reg_exp>):
2384 SUBEXP
2385 |
2386 <reg_exp>
2387*/
2388
2389static bin_tree_t *
2390parse_sub_exp (re_string_t *regexp, regex_t *preg, re_token_t *token,
2391 reg_syntax_t syntax, int nest, reg_errcode_t *err)
2392{
2393 re_dfa_t *dfa = (re_dfa_t *) preg->buffer;
2394 bin_tree_t *tree;
2395 size_t cur_nsub;
2396 cur_nsub = preg->re_nsub++;
2397
2398 fetch_token (token, regexp, syntax | RE_CARET_ANCHORS_HERE);
2399
2400 /* The subexpression may be a null string. */
2401 if (token->type == OP_CLOSE_SUBEXP)
2402 tree = NULL;
2403 else
2404 {
2405 tree = parse_reg_exp (regexp, preg, token, syntax, nest, err);
2406 if (BE (*err == REG_NOERROR && token->type != OP_CLOSE_SUBEXP, 0))
2407 *err = REG_EPAREN;
2408 if (BE (*err != REG_NOERROR, 0))
2409 return NULL;
2410 }
2411
2412 if (cur_nsub <= '9' - '1')
2413 dfa->completed_bkref_map |= 1 << cur_nsub;
2414
2415 tree = create_tree (dfa, tree, NULL, SUBEXP);
2416 if (BE (tree == NULL, 0))
2417 {
2418 *err = REG_ESPACE;
2419 return NULL;
2420 }
2421 tree->token.opr.idx = cur_nsub;
2422 return tree;
2423}
2424
2425/* This function parse repetition operators like "*", "+", "{1,3}" etc. */
2426
2427static bin_tree_t *
2428parse_dup_op (bin_tree_t *elem, re_string_t *regexp, re_dfa_t *dfa,
2429 re_token_t *token, reg_syntax_t syntax, reg_errcode_t *err)
2430{
2431 bin_tree_t *tree = NULL, *old_tree = NULL;
2432 int i, start, end, start_idx = re_string_cur_idx (regexp);
2433 re_token_t start_token = *token;
2434
2435 if (token->type == OP_OPEN_DUP_NUM)
2436 {
2437 end = 0;
2438 start = fetch_number (regexp, token, syntax);
2439 if (start == -1)
2440 {
2441 if (token->type == CHARACTER && token->opr.c == ',')
2442 start = 0; /* We treat "{,m}" as "{0,m}". */
2443 else
2444 {
2445 *err = REG_BADBR; /* <re>{} is invalid. */
2446 return NULL;
2447 }
2448 }
2449 if (BE (start != -2, 1))
2450 {
2451 /* We treat "{n}" as "{n,n}". */
2452 end = ((token->type == OP_CLOSE_DUP_NUM) ? start
2453 : ((token->type == CHARACTER && token->opr.c == ',')
2454 ? fetch_number (regexp, token, syntax) : -2));
2455 }
2456 if (BE (start == -2 || end == -2, 0))
2457 {
2458 /* Invalid sequence. */
2459 if (BE (!(syntax & RE_INVALID_INTERVAL_ORD), 0))
2460 {
2461 if (token->type == END_OF_RE)
2462 *err = REG_EBRACE;
2463 else
2464 *err = REG_BADBR;
2465
2466 return NULL;
2467 }
2468
2469 /* If the syntax bit is set, rollback. */
2470 re_string_set_index (regexp, start_idx);
2471 *token = start_token;
2472 token->type = CHARACTER;
2473 /* mb_partial and word_char bits should be already initialized by
2474 peek_token. */
2475 return elem;
2476 }
2477
2478 if (BE (end != -1 && start > end, 0))
2479 {
2480 /* First number greater than second. */
2481 *err = REG_BADBR;
2482 return NULL;
2483 }
2484 }
2485 else
2486 {
2487 start = (token->type == OP_DUP_PLUS) ? 1 : 0;
2488 end = (token->type == OP_DUP_QUESTION) ? 1 : -1;
2489 }
2490
2491 fetch_token (token, regexp, syntax);
2492
2493 if (BE (elem == NULL, 0))
2494 return NULL;
2495 if (BE (start == 0 && end == 0, 0))
2496 {
2497 postorder (elem, free_tree, NULL);
2498 return NULL;
2499 }
2500
2501 /* Extract "<re>{n,m}" to "<re><re>...<re><re>{0,<m-n>}". */
2502 if (BE (start > 0, 0))
2503 {
2504 tree = elem;
2505 for (i = 2; i <= start; ++i)
2506 {
2507 elem = duplicate_tree (elem, dfa);
2508 tree = create_tree (dfa, tree, elem, CONCAT);
2509 if (BE (elem == NULL || tree == NULL, 0))
2510 goto parse_dup_op_espace;
2511 }
2512
2513 if (start == end)
2514 return tree;
2515
2516 /* Duplicate ELEM before it is marked optional. */
2517 elem = duplicate_tree (elem, dfa);
2518 old_tree = tree;
2519 }
2520 else
2521 old_tree = NULL;
2522
2523 if (elem->token.type == SUBEXP)
2524 postorder (elem, mark_opt_subexp, (void *) (long) elem->token.opr.idx);
2525
2526 tree = create_tree (dfa, elem, NULL, (end == -1 ? OP_DUP_ASTERISK : OP_ALT));
2527 if (BE (tree == NULL, 0))
2528 goto parse_dup_op_espace;
2529
2530 /* This loop is actually executed only when end != -1,
2531 to rewrite <re>{0,n} as (<re>(<re>...<re>?)?)?... We have
2532 already created the start+1-th copy. */
2533 for (i = start + 2; i <= end; ++i)
2534 {
2535 elem = duplicate_tree (elem, dfa);
2536 tree = create_tree (dfa, tree, elem, CONCAT);
2537 if (BE (elem == NULL || tree == NULL, 0))
2538 goto parse_dup_op_espace;
2539
2540 tree = create_tree (dfa, tree, NULL, OP_ALT);
2541 if (BE (tree == NULL, 0))
2542 goto parse_dup_op_espace;
2543 }
2544
2545 if (old_tree)
2546 tree = create_tree (dfa, old_tree, tree, CONCAT);
2547
2548 return tree;
2549
2550 parse_dup_op_espace:
2551 *err = REG_ESPACE;
2552 return NULL;
2553}
2554
2555/* Size of the names for collating symbol/equivalence_class/character_class.
2556 I'm not sure, but maybe enough. */
2557#define BRACKET_NAME_BUF_SIZE 32
2558
2559#ifndef _LIBC
2560 /* Local function for parse_bracket_exp only used in case of NOT _LIBC.
2561 Build the range expression which starts from START_ELEM, and ends
2562 at END_ELEM. The result are written to MBCSET and SBCSET.
2563 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2564 mbcset->range_ends, is a pointer argument sinse we may
2565 update it. */
2566
2567static reg_errcode_t
2568internal_function
2569# ifdef RE_ENABLE_I18N
2570build_range_exp (bitset_t sbcset, re_charset_t *mbcset, int *range_alloc,
2571 bracket_elem_t *start_elem, bracket_elem_t *end_elem)
2572# else /* not RE_ENABLE_I18N */
2573build_range_exp (bitset_t sbcset, bracket_elem_t *start_elem,
2574 bracket_elem_t *end_elem)
2575# endif /* not RE_ENABLE_I18N */
2576{
2577 unsigned int start_ch, end_ch;
2578 /* Equivalence Classes and Character Classes can't be a range start/end. */
2579 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2580 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2581 0))
2582 return REG_ERANGE;
2583
2584 /* We can handle no multi character collating elements without libc
2585 support. */
2586 if (BE ((start_elem->type == COLL_SYM
2587 && strlen ((char *) start_elem->opr.name) > 1)
2588 || (end_elem->type == COLL_SYM
2589 && strlen ((char *) end_elem->opr.name) > 1), 0))
2590 return REG_ECOLLATE;
2591
2592# ifdef RE_ENABLE_I18N
2593 {
2594 wchar_t wc;
2595 wint_t start_wc;
2596 wint_t end_wc;
2597 wchar_t cmp_buf[6] = {L'\0', L'\0', L'\0', L'\0', L'\0', L'\0'};
2598
2599 start_ch = ((start_elem->type == SB_CHAR) ? start_elem->opr.ch
2600 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2601 : 0));
2602 end_ch = ((end_elem->type == SB_CHAR) ? end_elem->opr.ch
2603 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2604 : 0));
2605 start_wc = ((start_elem->type == SB_CHAR || start_elem->type == COLL_SYM)
2606 ? __btowc (start_ch) : start_elem->opr.wch);
2607 end_wc = ((end_elem->type == SB_CHAR || end_elem->type == COLL_SYM)
2608 ? __btowc (end_ch) : end_elem->opr.wch);
2609 if (start_wc == WEOF || end_wc == WEOF)
2610 return REG_ECOLLATE;
2611 cmp_buf[0] = start_wc;
2612 cmp_buf[4] = end_wc;
2613 if (wcscoll (cmp_buf, cmp_buf + 4) > 0)
2614 return REG_ERANGE;
2615
2616 /* Got valid collation sequence values, add them as a new entry.
2617 However, for !_LIBC we have no collation elements: if the
2618 character set is single byte, the single byte character set
2619 that we build below suffices. parse_bracket_exp passes
2620 no MBCSET if dfa->mb_cur_max == 1. */
2621 if (mbcset)
2622 {
2623 /* Check the space of the arrays. */
2624 if (BE (*range_alloc == mbcset->nranges, 0))
2625 {
2626 /* There is not enough space, need realloc. */
2627 wchar_t *new_array_start, *new_array_end;
2628 int new_nranges;
2629
2630 /* +1 in case of mbcset->nranges is 0. */
2631 new_nranges = 2 * mbcset->nranges + 1;
2632 /* Use realloc since mbcset->range_starts and mbcset->range_ends
2633 are NULL if *range_alloc == 0. */
2634 new_array_start = re_realloc (mbcset->range_starts, wchar_t,
2635 new_nranges);
2636 new_array_end = re_realloc (mbcset->range_ends, wchar_t,
2637 new_nranges);
2638
2639 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2640 return REG_ESPACE;
2641
2642 mbcset->range_starts = new_array_start;
2643 mbcset->range_ends = new_array_end;
2644 *range_alloc = new_nranges;
2645 }
2646
2647 mbcset->range_starts[mbcset->nranges] = start_wc;
2648 mbcset->range_ends[mbcset->nranges++] = end_wc;
2649 }
2650
2651 /* Build the table for single byte characters. */
2652 for (wc = 0; wc < SBC_MAX; ++wc)
2653 {
2654 cmp_buf[2] = wc;
2655 if (wcscoll (cmp_buf, cmp_buf + 2) <= 0
2656 && wcscoll (cmp_buf + 2, cmp_buf + 4) <= 0)
2657 bitset_set (sbcset, wc);
2658 }
2659 }
2660# else /* not RE_ENABLE_I18N */
2661 {
2662 unsigned int ch;
2663 start_ch = ((start_elem->type == SB_CHAR ) ? start_elem->opr.ch
2664 : ((start_elem->type == COLL_SYM) ? start_elem->opr.name[0]
2665 : 0));
2666 end_ch = ((end_elem->type == SB_CHAR ) ? end_elem->opr.ch
2667 : ((end_elem->type == COLL_SYM) ? end_elem->opr.name[0]
2668 : 0));
2669 if (start_ch > end_ch)
2670 return REG_ERANGE;
2671 /* Build the table for single byte characters. */
2672 for (ch = 0; ch < SBC_MAX; ++ch)
2673 if (start_ch <= ch && ch <= end_ch)
2674 bitset_set (sbcset, ch);
2675 }
2676# endif /* not RE_ENABLE_I18N */
2677 return REG_NOERROR;
2678}
2679#endif /* not _LIBC */
2680
2681#ifndef _LIBC
2682/* Helper function for parse_bracket_exp only used in case of NOT _LIBC..
2683 Build the collating element which is represented by NAME.
2684 The result are written to MBCSET and SBCSET.
2685 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2686 pointer argument since we may update it. */
2687
2688static reg_errcode_t
2689internal_function
2690# ifdef RE_ENABLE_I18N
2691build_collating_symbol (bitset_t sbcset, re_charset_t *mbcset,
2692 int *coll_sym_alloc, const unsigned char *name)
2693# else /* not RE_ENABLE_I18N */
2694build_collating_symbol (bitset_t sbcset, const unsigned char *name)
2695# endif /* not RE_ENABLE_I18N */
2696{
2697 size_t name_len = strlen ((const char *) name);
2698 if (BE (name_len != 1, 0))
2699 return REG_ECOLLATE;
2700 else
2701 {
2702 bitset_set (sbcset, name[0]);
2703 return REG_NOERROR;
2704 }
2705}
2706#endif /* not _LIBC */
2707
2708/* This function parse bracket expression like "[abc]", "[a-c]",
2709 "[[.a-a.]]" etc. */
2710
2711static bin_tree_t *
2712parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
2713 reg_syntax_t syntax, reg_errcode_t *err)
2714{
2715#ifdef _LIBC
2716 const unsigned char *collseqmb;
2717 const char *collseqwc;
2718 uint32_t nrules;
2719 int32_t table_size;
2720 const int32_t *symb_table;
2721 const unsigned char *extra;
2722
2723 /* Local function for parse_bracket_exp used in _LIBC environement.
2724 Seek the collating symbol entry correspondings to NAME.
2725 Return the index of the symbol in the SYMB_TABLE. */
2726
2727 auto inline int32_t
2728 __attribute ((always_inline))
2729 seek_collating_symbol_entry (name, name_len)
2730 const unsigned char *name;
2731 size_t name_len;
2732 {
2733 int32_t hash = elem_hash ((const char *) name, name_len);
2734 int32_t elem = hash % table_size;
2735 if (symb_table[2 * elem] != 0)
2736 {
2737 int32_t second = hash % (table_size - 2) + 1;
2738
2739 do
2740 {
2741 /* First compare the hashing value. */
2742 if (symb_table[2 * elem] == hash
2743 /* Compare the length of the name. */
2744 && name_len == extra[symb_table[2 * elem + 1]]
2745 /* Compare the name. */
2746 && memcmp (name, &extra[symb_table[2 * elem + 1] + 1],
2747 name_len) == 0)
2748 {
2749 /* Yep, this is the entry. */
2750 break;
2751 }
2752
2753 /* Next entry. */
2754 elem += second;
2755 }
2756 while (symb_table[2 * elem] != 0);
2757 }
2758 return elem;
2759 }
2760
2761 /* Local function for parse_bracket_exp used in _LIBC environement.
2762 Look up the collation sequence value of BR_ELEM.
2763 Return the value if succeeded, UINT_MAX otherwise. */
2764
2765 auto inline unsigned int
2766 __attribute ((always_inline))
2767 lookup_collation_sequence_value (br_elem)
2768 bracket_elem_t *br_elem;
2769 {
2770 if (br_elem->type == SB_CHAR)
2771 {
2772 /*
2773 if (MB_CUR_MAX == 1)
2774 */
2775 if (nrules == 0)
2776 return collseqmb[br_elem->opr.ch];
2777 else
2778 {
2779 wint_t wc = __btowc (br_elem->opr.ch);
2780 return __collseq_table_lookup (collseqwc, wc);
2781 }
2782 }
2783 else if (br_elem->type == MB_CHAR)
2784 {
2785 return __collseq_table_lookup (collseqwc, br_elem->opr.wch);
2786 }
2787 else if (br_elem->type == COLL_SYM)
2788 {
2789 size_t sym_name_len = strlen ((char *) br_elem->opr.name);
2790 if (nrules != 0)
2791 {
2792 int32_t elem, idx;
2793 elem = seek_collating_symbol_entry (br_elem->opr.name,
2794 sym_name_len);
2795 if (symb_table[2 * elem] != 0)
2796 {
2797 /* We found the entry. */
2798 idx = symb_table[2 * elem + 1];
2799 /* Skip the name of collating element name. */
2800 idx += 1 + extra[idx];
2801 /* Skip the byte sequence of the collating element. */
2802 idx += 1 + extra[idx];
2803 /* Adjust for the alignment. */
2804 idx = (idx + 3) & ~3;
2805 /* Skip the multibyte collation sequence value. */
2806 idx += sizeof (unsigned int);
2807 /* Skip the wide char sequence of the collating element. */
2808 idx += sizeof (unsigned int) *
2809 (1 + *(unsigned int *) (extra + idx));
2810 /* Return the collation sequence value. */
2811 return *(unsigned int *) (extra + idx);
2812 }
2813 else if (symb_table[2 * elem] == 0 && sym_name_len == 1)
2814 {
2815 /* No valid character. Match it as a single byte
2816 character. */
2817 return collseqmb[br_elem->opr.name[0]];
2818 }
2819 }
2820 else if (sym_name_len == 1)
2821 return collseqmb[br_elem->opr.name[0]];
2822 }
2823 return UINT_MAX;
2824 }
2825
2826 /* Local function for parse_bracket_exp used in _LIBC environement.
2827 Build the range expression which starts from START_ELEM, and ends
2828 at END_ELEM. The result are written to MBCSET and SBCSET.
2829 RANGE_ALLOC is the allocated size of mbcset->range_starts, and
2830 mbcset->range_ends, is a pointer argument sinse we may
2831 update it. */
2832
2833 auto inline reg_errcode_t
2834 __attribute ((always_inline))
2835 build_range_exp (sbcset, mbcset, range_alloc, start_elem, end_elem)
2836 re_charset_t *mbcset;
2837 int *range_alloc;
2838 bitset_t sbcset;
2839 bracket_elem_t *start_elem, *end_elem;
2840 {
2841 unsigned int ch;
2842 uint32_t start_collseq;
2843 uint32_t end_collseq;
2844
2845 /* Equivalence Classes and Character Classes can't be a range
2846 start/end. */
2847 if (BE (start_elem->type == EQUIV_CLASS || start_elem->type == CHAR_CLASS
2848 || end_elem->type == EQUIV_CLASS || end_elem->type == CHAR_CLASS,
2849 0))
2850 return REG_ERANGE;
2851
2852 start_collseq = lookup_collation_sequence_value (start_elem);
2853 end_collseq = lookup_collation_sequence_value (end_elem);
2854 /* Check start/end collation sequence values. */
2855 if (BE (start_collseq == UINT_MAX || end_collseq == UINT_MAX, 0))
2856 return REG_ECOLLATE;
2857 if (BE ((syntax & RE_NO_EMPTY_RANGES) && start_collseq > end_collseq, 0))
2858 return REG_ERANGE;
2859
2860 /* Got valid collation sequence values, add them as a new entry.
2861 However, if we have no collation elements, and the character set
2862 is single byte, the single byte character set that we
2863 build below suffices. */
2864 if (nrules > 0 || dfa->mb_cur_max > 1)
2865 {
2866 /* Check the space of the arrays. */
2867 if (BE (*range_alloc == mbcset->nranges, 0))
2868 {
2869 /* There is not enough space, need realloc. */
2870 uint32_t *new_array_start;
2871 uint32_t *new_array_end;
2872 int new_nranges;
2873
2874 /* +1 in case of mbcset->nranges is 0. */
2875 new_nranges = 2 * mbcset->nranges + 1;
2876 new_array_start = re_realloc (mbcset->range_starts, uint32_t,
2877 new_nranges);
2878 new_array_end = re_realloc (mbcset->range_ends, uint32_t,
2879 new_nranges);
2880
2881 if (BE (new_array_start == NULL || new_array_end == NULL, 0))
2882 return REG_ESPACE;
2883
2884 mbcset->range_starts = new_array_start;
2885 mbcset->range_ends = new_array_end;
2886 *range_alloc = new_nranges;
2887 }
2888
2889 mbcset->range_starts[mbcset->nranges] = start_collseq;
2890 mbcset->range_ends[mbcset->nranges++] = end_collseq;
2891 }
2892
2893 /* Build the table for single byte characters. */
2894 for (ch = 0; ch < SBC_MAX; ch++)
2895 {
2896 uint32_t ch_collseq;
2897 /*
2898 if (MB_CUR_MAX == 1)
2899 */
2900 if (nrules == 0)
2901 ch_collseq = collseqmb[ch];
2902 else
2903 ch_collseq = __collseq_table_lookup (collseqwc, __btowc (ch));
2904 if (start_collseq <= ch_collseq && ch_collseq <= end_collseq)
2905 bitset_set (sbcset, ch);
2906 }
2907 return REG_NOERROR;
2908 }
2909
2910 /* Local function for parse_bracket_exp used in _LIBC environement.
2911 Build the collating element which is represented by NAME.
2912 The result are written to MBCSET and SBCSET.
2913 COLL_SYM_ALLOC is the allocated size of mbcset->coll_sym, is a
2914 pointer argument sinse we may update it. */
2915
2916 auto inline reg_errcode_t
2917 __attribute ((always_inline))
2918 build_collating_symbol (sbcset, mbcset, coll_sym_alloc, name)
2919 re_charset_t *mbcset;
2920 int *coll_sym_alloc;
2921 bitset_t sbcset;
2922 const unsigned char *name;
2923 {
2924 int32_t elem, idx;
2925 size_t name_len = strlen ((const char *) name);
2926 if (nrules != 0)
2927 {
2928 elem = seek_collating_symbol_entry (name, name_len);
2929 if (symb_table[2 * elem] != 0)
2930 {
2931 /* We found the entry. */
2932 idx = symb_table[2 * elem + 1];
2933 /* Skip the name of collating element name. */
2934 idx += 1 + extra[idx];
2935 }
2936 else if (symb_table[2 * elem] == 0 && name_len == 1)
2937 {
2938 /* No valid character, treat it as a normal
2939 character. */
2940 bitset_set (sbcset, name[0]);
2941 return REG_NOERROR;
2942 }
2943 else
2944 return REG_ECOLLATE;
2945
2946 /* Got valid collation sequence, add it as a new entry. */
2947 /* Check the space of the arrays. */
2948 if (BE (*coll_sym_alloc == mbcset->ncoll_syms, 0))
2949 {
2950 /* Not enough, realloc it. */
2951 /* +1 in case of mbcset->ncoll_syms is 0. */
2952 int new_coll_sym_alloc = 2 * mbcset->ncoll_syms + 1;
2953 /* Use realloc since mbcset->coll_syms is NULL
2954 if *alloc == 0. */
2955 int32_t *new_coll_syms = re_realloc (mbcset->coll_syms, int32_t,
2956 new_coll_sym_alloc);
2957 if (BE (new_coll_syms == NULL, 0))
2958 return REG_ESPACE;
2959 mbcset->coll_syms = new_coll_syms;
2960 *coll_sym_alloc = new_coll_sym_alloc;
2961 }
2962 mbcset->coll_syms[mbcset->ncoll_syms++] = idx;
2963 return REG_NOERROR;
2964 }
2965 else
2966 {
2967 if (BE (name_len != 1, 0))
2968 return REG_ECOLLATE;
2969 else
2970 {
2971 bitset_set (sbcset, name[0]);
2972 return REG_NOERROR;
2973 }
2974 }
2975 }
2976#endif
2977
2978 re_token_t br_token;
2979 re_bitset_ptr_t sbcset;
2980#ifdef RE_ENABLE_I18N
2981 re_charset_t *mbcset;
2982 int coll_sym_alloc = 0, range_alloc = 0, mbchar_alloc = 0;
2983 int equiv_class_alloc = 0, char_class_alloc = 0;
2984#endif /* not RE_ENABLE_I18N */
2985 int non_match = 0;
2986 bin_tree_t *work_tree;
2987 int token_len;
2988 int first_round = 1;
2989#ifdef _LIBC
2990 collseqmb = (const unsigned char *)
2991 _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQMB);
2992 nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
2993 if (nrules)
2994 {
2995 /*
2996 if (MB_CUR_MAX > 1)
2997 */
2998 collseqwc = _NL_CURRENT (LC_COLLATE, _NL_COLLATE_COLLSEQWC);
2999 table_size = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_SYMB_HASH_SIZEMB);
3000 symb_table = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3001 _NL_COLLATE_SYMB_TABLEMB);
3002 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3003 _NL_COLLATE_SYMB_EXTRAMB);
3004 }
3005#endif
3006 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3007#ifdef RE_ENABLE_I18N
3008 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3009#endif /* RE_ENABLE_I18N */
3010#ifdef RE_ENABLE_I18N
3011 if (BE (sbcset == NULL || mbcset == NULL, 0))
3012#else
3013 if (BE (sbcset == NULL, 0))
3014#endif /* RE_ENABLE_I18N */
3015 {
3016 *err = REG_ESPACE;
3017 return NULL;
3018 }
3019
3020 token_len = peek_token_bracket (token, regexp, syntax);
3021 if (BE (token->type == END_OF_RE, 0))
3022 {
3023 *err = REG_BADPAT;
3024 goto parse_bracket_exp_free_return;
3025 }
3026 if (token->type == OP_NON_MATCH_LIST)
3027 {
3028#ifdef RE_ENABLE_I18N
3029 mbcset->non_match = 1;
3030#endif /* not RE_ENABLE_I18N */
3031 non_match = 1;
3032 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3033 bitset_set (sbcset, '\0');
3034 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3035 token_len = peek_token_bracket (token, regexp, syntax);
3036 if (BE (token->type == END_OF_RE, 0))
3037 {
3038 *err = REG_BADPAT;
3039 goto parse_bracket_exp_free_return;
3040 }
3041 }
3042
3043 /* We treat the first ']' as a normal character. */
3044 if (token->type == OP_CLOSE_BRACKET)
3045 token->type = CHARACTER;
3046
3047 while (1)
3048 {
3049 bracket_elem_t start_elem, end_elem;
3050 unsigned char start_name_buf[BRACKET_NAME_BUF_SIZE];
3051 unsigned char end_name_buf[BRACKET_NAME_BUF_SIZE];
3052 reg_errcode_t ret;
3053 int token_len2 = 0, is_range_exp = 0;
3054 re_token_t token2;
3055
3056 start_elem.opr.name = start_name_buf;
3057 ret = parse_bracket_element (&start_elem, regexp, token, token_len, dfa,
3058 syntax, first_round);
3059 if (BE (ret != REG_NOERROR, 0))
3060 {
3061 *err = ret;
3062 goto parse_bracket_exp_free_return;
3063 }
3064 first_round = 0;
3065
3066 /* Get information about the next token. We need it in any case. */
3067 token_len = peek_token_bracket (token, regexp, syntax);
3068
3069 /* Do not check for ranges if we know they are not allowed. */
3070 if (start_elem.type != CHAR_CLASS && start_elem.type != EQUIV_CLASS)
3071 {
3072 if (BE (token->type == END_OF_RE, 0))
3073 {
3074 *err = REG_EBRACK;
3075 goto parse_bracket_exp_free_return;
3076 }
3077 if (token->type == OP_CHARSET_RANGE)
3078 {
3079 re_string_skip_bytes (regexp, token_len); /* Skip '-'. */
3080 token_len2 = peek_token_bracket (&token2, regexp, syntax);
3081 if (BE (token2.type == END_OF_RE, 0))
3082 {
3083 *err = REG_EBRACK;
3084 goto parse_bracket_exp_free_return;
3085 }
3086 if (token2.type == OP_CLOSE_BRACKET)
3087 {
3088 /* We treat the last '-' as a normal character. */
3089 re_string_skip_bytes (regexp, -token_len);
3090 token->type = CHARACTER;
3091 }
3092 else
3093 is_range_exp = 1;
3094 }
3095 }
3096
3097 if (is_range_exp == 1)
3098 {
3099 end_elem.opr.name = end_name_buf;
3100 ret = parse_bracket_element (&end_elem, regexp, &token2, token_len2,
3101 dfa, syntax, 1);
3102 if (BE (ret != REG_NOERROR, 0))
3103 {
3104 *err = ret;
3105 goto parse_bracket_exp_free_return;
3106 }
3107
3108 token_len = peek_token_bracket (token, regexp, syntax);
3109
3110#ifdef _LIBC
3111 *err = build_range_exp (sbcset, mbcset, &range_alloc,
3112 &start_elem, &end_elem);
3113#else
3114# ifdef RE_ENABLE_I18N
3115 *err = build_range_exp (sbcset,
3116 dfa->mb_cur_max > 1 ? mbcset : NULL,
3117 &range_alloc, &start_elem, &end_elem);
3118# else
3119 *err = build_range_exp (sbcset, &start_elem, &end_elem);
3120# endif
3121#endif /* RE_ENABLE_I18N */
3122 if (BE (*err != REG_NOERROR, 0))
3123 goto parse_bracket_exp_free_return;
3124 }
3125 else
3126 {
3127 switch (start_elem.type)
3128 {
3129 case SB_CHAR:
3130 bitset_set (sbcset, start_elem.opr.ch);
3131 break;
3132#ifdef RE_ENABLE_I18N
3133 case MB_CHAR:
3134 /* Check whether the array has enough space. */
3135 if (BE (mbchar_alloc == mbcset->nmbchars, 0))
3136 {
3137 wchar_t *new_mbchars;
3138 /* Not enough, realloc it. */
3139 /* +1 in case of mbcset->nmbchars is 0. */
3140 mbchar_alloc = 2 * mbcset->nmbchars + 1;
3141 /* Use realloc since array is NULL if *alloc == 0. */
3142 new_mbchars = re_realloc (mbcset->mbchars, wchar_t,
3143 mbchar_alloc);
3144 if (BE (new_mbchars == NULL, 0))
3145 goto parse_bracket_exp_espace;
3146 mbcset->mbchars = new_mbchars;
3147 }
3148 mbcset->mbchars[mbcset->nmbchars++] = start_elem.opr.wch;
3149 break;
3150#endif /* RE_ENABLE_I18N */
3151 case EQUIV_CLASS:
3152 *err = build_equiv_class (sbcset,
3153#ifdef RE_ENABLE_I18N
3154 mbcset, &equiv_class_alloc,
3155#endif /* RE_ENABLE_I18N */
3156 start_elem.opr.name);
3157 if (BE (*err != REG_NOERROR, 0))
3158 goto parse_bracket_exp_free_return;
3159 break;
3160 case COLL_SYM:
3161 *err = build_collating_symbol (sbcset,
3162#ifdef RE_ENABLE_I18N
3163 mbcset, &coll_sym_alloc,
3164#endif /* RE_ENABLE_I18N */
3165 start_elem.opr.name);
3166 if (BE (*err != REG_NOERROR, 0))
3167 goto parse_bracket_exp_free_return;
3168 break;
3169 case CHAR_CLASS:
3170 *err = build_charclass (regexp->trans, sbcset,
3171#ifdef RE_ENABLE_I18N
3172 mbcset, &char_class_alloc,
3173#endif /* RE_ENABLE_I18N */
3174 start_elem.opr.name, syntax);
3175 if (BE (*err != REG_NOERROR, 0))
3176 goto parse_bracket_exp_free_return;
3177 break;
3178 default:
3179 assert (0);
3180 break;
3181 }
3182 }
3183 if (BE (token->type == END_OF_RE, 0))
3184 {
3185 *err = REG_EBRACK;
3186 goto parse_bracket_exp_free_return;
3187 }
3188 if (token->type == OP_CLOSE_BRACKET)
3189 break;
3190 }
3191
3192 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3193
3194 /* If it is non-matching list. */
3195 if (non_match)
3196 bitset_not (sbcset);
3197
3198#ifdef RE_ENABLE_I18N
3199 /* Ensure only single byte characters are set. */
3200 if (dfa->mb_cur_max > 1)
3201 bitset_mask (sbcset, dfa->sb_char);
3202
3203 if (mbcset->nmbchars || mbcset->ncoll_syms || mbcset->nequiv_classes
3204 || mbcset->nranges || (dfa->mb_cur_max > 1 && (mbcset->nchar_classes
3205 || mbcset->non_match)))
3206 {
3207 bin_tree_t *mbc_tree;
3208 int sbc_idx;
3209 /* Build a tree for complex bracket. */
3210 dfa->has_mb_node = 1;
3211 br_token.type = COMPLEX_BRACKET;
3212 br_token.opr.mbcset = mbcset;
3213 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3214 if (BE (mbc_tree == NULL, 0))
3215 goto parse_bracket_exp_espace;
3216 for (sbc_idx = 0; sbc_idx < BITSET_WORDS; ++sbc_idx)
3217 if (sbcset[sbc_idx])
3218 break;
3219 /* If there are no bits set in sbcset, there is no point
3220 of having both SIMPLE_BRACKET and COMPLEX_BRACKET. */
3221 if (sbc_idx < BITSET_WORDS)
3222 {
3223 /* Build a tree for simple bracket. */
3224 br_token.type = SIMPLE_BRACKET;
3225 br_token.opr.sbcset = sbcset;
3226 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3227 if (BE (work_tree == NULL, 0))
3228 goto parse_bracket_exp_espace;
3229
3230 /* Then join them by ALT node. */
3231 work_tree = create_tree (dfa, work_tree, mbc_tree, OP_ALT);
3232 if (BE (work_tree == NULL, 0))
3233 goto parse_bracket_exp_espace;
3234 }
3235 else
3236 {
3237 re_free (sbcset);
3238 work_tree = mbc_tree;
3239 }
3240 }
3241 else
3242#endif /* not RE_ENABLE_I18N */
3243 {
3244#ifdef RE_ENABLE_I18N
3245 free_charset (mbcset);
3246#endif
3247 /* Build a tree for simple bracket. */
3248 br_token.type = SIMPLE_BRACKET;
3249 br_token.opr.sbcset = sbcset;
3250 work_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3251 if (BE (work_tree == NULL, 0))
3252 goto parse_bracket_exp_espace;
3253 }
3254 return work_tree;
3255
3256 parse_bracket_exp_espace:
3257 *err = REG_ESPACE;
3258 parse_bracket_exp_free_return:
3259 re_free (sbcset);
3260#ifdef RE_ENABLE_I18N
3261 free_charset (mbcset);
3262#endif /* RE_ENABLE_I18N */
3263 return NULL;
3264}
3265
3266/* Parse an element in the bracket expression. */
3267
3268static reg_errcode_t
3269parse_bracket_element (bracket_elem_t *elem, re_string_t *regexp,
3270 re_token_t *token, int token_len, re_dfa_t *dfa,
3271 reg_syntax_t syntax, int accept_hyphen)
3272{
3273#ifdef RE_ENABLE_I18N
3274 int cur_char_size;
3275 cur_char_size = re_string_char_size_at (regexp, re_string_cur_idx (regexp));
3276 if (cur_char_size > 1)
3277 {
3278 elem->type = MB_CHAR;
3279 elem->opr.wch = re_string_wchar_at (regexp, re_string_cur_idx (regexp));
3280 re_string_skip_bytes (regexp, cur_char_size);
3281 return REG_NOERROR;
3282 }
3283#endif /* RE_ENABLE_I18N */
3284 re_string_skip_bytes (regexp, token_len); /* Skip a token. */
3285 if (token->type == OP_OPEN_COLL_ELEM || token->type == OP_OPEN_CHAR_CLASS
3286 || token->type == OP_OPEN_EQUIV_CLASS)
3287 return parse_bracket_symbol (elem, regexp, token);
3288 if (BE (token->type == OP_CHARSET_RANGE, 0) && !accept_hyphen)
3289 {
3290 /* A '-' must only appear as anything but a range indicator before
3291 the closing bracket. Everything else is an error. */
3292 re_token_t token2;
3293 (void) peek_token_bracket (&token2, regexp, syntax);
3294 if (token2.type != OP_CLOSE_BRACKET)
3295 /* The actual error value is not standardized since this whole
3296 case is undefined. But ERANGE makes good sense. */
3297 return REG_ERANGE;
3298 }
3299 elem->type = SB_CHAR;
3300 elem->opr.ch = token->opr.c;
3301 return REG_NOERROR;
3302}
3303
3304/* Parse a bracket symbol in the bracket expression. Bracket symbols are
3305 such as [:<character_class>:], [.<collating_element>.], and
3306 [=<equivalent_class>=]. */
3307
3308static reg_errcode_t
3309parse_bracket_symbol (bracket_elem_t *elem, re_string_t *regexp,
3310 re_token_t *token)
3311{
3312 unsigned char ch, delim = token->opr.c;
3313 int i = 0;
3314 if (re_string_eoi(regexp))
3315 return REG_EBRACK;
3316 for (;; ++i)
3317 {
3318 if (i >= BRACKET_NAME_BUF_SIZE)
3319 return REG_EBRACK;
3320 if (token->type == OP_OPEN_CHAR_CLASS)
3321 ch = re_string_fetch_byte_case (regexp);
3322 else
3323 ch = re_string_fetch_byte (regexp);
3324 if (re_string_eoi(regexp))
3325 return REG_EBRACK;
3326 if (ch == delim && re_string_peek_byte (regexp, 0) == ']')
3327 break;
3328 elem->opr.name[i] = ch;
3329 }
3330 re_string_skip_bytes (regexp, 1);
3331 elem->opr.name[i] = '\0';
3332 switch (token->type)
3333 {
3334 case OP_OPEN_COLL_ELEM:
3335 elem->type = COLL_SYM;
3336 break;
3337 case OP_OPEN_EQUIV_CLASS:
3338 elem->type = EQUIV_CLASS;
3339 break;
3340 case OP_OPEN_CHAR_CLASS:
3341 elem->type = CHAR_CLASS;
3342 break;
3343 default:
3344 break;
3345 }
3346 return REG_NOERROR;
3347}
3348
3349 /* Helper function for parse_bracket_exp.
3350 Build the equivalence class which is represented by NAME.
3351 The result are written to MBCSET and SBCSET.
3352 EQUIV_CLASS_ALLOC is the allocated size of mbcset->equiv_classes,
3353 is a pointer argument sinse we may update it. */
3354
3355static reg_errcode_t
3356#ifdef RE_ENABLE_I18N
3357build_equiv_class (bitset_t sbcset, re_charset_t *mbcset,
3358 int *equiv_class_alloc, const unsigned char *name)
3359#else /* not RE_ENABLE_I18N */
3360build_equiv_class (bitset_t sbcset, const unsigned char *name)
3361#endif /* not RE_ENABLE_I18N */
3362{
3363#ifdef _LIBC
3364 uint32_t nrules = _NL_CURRENT_WORD (LC_COLLATE, _NL_COLLATE_NRULES);
3365 if (nrules != 0)
3366 {
3367 const int32_t *table, *indirect;
3368 const unsigned char *weights, *extra, *cp;
3369 unsigned char char_buf[2];
3370 int32_t idx1, idx2;
3371 unsigned int ch;
3372 size_t len;
3373 /* This #include defines a local function! */
3374# include <locale/weight.h>
3375 /* Calculate the index for equivalence class. */
3376 cp = name;
3377 table = (const int32_t *) _NL_CURRENT (LC_COLLATE, _NL_COLLATE_TABLEMB);
3378 weights = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3379 _NL_COLLATE_WEIGHTMB);
3380 extra = (const unsigned char *) _NL_CURRENT (LC_COLLATE,
3381 _NL_COLLATE_EXTRAMB);
3382 indirect = (const int32_t *) _NL_CURRENT (LC_COLLATE,
3383 _NL_COLLATE_INDIRECTMB);
3384 idx1 = findidx (&cp);
3385 if (BE (idx1 == 0 || cp < name + strlen ((const char *) name), 0))
3386 /* This isn't a valid character. */
3387 return REG_ECOLLATE;
3388
3389 /* Build single byte matcing table for this equivalence class. */
3390 char_buf[1] = (unsigned char) '\0';
3391 len = weights[idx1];
3392 for (ch = 0; ch < SBC_MAX; ++ch)
3393 {
3394 char_buf[0] = ch;
3395 cp = char_buf;
3396 idx2 = findidx (&cp);
3397/*
3398 idx2 = table[ch];
3399*/
3400 if (idx2 == 0)
3401 /* This isn't a valid character. */
3402 continue;
3403 if (len == weights[idx2])
3404 {
3405 int cnt = 0;
3406 while (cnt <= len &&
3407 weights[idx1 + 1 + cnt] == weights[idx2 + 1 + cnt])
3408 ++cnt;
3409
3410 if (cnt > len)
3411 bitset_set (sbcset, ch);
3412 }
3413 }
3414 /* Check whether the array has enough space. */
3415 if (BE (*equiv_class_alloc == mbcset->nequiv_classes, 0))
3416 {
3417 /* Not enough, realloc it. */
3418 /* +1 in case of mbcset->nequiv_classes is 0. */
3419 int new_equiv_class_alloc = 2 * mbcset->nequiv_classes + 1;
3420 /* Use realloc since the array is NULL if *alloc == 0. */
3421 int32_t *new_equiv_classes = re_realloc (mbcset->equiv_classes,
3422 int32_t,
3423 new_equiv_class_alloc);
3424 if (BE (new_equiv_classes == NULL, 0))
3425 return REG_ESPACE;
3426 mbcset->equiv_classes = new_equiv_classes;
3427 *equiv_class_alloc = new_equiv_class_alloc;
3428 }
3429 mbcset->equiv_classes[mbcset->nequiv_classes++] = idx1;
3430 }
3431 else
3432#endif /* _LIBC */
3433 {
3434 if (BE (strlen ((const char *) name) != 1, 0))
3435 return REG_ECOLLATE;
3436 bitset_set (sbcset, *name);
3437 }
3438 return REG_NOERROR;
3439}
3440
3441 /* Helper function for parse_bracket_exp.
3442 Build the character class which is represented by NAME.
3443 The result are written to MBCSET and SBCSET.
3444 CHAR_CLASS_ALLOC is the allocated size of mbcset->char_classes,
3445 is a pointer argument sinse we may update it. */
3446
3447static reg_errcode_t
3448#ifdef RE_ENABLE_I18N
3449build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3450 re_charset_t *mbcset, int *char_class_alloc,
3451 const unsigned char *class_name, reg_syntax_t syntax)
3452#else /* not RE_ENABLE_I18N */
3453build_charclass (RE_TRANSLATE_TYPE trans, bitset_t sbcset,
3454 const unsigned char *class_name, reg_syntax_t syntax)
3455#endif /* not RE_ENABLE_I18N */
3456{
3457 int i;
3458 const char *name = (const char *) class_name;
3459
3460 /* In case of REG_ICASE "upper" and "lower" match the both of
3461 upper and lower cases. */
3462 if ((syntax & RE_ICASE)
3463 && (strcmp (name, "upper") == 0 || strcmp (name, "lower") == 0))
3464 name = "alpha";
3465
3466#ifdef RE_ENABLE_I18N
3467 /* Check the space of the arrays. */
3468 if (BE (*char_class_alloc == mbcset->nchar_classes, 0))
3469 {
3470 /* Not enough, realloc it. */
3471 /* +1 in case of mbcset->nchar_classes is 0. */
3472 int new_char_class_alloc = 2 * mbcset->nchar_classes + 1;
3473 /* Use realloc since array is NULL if *alloc == 0. */
3474 wctype_t *new_char_classes = re_realloc (mbcset->char_classes, wctype_t,
3475 new_char_class_alloc);
3476 if (BE (new_char_classes == NULL, 0))
3477 return REG_ESPACE;
3478 mbcset->char_classes = new_char_classes;
3479 *char_class_alloc = new_char_class_alloc;
3480 }
3481 mbcset->char_classes[mbcset->nchar_classes++] = __wctype (name);
3482#endif /* RE_ENABLE_I18N */
3483
3484#define BUILD_CHARCLASS_LOOP(ctype_func) \
3485 do { \
3486 if (BE (trans != NULL, 0)) \
3487 { \
3488 for (i = 0; i < SBC_MAX; ++i) \
3489 if (ctype_func (i)) \
3490 bitset_set (sbcset, trans[i]); \
3491 } \
3492 else \
3493 { \
3494 for (i = 0; i < SBC_MAX; ++i) \
3495 if (ctype_func (i)) \
3496 bitset_set (sbcset, i); \
3497 } \
3498 } while (0)
3499
3500 if (strcmp (name, "alnum") == 0)
3501 BUILD_CHARCLASS_LOOP (isalnum);
3502 else if (strcmp (name, "cntrl") == 0)
3503 BUILD_CHARCLASS_LOOP (iscntrl);
3504 else if (strcmp (name, "lower") == 0)
3505 BUILD_CHARCLASS_LOOP (islower);
3506 else if (strcmp (name, "space") == 0)
3507 BUILD_CHARCLASS_LOOP (isspace);
3508 else if (strcmp (name, "alpha") == 0)
3509 BUILD_CHARCLASS_LOOP (isalpha);
3510 else if (strcmp (name, "digit") == 0)
3511 BUILD_CHARCLASS_LOOP (isdigit);
3512 else if (strcmp (name, "print") == 0)
3513 BUILD_CHARCLASS_LOOP (isprint);
3514 else if (strcmp (name, "upper") == 0)
3515 BUILD_CHARCLASS_LOOP (isupper);
3516 else if (strcmp (name, "blank") == 0)
3517 BUILD_CHARCLASS_LOOP (isblank);
3518 else if (strcmp (name, "graph") == 0)
3519 BUILD_CHARCLASS_LOOP (isgraph);
3520 else if (strcmp (name, "punct") == 0)
3521 BUILD_CHARCLASS_LOOP (ispunct);
3522 else if (strcmp (name, "xdigit") == 0)
3523 BUILD_CHARCLASS_LOOP (isxdigit);
3524 else
3525 return REG_ECTYPE;
3526
3527 return REG_NOERROR;
3528}
3529
3530static bin_tree_t *
3531build_charclass_op (re_dfa_t *dfa, RE_TRANSLATE_TYPE trans,
3532 const unsigned char *class_name,
3533 const unsigned char *extra, int non_match,
3534 reg_errcode_t *err)
3535{
3536 re_bitset_ptr_t sbcset;
3537#ifdef RE_ENABLE_I18N
3538 re_charset_t *mbcset;
3539 int alloc = 0;
3540#endif /* not RE_ENABLE_I18N */
3541 reg_errcode_t ret;
3542 re_token_t br_token;
3543 bin_tree_t *tree;
3544
3545 sbcset = (re_bitset_ptr_t) calloc (sizeof (bitset_t), 1);
3546#ifdef RE_ENABLE_I18N
3547 mbcset = (re_charset_t *) calloc (sizeof (re_charset_t), 1);
3548#endif /* RE_ENABLE_I18N */
3549
3550#ifdef RE_ENABLE_I18N
3551 if (BE (sbcset == NULL || mbcset == NULL, 0))
3552#else /* not RE_ENABLE_I18N */
3553 if (BE (sbcset == NULL, 0))
3554#endif /* not RE_ENABLE_I18N */
3555 {
3556 *err = REG_ESPACE;
3557 return NULL;
3558 }
3559
3560 if (non_match)
3561 {
3562#ifdef RE_ENABLE_I18N
3563 /*
3564 if (syntax & RE_HAT_LISTS_NOT_NEWLINE)
3565 bitset_set(cset->sbcset, '\0');
3566 */
3567 mbcset->non_match = 1;
3568#endif /* not RE_ENABLE_I18N */
3569 }
3570
3571 /* We don't care the syntax in this case. */
3572 ret = build_charclass (trans, sbcset,
3573#ifdef RE_ENABLE_I18N
3574 mbcset, &alloc,
3575#endif /* RE_ENABLE_I18N */
3576 class_name, 0);
3577
3578 if (BE (ret != REG_NOERROR, 0))
3579 {
3580 re_free (sbcset);
3581#ifdef RE_ENABLE_I18N
3582 free_charset (mbcset);
3583#endif /* RE_ENABLE_I18N */
3584 *err = ret;
3585 return NULL;
3586 }
3587 /* \w match '_' also. */
3588 for (; *extra; extra++)
3589 bitset_set (sbcset, *extra);
3590
3591 /* If it is non-matching list. */
3592 if (non_match)
3593 bitset_not (sbcset);
3594
3595#ifdef RE_ENABLE_I18N
3596 /* Ensure only single byte characters are set. */
3597 if (dfa->mb_cur_max > 1)
3598 bitset_mask (sbcset, dfa->sb_char);
3599#endif
3600
3601 /* Build a tree for simple bracket. */
3602 br_token.type = SIMPLE_BRACKET;
3603 br_token.opr.sbcset = sbcset;
3604 tree = create_token_tree (dfa, NULL, NULL, &br_token);
3605 if (BE (tree == NULL, 0))
3606 goto build_word_op_espace;
3607
3608#ifdef RE_ENABLE_I18N
3609 if (dfa->mb_cur_max > 1)
3610 {
3611 bin_tree_t *mbc_tree;
3612 /* Build a tree for complex bracket. */
3613 br_token.type = COMPLEX_BRACKET;
3614 br_token.opr.mbcset = mbcset;
3615 dfa->has_mb_node = 1;
3616 mbc_tree = create_token_tree (dfa, NULL, NULL, &br_token);
3617 if (BE (mbc_tree == NULL, 0))
3618 goto build_word_op_espace;
3619 /* Then join them by ALT node. */
3620 tree = create_tree (dfa, tree, mbc_tree, OP_ALT);
3621 if (BE (mbc_tree != NULL, 1))
3622 return tree;
3623 }
3624 else
3625 {
3626 free_charset (mbcset);
3627 return tree;
3628 }
3629#else /* not RE_ENABLE_I18N */
3630 return tree;
3631#endif /* not RE_ENABLE_I18N */
3632
3633 build_word_op_espace:
3634 re_free (sbcset);
3635#ifdef RE_ENABLE_I18N
3636 free_charset (mbcset);
3637#endif /* RE_ENABLE_I18N */
3638 *err = REG_ESPACE;
3639 return NULL;
3640}
3641
3642/* This is intended for the expressions like "a{1,3}".
3643 Fetch a number from `input', and return the number.
3644 Return -1, if the number field is empty like "{,1}".
3645 Return -2, If an error is occured. */
3646
3647static int
3648fetch_number (re_string_t *input, re_token_t *token, reg_syntax_t syntax)
3649{
3650 int num = -1;
3651 unsigned char c;
3652 while (1)
3653 {
3654 fetch_token (token, input, syntax);
3655 c = token->opr.c;
3656 if (BE (token->type == END_OF_RE, 0))
3657 return -2;
3658 if (token->type == OP_CLOSE_DUP_NUM || c == ',')
3659 break;
3660 num = ((token->type != CHARACTER || c < '0' || '9' < c || num == -2)
3661 ? -2 : ((num == -1) ? c - '0' : num * 10 + c - '0'));
3662 num = (num > RE_DUP_MAX) ? -2 : num;
3663 }
3664 return num;
3665}
3666
3667
3668#ifdef RE_ENABLE_I18N
3669static void
3670free_charset (re_charset_t *cset)
3671{
3672 re_free (cset->mbchars);
3673# ifdef _LIBC
3674 re_free (cset->coll_syms);
3675 re_free (cset->equiv_classes);
3676 re_free (cset->range_starts);
3677 re_free (cset->range_ends);
3678# endif
3679 re_free (cset->char_classes);
3680 re_free (cset);
3681}
3682#endif /* RE_ENABLE_I18N */
3683
3684
3685/* Functions for binary tree operation. */
3686
3687/* Create a tree node. */
3688
3689static bin_tree_t *
3690create_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3691 re_token_type_t type)
3692{
3693 re_token_t t;
3694 t.type = type;
3695 return create_token_tree (dfa, left, right, &t);
3696}
3697
3698static bin_tree_t *
3699create_token_tree (re_dfa_t *dfa, bin_tree_t *left, bin_tree_t *right,
3700 const re_token_t *token)
3701{
3702 bin_tree_t *tree;
3703 if (BE (dfa->str_tree_storage_idx == BIN_TREE_STORAGE_SIZE, 0))
3704 {
3705 bin_tree_storage_t *storage = re_malloc (bin_tree_storage_t, 1);
3706
3707 if (storage == NULL)
3708 return NULL;
3709 storage->next = dfa->str_tree_storage;
3710 dfa->str_tree_storage = storage;
3711 dfa->str_tree_storage_idx = 0;
3712 }
3713 tree = &dfa->str_tree_storage->data[dfa->str_tree_storage_idx++];
3714
3715 tree->parent = NULL;
3716 tree->left = left;
3717 tree->right = right;
3718 tree->token = *token;
3719 tree->token.duplicated = 0;
3720 tree->token.opt_subexp = 0;
3721 tree->first = NULL;
3722 tree->next = NULL;
3723 tree->node_idx = -1;
3724
3725 if (left != NULL)
3726 left->parent = tree;
3727 if (right != NULL)
3728 right->parent = tree;
3729 return tree;
3730}
3731
3732/* Mark the tree SRC as an optional subexpression.
3733 To be called from preorder or postorder. */
3734
3735static reg_errcode_t
3736mark_opt_subexp (void *extra, bin_tree_t *node)
3737{
3738 int idx = (int) (long) extra;
3739 if (node->token.type == SUBEXP && node->token.opr.idx == idx)
3740 node->token.opt_subexp = 1;
3741
3742 return REG_NOERROR;
3743}
3744
3745/* Free the allocated memory inside NODE. */
3746
3747static void
3748free_token (re_token_t *node)
3749{
3750#ifdef RE_ENABLE_I18N
3751 if (node->type == COMPLEX_BRACKET && node->duplicated == 0)
3752 free_charset (node->opr.mbcset);
3753 else
3754#endif /* RE_ENABLE_I18N */
3755 if (node->type == SIMPLE_BRACKET && node->duplicated == 0)
3756 re_free (node->opr.sbcset);
3757}
3758
3759/* Worker function for tree walking. Free the allocated memory inside NODE
3760 and its children. */
3761
3762static reg_errcode_t
3763free_tree (void *extra, bin_tree_t *node)
3764{
3765 free_token (&node->token);
3766 return REG_NOERROR;
3767}
3768
3769
3770/* Duplicate the node SRC, and return new node. This is a preorder
3771 visit similar to the one implemented by the generic visitor, but
3772 we need more infrastructure to maintain two parallel trees --- so,
3773 it's easier to duplicate. */
3774
3775static bin_tree_t *
3776duplicate_tree (const bin_tree_t *root, re_dfa_t *dfa)
3777{
3778 const bin_tree_t *node;
3779 bin_tree_t *dup_root;
3780 bin_tree_t **p_new = &dup_root, *dup_node = root->parent;
3781
3782 for (node = root; ; )
3783 {
3784 /* Create a new tree and link it back to the current parent. */
3785 *p_new = create_token_tree (dfa, NULL, NULL, &node->token);
3786 if (*p_new == NULL)
3787 return NULL;
3788 (*p_new)->parent = dup_node;
3789 (*p_new)->token.duplicated = 1;
3790 dup_node = *p_new;
3791
3792 /* Go to the left node, or up and to the right. */
3793 if (node->left)
3794 {
3795 node = node->left;
3796 p_new = &dup_node->left;
3797 }
3798 else
3799 {
3800 const bin_tree_t *prev = NULL;
3801 while (node->right == prev || node->right == NULL)
3802 {
3803 prev = node;
3804 node = node->parent;
3805 dup_node = dup_node->parent;
3806 if (!node)
3807 return dup_root;
3808 }
3809 node = node->right;
3810 p_new = &dup_node->right;
3811 }
3812 }
3813}
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette