regex_internal.c@ 3138

Last change on this file since 3138 was 599, checked in by bird, 18 years ago
GNU sed 4.1.5.
File size: 44.0 KB

Line
1	/* Extended regular expression matching and search library.
2	Copyright (C) 2002, 2003, 2004, 2005 Free Software Foundation, Inc.
3	This file is part of the GNU C Library.
4	Contributed by Isamu Hasegawa <isamu@yamato.ibm.com>.
5
6	The GNU C Library is free software; you can redistribute it and/or
7	modify it under the terms of the GNU Lesser General Public
8	License as published by the Free Software Foundation; either
9	version 2.1 of the License, or (at your option) any later version.
10
11	The GNU C Library is distributed in the hope that it will be useful,
12	but WITHOUT ANY WARRANTY; without even the implied warranty of
13	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14	Lesser General Public License for more details.
15
16	You should have received a copy of the GNU Lesser General Public
17	License along with the GNU C Library; if not, write to the Free
18	Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19	02111-1307 USA. */
20
21	static void re_string_construct_common (const char *str, int len,
22	re_string_t *pstr,
23	RE_TRANSLATE_TYPE trans, int icase,
24	const re_dfa_t *dfa) internal_function;
25	static re_dfastate_t create_ci_newstate (const re_dfa_t dfa,
26	const re_node_set *nodes,
27	unsigned int hash) internal_function;
28	static re_dfastate_t create_cd_newstate (const re_dfa_t dfa,
29	const re_node_set *nodes,
30	unsigned int context,
31	unsigned int hash) internal_function;
32
33
34	/* Functions for string operation. */
35
36	/* This function allocate the buffers. It is necessary to call
37	re_string_reconstruct before using the object. */
38
39	static reg_errcode_t
40	internal_function
41	re_string_allocate (re_string_t pstr, const char str, int len, int init_len,
42	RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
43	{
44	reg_errcode_t ret;
45	int init_buf_len;
46
47	/* Ensure at least one character fits into the buffers. */
48	if (init_len < dfa->mb_cur_max)
49	init_len = dfa->mb_cur_max;
50	init_buf_len = (len + 1 < init_len) ? len + 1: init_len;
51	re_string_construct_common (str, len, pstr, trans, icase, dfa);
52
53	ret = re_string_realloc_buffers (pstr, init_buf_len);
54	if (BE (ret != REG_NOERROR, 0))
55	return ret;
56
57	pstr->word_char = dfa->word_char;
58	pstr->word_ops_used = dfa->word_ops_used;
59	pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
60	pstr->valid_len = (pstr->mbs_allocated \|\| dfa->mb_cur_max > 1) ? 0 : len;
61	pstr->valid_raw_len = pstr->valid_len;
62	return REG_NOERROR;
63	}
64
65	/* This function allocate the buffers, and initialize them. */
66
67	static reg_errcode_t
68	internal_function
69	re_string_construct (re_string_t pstr, const char str, int len,
70	RE_TRANSLATE_TYPE trans, int icase, const re_dfa_t *dfa)
71	{
72	reg_errcode_t ret;
73	memset (pstr, '\0', sizeof (re_string_t));
74	re_string_construct_common (str, len, pstr, trans, icase, dfa);
75
76	if (len > 0)
77	{
78	ret = re_string_realloc_buffers (pstr, len + 1);
79	if (BE (ret != REG_NOERROR, 0))
80	return ret;
81	}
82	pstr->mbs = pstr->mbs_allocated ? pstr->mbs : (unsigned char *) str;
83
84	if (icase)
85	{
86	#ifdef RE_ENABLE_I18N
87	if (dfa->mb_cur_max > 1)
88	{
89	while (1)
90	{
91	ret = build_wcs_upper_buffer (pstr);
92	if (BE (ret != REG_NOERROR, 0))
93	return ret;
94	if (pstr->valid_raw_len >= len)
95	break;
96	if (pstr->bufs_len > pstr->valid_len + dfa->mb_cur_max)
97	break;
98	ret = re_string_realloc_buffers (pstr, pstr->bufs_len * 2);
99	if (BE (ret != REG_NOERROR, 0))
100	return ret;
101	}
102	}
103	else
104	#endif /* RE_ENABLE_I18N */
105	build_upper_buffer (pstr);
106	}
107	else
108	{
109	#ifdef RE_ENABLE_I18N
110	if (dfa->mb_cur_max > 1)
111	build_wcs_buffer (pstr);
112	else
113	#endif /* RE_ENABLE_I18N */
114	{
115	if (trans != NULL)
116	re_string_translate_buffer (pstr);
117	else
118	{
119	pstr->valid_len = pstr->bufs_len;
120	pstr->valid_raw_len = pstr->bufs_len;
121	}
122	}
123	}
124
125	return REG_NOERROR;
126	}
127
128	/* Helper functions for re_string_allocate, and re_string_construct. */
129
130	static reg_errcode_t
131	internal_function
132	re_string_realloc_buffers (re_string_t *pstr, int new_buf_len)
133	{
134	#ifdef RE_ENABLE_I18N
135	if (pstr->mb_cur_max > 1)
136	{
137	wint_t *new_wcs = re_realloc (pstr->wcs, wint_t, new_buf_len);
138	if (BE (new_wcs == NULL, 0))
139	return REG_ESPACE;
140	pstr->wcs = new_wcs;
141	if (pstr->offsets != NULL)
142	{
143	int *new_offsets = re_realloc (pstr->offsets, int, new_buf_len);
144	if (BE (new_offsets == NULL, 0))
145	return REG_ESPACE;
146	pstr->offsets = new_offsets;
147	}
148	}
149	#endif /* RE_ENABLE_I18N */
150	if (pstr->mbs_allocated)
151	{
152	unsigned char *new_mbs = re_realloc (pstr->mbs, unsigned char,
153	new_buf_len);
154	if (BE (new_mbs == NULL, 0))
155	return REG_ESPACE;
156	pstr->mbs = new_mbs;
157	}
158	pstr->bufs_len = new_buf_len;
159	return REG_NOERROR;
160	}
161
162
163	static void
164	internal_function
165	re_string_construct_common (const char str, int len, re_string_t pstr,
166	RE_TRANSLATE_TYPE trans, int icase,
167	const re_dfa_t *dfa)
168	{
169	pstr->raw_mbs = (const unsigned char *) str;
170	pstr->len = len;
171	pstr->raw_len = len;
172	pstr->trans = trans;
173	pstr->icase = icase ? 1 : 0;
174	pstr->mbs_allocated = (trans != NULL \|\| icase);
175	pstr->mb_cur_max = dfa->mb_cur_max;
176	pstr->is_utf8 = dfa->is_utf8;
177	pstr->map_notascii = dfa->map_notascii;
178	pstr->stop = pstr->len;
179	pstr->raw_stop = pstr->stop;
180	}
181
182	#ifdef RE_ENABLE_I18N
183
184	/* Build wide character buffer PSTR->WCS.
185	If the byte sequence of the string are:
186	<mb1>(0), <mb1>(1), <mb2>(0), <mb2>(1), <sb3>
187	Then wide character buffer will be:
188	<wc1> , WEOF , <wc2> , WEOF , <wc3>
189	We use WEOF for padding, they indicate that the position isn't
190	a first byte of a multibyte character.
191
192	Note that this function assumes PSTR->VALID_LEN elements are already
193	built and starts from PSTR->VALID_LEN. */
194
195	static void
196	internal_function
197	build_wcs_buffer (re_string_t *pstr)
198	{
199	#ifdef _LIBC
200	unsigned char buf[MB_LEN_MAX];
201	assert (MB_LEN_MAX >= pstr->mb_cur_max);
202	#else
203	unsigned char buf[64];
204	#endif
205	mbstate_t prev_st;
206	int byte_idx, end_idx, remain_len;
207	size_t mbclen;
208
209	/* Build the buffers from pstr->valid_len to either pstr->len or
210	pstr->bufs_len. */
211	end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
212	for (byte_idx = pstr->valid_len; byte_idx < end_idx;)
213	{
214	wchar_t wc;
215	const char *p;
216
217	remain_len = end_idx - byte_idx;
218	prev_st = pstr->cur_state;
219	/* Apply the translation if we need. */
220	if (BE (pstr->trans != NULL, 0))
221	{
222	int i, ch;
223
224	for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
225	{
226	ch = pstr->raw_mbs [pstr->raw_mbs_idx + byte_idx + i];
227	buf[i] = pstr->mbs[byte_idx + i] = pstr->trans[ch];
228	}
229	p = (const char *) buf;
230	}
231	else
232	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx;
233	mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
234	if (BE (mbclen == (size_t) -2, 0))
235	{
236	/* The buffer doesn't have enough space, finish to build. */
237	pstr->cur_state = prev_st;
238	break;
239	}
240	else if (BE (mbclen == (size_t) -1 \|\| mbclen == 0, 0))
241	{
242	/* We treat these cases as a singlebyte character. */
243	mbclen = 1;
244	wc = (wchar_t) pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
245	if (BE (pstr->trans != NULL, 0))
246	wc = pstr->trans[wc];
247	pstr->cur_state = prev_st;
248	}
249
250	/* Write wide character and padding. */
251	pstr->wcs[byte_idx++] = wc;
252	/* Write paddings. */
253	for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
254	pstr->wcs[byte_idx++] = WEOF;
255	}
256	pstr->valid_len = byte_idx;
257	pstr->valid_raw_len = byte_idx;
258	}
259
260	/* Build wide character buffer PSTR->WCS like build_wcs_buffer,
261	but for REG_ICASE. */
262
263	static int
264	internal_function
265	build_wcs_upper_buffer (re_string_t *pstr)
266	{
267	mbstate_t prev_st;
268	int src_idx, byte_idx, end_idx, remain_len;
269	size_t mbclen;
270	#ifdef _LIBC
271	char buf[MB_LEN_MAX];
272	assert (MB_LEN_MAX >= pstr->mb_cur_max);
273	#else
274	char buf[64];
275	#endif
276
277	byte_idx = pstr->valid_len;
278	end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
279
280	/* The following optimization assumes that ASCII characters can be
281	mapped to wide characters with a simple cast. */
282	if (! pstr->map_notascii && pstr->trans == NULL && !pstr->offsets_needed)
283	{
284	while (byte_idx < end_idx)
285	{
286	wchar_t wc;
287
288	if (isascii (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx])
289	&& mbsinit (&pstr->cur_state))
290	{
291	/* In case of a singlebyte character. */
292	pstr->mbs[byte_idx]
293	= toupper (pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx]);
294	/* The next step uses the assumption that wchar_t is encoded
295	ASCII-safe: all ASCII values can be converted like this. */
296	pstr->wcs[byte_idx] = (wchar_t) pstr->mbs[byte_idx];
297	++byte_idx;
298	continue;
299	}
300
301	remain_len = end_idx - byte_idx;
302	prev_st = pstr->cur_state;
303	mbclen = mbrtowc (&wc,
304	((const char *) pstr->raw_mbs + pstr->raw_mbs_idx
305	+ byte_idx), remain_len, &pstr->cur_state);
306	if (BE (mbclen + 2 > 2, 1))
307	{
308	wchar_t wcu = wc;
309	if (iswlower (wc))
310	{
311	size_t mbcdlen;
312
313	wcu = towupper (wc);
314	mbcdlen = wcrtomb (buf, wcu, &prev_st);
315	if (BE (mbclen == mbcdlen, 1))
316	memcpy (pstr->mbs + byte_idx, buf, mbclen);
317	else
318	{
319	src_idx = byte_idx;
320	goto offsets_needed;
321	}
322	}
323	else
324	memcpy (pstr->mbs + byte_idx,
325	pstr->raw_mbs + pstr->raw_mbs_idx + byte_idx, mbclen);
326	pstr->wcs[byte_idx++] = wcu;
327	/* Write paddings. */
328	for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
329	pstr->wcs[byte_idx++] = WEOF;
330	}
331	else if (mbclen == (size_t) -1 \|\| mbclen == 0)
332	{
333	/* It is an invalid character or '\0'. Just use the byte. */
334	int ch = pstr->raw_mbs[pstr->raw_mbs_idx + byte_idx];
335	pstr->mbs[byte_idx] = ch;
336	/* And also cast it to wide char. */
337	pstr->wcs[byte_idx++] = (wchar_t) ch;
338	if (BE (mbclen == (size_t) -1, 0))
339	pstr->cur_state = prev_st;
340	}
341	else
342	{
343	/* The buffer doesn't have enough space, finish to build. */
344	pstr->cur_state = prev_st;
345	break;
346	}
347	}
348	pstr->valid_len = byte_idx;
349	pstr->valid_raw_len = byte_idx;
350	return REG_NOERROR;
351	}
352	else
353	for (src_idx = pstr->valid_raw_len; byte_idx < end_idx;)
354	{
355	wchar_t wc;
356	const char *p;
357	offsets_needed:
358	remain_len = end_idx - byte_idx;
359	prev_st = pstr->cur_state;
360	if (BE (pstr->trans != NULL, 0))
361	{
362	int i, ch;
363
364	for (i = 0; i < pstr->mb_cur_max && i < remain_len; ++i)
365	{
366	ch = pstr->raw_mbs [pstr->raw_mbs_idx + src_idx + i];
367	buf[i] = pstr->trans[ch];
368	}
369	p = (const char *) buf;
370	}
371	else
372	p = (const char *) pstr->raw_mbs + pstr->raw_mbs_idx + src_idx;
373	mbclen = mbrtowc (&wc, p, remain_len, &pstr->cur_state);
374	if (BE (mbclen + 2 > 2, 1))
375	{
376	wchar_t wcu = wc;
377	if (iswlower (wc))
378	{
379	size_t mbcdlen;
380
381	wcu = towupper (wc);
382	mbcdlen = wcrtomb ((char *) buf, wcu, &prev_st);
383	if (BE (mbclen == mbcdlen, 1))
384	memcpy (pstr->mbs + byte_idx, buf, mbclen);
385	else if (mbcdlen != (size_t) -1)
386	{
387	size_t i;
388
389	if (byte_idx + mbcdlen > pstr->bufs_len)
390	{
391	pstr->cur_state = prev_st;
392	break;
393	}
394
395	if (pstr->offsets == NULL)
396	{
397	pstr->offsets = re_malloc (int, pstr->bufs_len);
398
399	if (pstr->offsets == NULL)
400	return REG_ESPACE;
401	}
402	if (!pstr->offsets_needed)
403	{
404	for (i = 0; i < (size_t) byte_idx; ++i)
405	pstr->offsets[i] = i;
406	pstr->offsets_needed = 1;
407	}
408
409	memcpy (pstr->mbs + byte_idx, buf, mbcdlen);
410	pstr->wcs[byte_idx] = wcu;
411	pstr->offsets[byte_idx] = src_idx;
412	for (i = 1; i < mbcdlen; ++i)
413	{
414	pstr->offsets[byte_idx + i]
415	= src_idx + (i < mbclen ? i : mbclen - 1);
416	pstr->wcs[byte_idx + i] = WEOF;
417	}
418	pstr->len += mbcdlen - mbclen;
419	if (pstr->raw_stop > src_idx)
420	pstr->stop += mbcdlen - mbclen;
421	end_idx = (pstr->bufs_len > pstr->len)
422	? pstr->len : pstr->bufs_len;
423	byte_idx += mbcdlen;
424	src_idx += mbclen;
425	continue;
426	}
427	else
428	memcpy (pstr->mbs + byte_idx, p, mbclen);
429	}
430	else
431	memcpy (pstr->mbs + byte_idx, p, mbclen);
432
433	if (BE (pstr->offsets_needed != 0, 0))
434	{
435	size_t i;
436	for (i = 0; i < mbclen; ++i)
437	pstr->offsets[byte_idx + i] = src_idx + i;
438	}
439	src_idx += mbclen;
440
441	pstr->wcs[byte_idx++] = wcu;
442	/* Write paddings. */
443	for (remain_len = byte_idx + mbclen - 1; byte_idx < remain_len ;)
444	pstr->wcs[byte_idx++] = WEOF;
445	}
446	else if (mbclen == (size_t) -1 \|\| mbclen == 0)
447	{
448	/* It is an invalid character or '\0'. Just use the byte. */
449	int ch = pstr->raw_mbs[pstr->raw_mbs_idx + src_idx];
450
451	if (BE (pstr->trans != NULL, 0))
452	ch = pstr->trans [ch];
453	pstr->mbs[byte_idx] = ch;
454
455	if (BE (pstr->offsets_needed != 0, 0))
456	pstr->offsets[byte_idx] = src_idx;
457	++src_idx;
458
459	/* And also cast it to wide char. */
460	pstr->wcs[byte_idx++] = (wchar_t) ch;
461	if (BE (mbclen == (size_t) -1, 0))
462	pstr->cur_state = prev_st;
463	}
464	else
465	{
466	/* The buffer doesn't have enough space, finish to build. */
467	pstr->cur_state = prev_st;
468	break;
469	}
470	}
471	pstr->valid_len = byte_idx;
472	pstr->valid_raw_len = src_idx;
473	return REG_NOERROR;
474	}
475
476	/* Skip characters until the index becomes greater than NEW_RAW_IDX.
477	Return the index. */
478
479	static int
480	internal_function
481	re_string_skip_chars (re_string_t pstr, int new_raw_idx, wint_t last_wc)
482	{
483	mbstate_t prev_st;
484	int rawbuf_idx;
485	size_t mbclen;
486	wchar_t wc = 0;
487
488	/* Skip the characters which are not necessary to check. */
489	for (rawbuf_idx = pstr->raw_mbs_idx + pstr->valid_raw_len;
490	rawbuf_idx < new_raw_idx;)
491	{
492	int remain_len;
493	remain_len = pstr->len - rawbuf_idx;
494	prev_st = pstr->cur_state;
495	mbclen = mbrtowc (&wc, (const char *) pstr->raw_mbs + rawbuf_idx,
496	remain_len, &pstr->cur_state);
497	if (BE (mbclen == (size_t) -2 \|\| mbclen == (size_t) -1 \|\| mbclen == 0, 0))
498	{
499	/* We treat these cases as a singlebyte character. */
500	mbclen = 1;
501	pstr->cur_state = prev_st;
502	}
503	/* Then proceed the next character. */
504	rawbuf_idx += mbclen;
505	}
506	*last_wc = (wint_t) wc;
507	return rawbuf_idx;
508	}
509	#endif /* RE_ENABLE_I18N */
510
511	/* Build the buffer PSTR->MBS, and apply the translation if we need.
512	This function is used in case of REG_ICASE. */
513
514	static void
515	internal_function
516	build_upper_buffer (re_string_t *pstr)
517	{
518	int char_idx, end_idx;
519	end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
520
521	for (char_idx = pstr->valid_len; char_idx < end_idx; ++char_idx)
522	{
523	int ch = pstr->raw_mbs[pstr->raw_mbs_idx + char_idx];
524	if (BE (pstr->trans != NULL, 0))
525	ch = pstr->trans[ch];
526	if (islower (ch))
527	pstr->mbs[char_idx] = toupper (ch);
528	else
529	pstr->mbs[char_idx] = ch;
530	}
531	pstr->valid_len = char_idx;
532	pstr->valid_raw_len = char_idx;
533	}
534
535	/* Apply TRANS to the buffer in PSTR. */
536
537	static void
538	internal_function
539	re_string_translate_buffer (re_string_t *pstr)
540	{
541	int buf_idx, end_idx;
542	end_idx = (pstr->bufs_len > pstr->len) ? pstr->len : pstr->bufs_len;
543
544	for (buf_idx = pstr->valid_len; buf_idx < end_idx; ++buf_idx)
545	{
546	int ch = pstr->raw_mbs[pstr->raw_mbs_idx + buf_idx];
547	pstr->mbs[buf_idx] = pstr->trans[ch];
548	}
549
550	pstr->valid_len = buf_idx;
551	pstr->valid_raw_len = buf_idx;
552	}
553
554	/* This function re-construct the buffers.
555	Concretely, convert to wide character in case of pstr->mb_cur_max > 1,
556	convert to upper case in case of REG_ICASE, apply translation. */
557
558	static reg_errcode_t
559	internal_function
560	re_string_reconstruct (re_string_t *pstr, int idx, int eflags)
561	{
562	int offset = idx - pstr->raw_mbs_idx;
563	if (BE (offset < 0, 0))
564	{
565	/* Reset buffer. */
566	#ifdef RE_ENABLE_I18N
567	if (pstr->mb_cur_max > 1)
568	memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
569	#endif /* RE_ENABLE_I18N */
570	pstr->len = pstr->raw_len;
571	pstr->stop = pstr->raw_stop;
572	pstr->valid_len = 0;
573	pstr->raw_mbs_idx = 0;
574	pstr->valid_raw_len = 0;
575	pstr->offsets_needed = 0;
576	pstr->tip_context = ((eflags & REG_NOTBOL) ? CONTEXT_BEGBUF
577	: CONTEXT_NEWLINE \| CONTEXT_BEGBUF);
578	if (!pstr->mbs_allocated)
579	pstr->mbs = (unsigned char *) pstr->raw_mbs;
580	offset = idx;
581	}
582
583	if (BE (offset != 0, 1))
584	{
585	/* Are the characters which are already checked remain? */
586	if (BE (offset < pstr->valid_raw_len, 1)
587	#ifdef RE_ENABLE_I18N
588	/* Handling this would enlarge the code too much.
589	Accept a slowdown in that case. */
590	&& pstr->offsets_needed == 0
591	#endif
592	)
593	{
594	/* Yes, move them to the front of the buffer. */
595	pstr->tip_context = re_string_context_at (pstr, offset - 1, eflags);
596	#ifdef RE_ENABLE_I18N
597	if (pstr->mb_cur_max > 1)
598	memmove (pstr->wcs, pstr->wcs + offset,
599	(pstr->valid_len - offset) * sizeof (wint_t));
600	#endif /* RE_ENABLE_I18N */
601	if (BE (pstr->mbs_allocated, 0))
602	memmove (pstr->mbs, pstr->mbs + offset,
603	pstr->valid_len - offset);
604	pstr->valid_len -= offset;
605	pstr->valid_raw_len -= offset;
606	#if DEBUG
607	assert (pstr->valid_len > 0);
608	#endif
609	}
610	else
611	{
612	/* No, skip all characters until IDX. */
613	#ifdef RE_ENABLE_I18N
614	if (BE (pstr->offsets_needed, 0))
615	{
616	pstr->len = pstr->raw_len - idx + offset;
617	pstr->stop = pstr->raw_stop - idx + offset;
618	pstr->offsets_needed = 0;
619	}
620	#endif
621	pstr->valid_len = 0;
622	pstr->valid_raw_len = 0;
623	#ifdef RE_ENABLE_I18N
624	if (pstr->mb_cur_max > 1)
625	{
626	int wcs_idx;
627	wint_t wc = WEOF;
628
629	if (pstr->is_utf8)
630	{
631	const unsigned char raw, p, q, end;
632
633	/* Special case UTF-8. Multi-byte chars start with any
634	byte other than 0x80 - 0xbf. */
635	raw = pstr->raw_mbs + pstr->raw_mbs_idx;
636	end = raw + (offset - pstr->mb_cur_max);
637	p = raw + offset - 1;
638	#ifdef _LIBC
639	/* We know the wchar_t encoding is UCS4, so for the simple
640	case, ASCII characters, skip the conversion step. */
641	if (isascii (*p) && BE (pstr->trans == NULL, 1))
642	{
643	memset (&pstr->cur_state, '\0', sizeof (mbstate_t));
644	pstr->valid_len = 0;
645	wc = (wchar_t) *p;
646	}
647	else
648	#endif
649	for (; p >= end; --p)
650	if ((*p & 0xc0) != 0x80)
651	{
652	mbstate_t cur_state;
653	wchar_t wc2;
654	int mlen = raw + pstr->len - p;
655	unsigned char buf[6];
656	size_t mbclen;
657
658	q = p;
659	if (BE (pstr->trans != NULL, 0))
660	{
661	int i = mlen < 6 ? mlen : 6;
662	while (--i >= 0)
663	buf[i] = pstr->trans[p[i]];
664	q = buf;
665	}
666	/* XXX Don't use mbrtowc, we know which conversion
667	to use (UTF-8 -> UCS4). */
668	memset (&cur_state, 0, sizeof (cur_state));
669	mbclen = mbrtowc (&wc2, (const char *) p, mlen,
670	&cur_state);
671	if (raw + offset - p <= mbclen
672	&& mbclen < (size_t) -2)
673	{
674	memset (&pstr->cur_state, '\0',
675	sizeof (mbstate_t));
676	pstr->valid_len = mbclen - (raw + offset - p);
677	wc = wc2;
678	}
679	break;
680	}
681	}
682
683	if (wc == WEOF)
684	pstr->valid_len = re_string_skip_chars (pstr, idx, &wc) - idx;
685	if (BE (pstr->valid_len, 0))
686	{
687	for (wcs_idx = 0; wcs_idx < pstr->valid_len; ++wcs_idx)
688	pstr->wcs[wcs_idx] = WEOF;
689	if (pstr->mbs_allocated)
690	memset (pstr->mbs, 255, pstr->valid_len);
691	}
692	pstr->valid_raw_len = pstr->valid_len;
693	pstr->tip_context = ((BE (pstr->word_ops_used != 0, 0)
694	&& IS_WIDE_WORD_CHAR (wc))
695	? CONTEXT_WORD
696	: ((IS_WIDE_NEWLINE (wc)
697	&& pstr->newline_anchor)
698	? CONTEXT_NEWLINE : 0));
699	}
700	else
701	#endif /* RE_ENABLE_I18N */
702	{
703	int c = pstr->raw_mbs[pstr->raw_mbs_idx + offset - 1];
704	if (pstr->trans)
705	c = pstr->trans[c];
706	pstr->tip_context = (bitset_contain (pstr->word_char, c)
707	? CONTEXT_WORD
708	: ((IS_NEWLINE (c) && pstr->newline_anchor)
709	? CONTEXT_NEWLINE : 0));
710	}
711	}
712	if (!BE (pstr->mbs_allocated, 0))
713	pstr->mbs += offset;
714	}
715	pstr->raw_mbs_idx = idx;
716	pstr->len -= offset;
717	pstr->stop -= offset;
718
719	/* Then build the buffers. */
720	#ifdef RE_ENABLE_I18N
721	if (pstr->mb_cur_max > 1)
722	{
723	if (pstr->icase)
724	{
725	int ret = build_wcs_upper_buffer (pstr);
726	if (BE (ret != REG_NOERROR, 0))
727	return ret;
728	}
729	else
730	build_wcs_buffer (pstr);
731	}
732	else
733	#endif /* RE_ENABLE_I18N */
734	if (BE (pstr->mbs_allocated, 0))
735	{
736	if (pstr->icase)
737	build_upper_buffer (pstr);
738	else if (pstr->trans != NULL)
739	re_string_translate_buffer (pstr);
740	}
741	else
742	pstr->valid_len = pstr->len;
743
744	pstr->cur_idx = 0;
745	return REG_NOERROR;
746	}
747
748	static unsigned char
749	internal_function __attribute ((pure))
750	re_string_peek_byte_case (const re_string_t *pstr, int idx)
751	{
752	int ch, off;
753
754	/* Handle the common (easiest) cases first. */
755	if (BE (!pstr->mbs_allocated, 1))
756	return re_string_peek_byte (pstr, idx);
757
758	#ifdef RE_ENABLE_I18N
759	if (pstr->mb_cur_max > 1
760	&& ! re_string_is_single_byte_char (pstr, pstr->cur_idx + idx))
761	return re_string_peek_byte (pstr, idx);
762	#endif
763
764	off = pstr->cur_idx + idx;
765	#ifdef RE_ENABLE_I18N
766	if (pstr->offsets_needed)
767	off = pstr->offsets[off];
768	#endif
769
770	ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
771
772	#ifdef RE_ENABLE_I18N
773	/* Ensure that e.g. for tr_TR.UTF-8 BACKSLASH DOTLESS SMALL LETTER I
774	this function returns CAPITAL LETTER I instead of first byte of
775	DOTLESS SMALL LETTER I. The latter would confuse the parser,
776	since peek_byte_case doesn't advance cur_idx in any way. */
777	if (pstr->offsets_needed && !isascii (ch))
778	return re_string_peek_byte (pstr, idx);
779	#endif
780
781	return ch;
782	}
783
784	static unsigned char
785	internal_function __attribute ((pure))
786	re_string_fetch_byte_case (re_string_t *pstr)
787	{
788	if (BE (!pstr->mbs_allocated, 1))
789	return re_string_fetch_byte (pstr);
790
791	#ifdef RE_ENABLE_I18N
792	if (pstr->offsets_needed)
793	{
794	int off, ch;
795
796	/* For tr_TR.UTF-8 [[:islower:]] there is
797	[[: CAPITAL LETTER I WITH DOT lower:]] in mbs. Skip
798	in that case the whole multi-byte character and return
799	the original letter. On the other side, with
800	[[: DOTLESS SMALL LETTER I return [[:I, as doing
801	anything else would complicate things too much. */
802
803	if (!re_string_first_byte (pstr, pstr->cur_idx))
804	return re_string_fetch_byte (pstr);
805
806	off = pstr->offsets[pstr->cur_idx];
807	ch = pstr->raw_mbs[pstr->raw_mbs_idx + off];
808
809	if (! isascii (ch))
810	return re_string_fetch_byte (pstr);
811
812	re_string_skip_bytes (pstr,
813	re_string_char_size_at (pstr, pstr->cur_idx));
814	return ch;
815	}
816	#endif
817
818	return pstr->raw_mbs[pstr->raw_mbs_idx + pstr->cur_idx++];
819	}
820
821	static void
822	internal_function
823	re_string_destruct (re_string_t *pstr)
824	{
825	#ifdef RE_ENABLE_I18N
826	re_free (pstr->wcs);
827	re_free (pstr->offsets);
828	#endif /* RE_ENABLE_I18N */
829	if (pstr->mbs_allocated)
830	re_free (pstr->mbs);
831	}
832
833	/* Return the context at IDX in INPUT. */
834
835	static unsigned int
836	internal_function
837	re_string_context_at (const re_string_t *input, int idx, int eflags)
838	{
839	int c;
840	if (BE (idx < 0, 0))
841	/* In this case, we use the value stored in input->tip_context,
842	since we can't know the character in input->mbs[-1] here. */
843	return input->tip_context;
844	if (BE (idx == input->len, 0))
845	return ((eflags & REG_NOTEOL) ? CONTEXT_ENDBUF
846	: CONTEXT_NEWLINE \| CONTEXT_ENDBUF);
847	#ifdef RE_ENABLE_I18N
848	if (input->mb_cur_max > 1)
849	{
850	wint_t wc;
851	int wc_idx = idx;
852	while(input->wcs[wc_idx] == WEOF)
853	{
854	#ifdef DEBUG
855	/* It must not happen. */
856	assert (wc_idx >= 0);
857	#endif
858	--wc_idx;
859	if (wc_idx < 0)
860	return input->tip_context;
861	}
862	wc = input->wcs[wc_idx];
863	if (BE (input->word_ops_used != 0, 0) && IS_WIDE_WORD_CHAR (wc))
864	return CONTEXT_WORD;
865	return (IS_WIDE_NEWLINE (wc) && input->newline_anchor
866	? CONTEXT_NEWLINE : 0);
867	}
868	else
869	#endif
870	{
871	c = re_string_byte_at (input, idx);
872	if (bitset_contain (input->word_char, c))
873	return CONTEXT_WORD;
874	return IS_NEWLINE (c) && input->newline_anchor ? CONTEXT_NEWLINE : 0;
875	}
876	}
877
878
879	/* Functions for set operation. */
880
881	static reg_errcode_t
882	internal_function
883	re_node_set_alloc (re_node_set *set, int size)
884	{
885	set->alloc = size;
886	set->nelem = 0;
887	set->elems = re_malloc (int, size);
888	if (BE (set->elems == NULL, 0))
889	return REG_ESPACE;
890	return REG_NOERROR;
891	}
892
893	static reg_errcode_t
894	internal_function
895	re_node_set_init_1 (re_node_set *set, int elem)
896	{
897	set->alloc = 1;
898	set->nelem = 1;
899	set->elems = re_malloc (int, 1);
900	if (BE (set->elems == NULL, 0))
901	{
902	set->alloc = set->nelem = 0;
903	return REG_ESPACE;
904	}
905	set->elems[0] = elem;
906	return REG_NOERROR;
907	}
908
909	static reg_errcode_t
910	internal_function
911	re_node_set_init_2 (re_node_set *set, int elem1, int elem2)
912	{
913	set->alloc = 2;
914	set->elems = re_malloc (int, 2);
915	if (BE (set->elems == NULL, 0))
916	return REG_ESPACE;
917	if (elem1 == elem2)
918	{
919	set->nelem = 1;
920	set->elems[0] = elem1;
921	}
922	else
923	{
924	set->nelem = 2;
925	if (elem1 < elem2)
926	{
927	set->elems[0] = elem1;
928	set->elems[1] = elem2;
929	}
930	else
931	{
932	set->elems[0] = elem2;
933	set->elems[1] = elem1;
934	}
935	}
936	return REG_NOERROR;
937	}
938
939	static reg_errcode_t
940	internal_function
941	re_node_set_init_copy (re_node_set dest, const re_node_set src)
942	{
943	dest->nelem = src->nelem;
944	if (src->nelem > 0)
945	{
946	dest->alloc = dest->nelem;
947	dest->elems = re_malloc (int, dest->alloc);
948	if (BE (dest->elems == NULL, 0))
949	{
950	dest->alloc = dest->nelem = 0;
951	return REG_ESPACE;
952	}
953	memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
954	}
955	else
956	re_node_set_init_empty (dest);
957	return REG_NOERROR;
958	}
959
960	/* Calculate the intersection of the sets SRC1 and SRC2. And merge it to
961	DEST. Return value indicate the error code or REG_NOERROR if succeeded.
962	Note: We assume dest->elems is NULL, when dest->alloc is 0. */
963
964	static reg_errcode_t
965	internal_function
966	re_node_set_add_intersect (re_node_set dest, const re_node_set src1,
967	const re_node_set *src2)
968	{
969	int i1, i2, is, id, delta, sbase;
970	if (src1->nelem == 0 \|\| src2->nelem == 0)
971	return REG_NOERROR;
972
973	/* We need dest->nelem + 2 * elems_in_intersection; this is a
974	conservative estimate. */
975	if (src1->nelem + src2->nelem + dest->nelem > dest->alloc)
976	{
977	int new_alloc = src1->nelem + src2->nelem + dest->alloc;
978	int *new_elems = re_realloc (dest->elems, int, new_alloc);
979	if (BE (new_elems == NULL, 0))
980	return REG_ESPACE;
981	dest->elems = new_elems;
982	dest->alloc = new_alloc;
983	}
984
985	/* Find the items in the intersection of SRC1 and SRC2, and copy
986	into the top of DEST those that are not already in DEST itself. */
987	sbase = dest->nelem + src1->nelem + src2->nelem;
988	i1 = src1->nelem - 1;
989	i2 = src2->nelem - 1;
990	id = dest->nelem - 1;
991	for (;;)
992	{
993	if (src1->elems[i1] == src2->elems[i2])
994	{
995	/* Try to find the item in DEST. Maybe we could binary search? */
996	while (id >= 0 && dest->elems[id] > src1->elems[i1])
997	--id;
998
999	if (id < 0 \|\| dest->elems[id] != src1->elems[i1])
1000	dest->elems[--sbase] = src1->elems[i1];
1001
1002	if (--i1 < 0 \|\| --i2 < 0)
1003	break;
1004	}
1005
1006	/* Lower the highest of the two items. */
1007	else if (src1->elems[i1] < src2->elems[i2])
1008	{
1009	if (--i2 < 0)
1010	break;
1011	}
1012	else
1013	{
1014	if (--i1 < 0)
1015	break;
1016	}
1017	}
1018
1019	id = dest->nelem - 1;
1020	is = dest->nelem + src1->nelem + src2->nelem - 1;
1021	delta = is - sbase + 1;
1022
1023	/* Now copy. When DELTA becomes zero, the remaining
1024	DEST elements are already in place; this is more or
1025	less the same loop that is in re_node_set_merge. */
1026	dest->nelem += delta;
1027	if (delta > 0 && id >= 0)
1028	for (;;)
1029	{
1030	if (dest->elems[is] > dest->elems[id])
1031	{
1032	/* Copy from the top. */
1033	dest->elems[id + delta--] = dest->elems[is--];
1034	if (delta == 0)
1035	break;
1036	}
1037	else
1038	{
1039	/* Slide from the bottom. */
1040	dest->elems[id + delta] = dest->elems[id];
1041	if (--id < 0)
1042	break;
1043	}
1044	}
1045
1046	/* Copy remaining SRC elements. */
1047	memcpy (dest->elems, dest->elems + sbase, delta * sizeof (int));
1048
1049	return REG_NOERROR;
1050	}
1051
1052	/* Calculate the union set of the sets SRC1 and SRC2. And store it to
1053	DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1054
1055	static reg_errcode_t
1056	internal_function
1057	re_node_set_init_union (re_node_set dest, const re_node_set src1,
1058	const re_node_set *src2)
1059	{
1060	int i1, i2, id;
1061	if (src1 != NULL && src1->nelem > 0 && src2 != NULL && src2->nelem > 0)
1062	{
1063	dest->alloc = src1->nelem + src2->nelem;
1064	dest->elems = re_malloc (int, dest->alloc);
1065	if (BE (dest->elems == NULL, 0))
1066	return REG_ESPACE;
1067	}
1068	else
1069	{
1070	if (src1 != NULL && src1->nelem > 0)
1071	return re_node_set_init_copy (dest, src1);
1072	else if (src2 != NULL && src2->nelem > 0)
1073	return re_node_set_init_copy (dest, src2);
1074	else
1075	re_node_set_init_empty (dest);
1076	return REG_NOERROR;
1077	}
1078	for (i1 = i2 = id = 0 ; i1 < src1->nelem && i2 < src2->nelem ;)
1079	{
1080	if (src1->elems[i1] > src2->elems[i2])
1081	{
1082	dest->elems[id++] = src2->elems[i2++];
1083	continue;
1084	}
1085	if (src1->elems[i1] == src2->elems[i2])
1086	++i2;
1087	dest->elems[id++] = src1->elems[i1++];
1088	}
1089	if (i1 < src1->nelem)
1090	{
1091	memcpy (dest->elems + id, src1->elems + i1,
1092	(src1->nelem - i1) * sizeof (int));
1093	id += src1->nelem - i1;
1094	}
1095	else if (i2 < src2->nelem)
1096	{
1097	memcpy (dest->elems + id, src2->elems + i2,
1098	(src2->nelem - i2) * sizeof (int));
1099	id += src2->nelem - i2;
1100	}
1101	dest->nelem = id;
1102	return REG_NOERROR;
1103	}
1104
1105	/* Calculate the union set of the sets DEST and SRC. And store it to
1106	DEST. Return value indicate the error code or REG_NOERROR if succeeded. */
1107
1108	static reg_errcode_t
1109	internal_function
1110	re_node_set_merge (re_node_set dest, const re_node_set src)
1111	{
1112	int is, id, sbase, delta;
1113	if (src == NULL \|\| src->nelem == 0)
1114	return REG_NOERROR;
1115	if (dest->alloc < 2 * src->nelem + dest->nelem)
1116	{
1117	int new_alloc = 2 * (src->nelem + dest->alloc);
1118	int *new_buffer = re_realloc (dest->elems, int, new_alloc);
1119	if (BE (new_buffer == NULL, 0))
1120	return REG_ESPACE;
1121	dest->elems = new_buffer;
1122	dest->alloc = new_alloc;
1123	}
1124
1125	if (BE (dest->nelem == 0, 0))
1126	{
1127	dest->nelem = src->nelem;
1128	memcpy (dest->elems, src->elems, src->nelem * sizeof (int));
1129	return REG_NOERROR;
1130	}
1131
1132	/* Copy into the top of DEST the items of SRC that are not
1133	found in DEST. Maybe we could binary search in DEST? */
1134	for (sbase = dest->nelem + 2 * src->nelem,
1135	is = src->nelem - 1, id = dest->nelem - 1; is >= 0 && id >= 0; )
1136	{
1137	if (dest->elems[id] == src->elems[is])
1138	is--, id--;
1139	else if (dest->elems[id] < src->elems[is])
1140	dest->elems[--sbase] = src->elems[is--];
1141	else /* if (dest->elems[id] > src->elems[is]) */
1142	--id;
1143	}
1144
1145	if (is >= 0)
1146	{
1147	/* If DEST is exhausted, the remaining items of SRC must be unique. */
1148	sbase -= is + 1;
1149	memcpy (dest->elems + sbase, src->elems, (is + 1) * sizeof (int));
1150	}
1151
1152	id = dest->nelem - 1;
1153	is = dest->nelem + 2 * src->nelem - 1;
1154	delta = is - sbase + 1;
1155	if (delta == 0)
1156	return REG_NOERROR;
1157
1158	/* Now copy. When DELTA becomes zero, the remaining
1159	DEST elements are already in place. */
1160	dest->nelem += delta;
1161	for (;;)
1162	{
1163	if (dest->elems[is] > dest->elems[id])
1164	{
1165	/* Copy from the top. */
1166	dest->elems[id + delta--] = dest->elems[is--];
1167	if (delta == 0)
1168	break;
1169	}
1170	else
1171	{
1172	/* Slide from the bottom. */
1173	dest->elems[id + delta] = dest->elems[id];
1174	if (--id < 0)
1175	{
1176	/* Copy remaining SRC elements. */
1177	memcpy (dest->elems, dest->elems + sbase,
1178	delta * sizeof (int));
1179	break;
1180	}
1181	}
1182	}
1183
1184	return REG_NOERROR;
1185	}
1186
1187	/* Insert the new element ELEM to the re_node_set* SET.
1188	SET should not already have ELEM.
1189	return -1 if an error is occured, return 1 otherwise. */
1190
1191	static int
1192	internal_function
1193	re_node_set_insert (re_node_set *set, int elem)
1194	{
1195	int idx;
1196	/* In case the set is empty. */
1197	if (set->alloc == 0)
1198	{
1199	if (BE (re_node_set_init_1 (set, elem) == REG_NOERROR, 1))
1200	return 1;
1201	else
1202	return -1;
1203	}
1204
1205	if (BE (set->nelem, 0) == 0)
1206	{
1207	/* We already guaranteed above that set->alloc != 0. */
1208	set->elems[0] = elem;
1209	++set->nelem;
1210	return 1;
1211	}
1212
1213	/* Realloc if we need. */
1214	if (set->alloc == set->nelem)
1215	{
1216	int *new_elems;
1217	set->alloc = set->alloc * 2;
1218	new_elems = re_realloc (set->elems, int, set->alloc);
1219	if (BE (new_elems == NULL, 0))
1220	return -1;
1221	set->elems = new_elems;
1222	}
1223
1224	/* Move the elements which follows the new element. Test the
1225	first element separately to skip a check in the inner loop. */
1226	if (elem < set->elems[0])
1227	{
1228	idx = 0;
1229	for (idx = set->nelem; idx > 0; idx--)
1230	set->elems[idx] = set->elems[idx - 1];
1231	}
1232	else
1233	{
1234	for (idx = set->nelem; set->elems[idx - 1] > elem; idx--)
1235	set->elems[idx] = set->elems[idx - 1];
1236	}
1237
1238	/* Insert the new element. */
1239	set->elems[idx] = elem;
1240	++set->nelem;
1241	return 1;
1242	}
1243
1244	/* Insert the new element ELEM to the re_node_set* SET.
1245	SET should not already have any element greater than or equal to ELEM.
1246	Return -1 if an error is occured, return 1 otherwise. */
1247
1248	static int
1249	internal_function
1250	re_node_set_insert_last (re_node_set *set, int elem)
1251	{
1252	/* Realloc if we need. */
1253	if (set->alloc == set->nelem)
1254	{
1255	int *new_elems;
1256	set->alloc = (set->alloc + 1) * 2;
1257	new_elems = re_realloc (set->elems, int, set->alloc);
1258	if (BE (new_elems == NULL, 0))
1259	return -1;
1260	set->elems = new_elems;
1261	}
1262
1263	/* Insert the new element. */
1264	set->elems[set->nelem++] = elem;
1265	return 1;
1266	}
1267
1268	/* Compare two node sets SET1 and SET2.
1269	return 1 if SET1 and SET2 are equivalent, return 0 otherwise. */
1270
1271	static int
1272	internal_function __attribute ((pure))
1273	re_node_set_compare (const re_node_set set1, const re_node_set set2)
1274	{
1275	int i;
1276	if (set1 == NULL \|\| set2 == NULL \|\| set1->nelem != set2->nelem)
1277	return 0;
1278	for (i = set1->nelem ; --i >= 0 ; )
1279	if (set1->elems[i] != set2->elems[i])
1280	return 0;
1281	return 1;
1282	}
1283
1284	/* Return (idx + 1) if SET contains the element ELEM, return 0 otherwise. */
1285
1286	static int
1287	internal_function __attribute ((pure))
1288	re_node_set_contains (const re_node_set *set, int elem)
1289	{
1290	unsigned int idx, right, mid;
1291	if (set->nelem <= 0)
1292	return 0;
1293
1294	/* Binary search the element. */
1295	idx = 0;
1296	right = set->nelem - 1;
1297	while (idx < right)
1298	{
1299	mid = (idx + right) / 2;
1300	if (set->elems[mid] < elem)
1301	idx = mid + 1;
1302	else
1303	right = mid;
1304	}
1305	return set->elems[idx] == elem ? idx + 1 : 0;
1306	}
1307
1308	static void
1309	internal_function
1310	re_node_set_remove_at (re_node_set *set, int idx)
1311	{
1312	if (idx < 0 \|\| idx >= set->nelem)
1313	return;
1314	--set->nelem;
1315	for (; idx < set->nelem; idx++)
1316	set->elems[idx] = set->elems[idx + 1];
1317	}
1318
1319
1320
1321	/* Add the token TOKEN to dfa->nodes, and return the index of the token.
1322	Or return -1, if an error will be occured. */
1323
1324	static int
1325	internal_function
1326	re_dfa_add_node (re_dfa_t *dfa, re_token_t token)
1327	{
1328	int type = token.type;
1329	if (BE (dfa->nodes_len >= dfa->nodes_alloc, 0))
1330	{
1331	size_t new_nodes_alloc = dfa->nodes_alloc * 2;
1332	int new_nexts, new_indices;
1333	re_node_set new_edests, new_eclosures;
1334	re_token_t *new_nodes;
1335
1336	/* Avoid overflows. */
1337	if (BE (new_nodes_alloc < dfa->nodes_alloc, 0))
1338	return -1;
1339
1340	new_nodes = re_realloc (dfa->nodes, re_token_t, new_nodes_alloc);
1341	if (BE (new_nodes == NULL, 0))
1342	return -1;
1343	dfa->nodes = new_nodes;
1344	new_nexts = re_realloc (dfa->nexts, int, new_nodes_alloc);
1345	new_indices = re_realloc (dfa->org_indices, int, new_nodes_alloc);
1346	new_edests = re_realloc (dfa->edests, re_node_set, new_nodes_alloc);
1347	new_eclosures = re_realloc (dfa->eclosures, re_node_set, new_nodes_alloc);
1348	if (BE (new_nexts == NULL \|\| new_indices == NULL
1349	\|\| new_edests == NULL \|\| new_eclosures == NULL, 0))
1350	return -1;
1351	dfa->nexts = new_nexts;
1352	dfa->org_indices = new_indices;
1353	dfa->edests = new_edests;
1354	dfa->eclosures = new_eclosures;
1355	dfa->nodes_alloc = new_nodes_alloc;
1356	}
1357	dfa->nodes[dfa->nodes_len] = token;
1358	dfa->nodes[dfa->nodes_len].constraint = 0;
1359	#ifdef RE_ENABLE_I18N
1360	dfa->nodes[dfa->nodes_len].accept_mb =
1361	(type == OP_PERIOD && dfa->mb_cur_max > 1) \|\| type == COMPLEX_BRACKET;
1362	#endif
1363	dfa->nexts[dfa->nodes_len] = -1;
1364	re_node_set_init_empty (dfa->edests + dfa->nodes_len);
1365	re_node_set_init_empty (dfa->eclosures + dfa->nodes_len);
1366	return dfa->nodes_len++;
1367	}
1368
1369	static inline unsigned int
1370	internal_function
1371	calc_state_hash (const re_node_set *nodes, unsigned int context)
1372	{
1373	unsigned int hash = nodes->nelem + context;
1374	int i;
1375	for (i = 0 ; i < nodes->nelem ; i++)
1376	hash += nodes->elems[i];
1377	return hash;
1378	}
1379
1380	/* Search for the state whose node_set is equivalent to NODES.
1381	Return the pointer to the state, if we found it in the DFA.
1382	Otherwise create the new one and return it. In case of an error
1383	return NULL and set the error code in ERR.
1384	Note: - We assume NULL as the invalid state, then it is possible that
1385	return value is NULL and ERR is REG_NOERROR.
1386	- We never return non-NULL value in case of any errors, it is for
1387	optimization. */
1388
1389	static re_dfastate_t *
1390	internal_function
1391	re_acquire_state (reg_errcode_t err, const re_dfa_t dfa,
1392	const re_node_set *nodes)
1393	{
1394	unsigned int hash;
1395	re_dfastate_t *new_state;
1396	struct re_state_table_entry *spot;
1397	int i;
1398	if (BE (nodes->nelem == 0, 0))
1399	{
1400	*err = REG_NOERROR;
1401	return NULL;
1402	}
1403	hash = calc_state_hash (nodes, 0);
1404	spot = dfa->state_table + (hash & dfa->state_hash_mask);
1405
1406	for (i = 0 ; i < spot->num ; i++)
1407	{
1408	re_dfastate_t *state = spot->array[i];
1409	if (hash != state->hash)
1410	continue;
1411	if (re_node_set_compare (&state->nodes, nodes))
1412	return state;
1413	}
1414
1415	/* There are no appropriate state in the dfa, create the new one. */
1416	new_state = create_ci_newstate (dfa, nodes, hash);
1417	if (BE (new_state == NULL, 0))
1418	*err = REG_ESPACE;
1419
1420	return new_state;
1421	}
1422
1423	/* Search for the state whose node_set is equivalent to NODES and
1424	whose context is equivalent to CONTEXT.
1425	Return the pointer to the state, if we found it in the DFA.
1426	Otherwise create the new one and return it. In case of an error
1427	return NULL and set the error code in ERR.
1428	Note: - We assume NULL as the invalid state, then it is possible that
1429	return value is NULL and ERR is REG_NOERROR.
1430	- We never return non-NULL value in case of any errors, it is for
1431	optimization. */
1432
1433	static re_dfastate_t *
1434	internal_function
1435	re_acquire_state_context (reg_errcode_t err, const re_dfa_t dfa,
1436	const re_node_set *nodes, unsigned int context)
1437	{
1438	unsigned int hash;
1439	re_dfastate_t *new_state;
1440	struct re_state_table_entry *spot;
1441	int i;
1442	if (nodes->nelem == 0)
1443	{
1444	*err = REG_NOERROR;
1445	return NULL;
1446	}
1447	hash = calc_state_hash (nodes, context);
1448	spot = dfa->state_table + (hash & dfa->state_hash_mask);
1449
1450	for (i = 0 ; i < spot->num ; i++)
1451	{
1452	re_dfastate_t *state = spot->array[i];
1453	if (state->hash == hash
1454	&& state->context == context
1455	&& re_node_set_compare (state->entrance_nodes, nodes))
1456	return state;
1457	}
1458	/* There are no appropriate state in `dfa', create the new one. */
1459	new_state = create_cd_newstate (dfa, nodes, context, hash);
1460	if (BE (new_state == NULL, 0))
1461	*err = REG_ESPACE;
1462
1463	return new_state;
1464	}
1465
1466	/* Finish initialization of the new state NEWSTATE, and using its hash value
1467	HASH put in the appropriate bucket of DFA's state table. Return value
1468	indicates the error code if failed. */
1469
1470	static reg_errcode_t
1471	register_state (const re_dfa_t dfa, re_dfastate_t newstate,
1472	unsigned int hash)
1473	{
1474	struct re_state_table_entry *spot;
1475	reg_errcode_t err;
1476	int i;
1477
1478	newstate->hash = hash;
1479	err = re_node_set_alloc (&newstate->non_eps_nodes, newstate->nodes.nelem);
1480	if (BE (err != REG_NOERROR, 0))
1481	return REG_ESPACE;
1482	for (i = 0; i < newstate->nodes.nelem; i++)
1483	{
1484	int elem = newstate->nodes.elems[i];
1485	if (!IS_EPSILON_NODE (dfa->nodes[elem].type))
1486	re_node_set_insert_last (&newstate->non_eps_nodes, elem);
1487	}
1488
1489	spot = dfa->state_table + (hash & dfa->state_hash_mask);
1490	if (BE (spot->alloc <= spot->num, 0))
1491	{
1492	int new_alloc = 2 * spot->num + 2;
1493	re_dfastate_t *new_array = re_realloc (spot->array, re_dfastate_t ,
1494	new_alloc);
1495	if (BE (new_array == NULL, 0))
1496	return REG_ESPACE;
1497	spot->array = new_array;
1498	spot->alloc = new_alloc;
1499	}
1500	spot->array[spot->num++] = newstate;
1501	return REG_NOERROR;
1502	}
1503
1504	static void
1505	free_state (re_dfastate_t *state)
1506	{
1507	re_node_set_free (&state->non_eps_nodes);
1508	re_node_set_free (&state->inveclosure);
1509	if (state->entrance_nodes != &state->nodes)
1510	{
1511	re_node_set_free (state->entrance_nodes);
1512	re_free (state->entrance_nodes);
1513	}
1514	re_node_set_free (&state->nodes);
1515	re_free (state->word_trtable);
1516	re_free (state->trtable);
1517	re_free (state);
1518	}
1519
1520	/* Create the new state which is independ of contexts.
1521	Return the new state if succeeded, otherwise return NULL. */
1522
1523	static re_dfastate_t *
1524	internal_function
1525	create_ci_newstate (const re_dfa_t dfa, const re_node_set nodes,
1526	unsigned int hash)
1527	{
1528	int i;
1529	reg_errcode_t err;
1530	re_dfastate_t *newstate;
1531
1532	newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1533	if (BE (newstate == NULL, 0))
1534	return NULL;
1535	err = re_node_set_init_copy (&newstate->nodes, nodes);
1536	if (BE (err != REG_NOERROR, 0))
1537	{
1538	re_free (newstate);
1539	return NULL;
1540	}
1541
1542	newstate->entrance_nodes = &newstate->nodes;
1543	for (i = 0 ; i < nodes->nelem ; i++)
1544	{
1545	re_token_t *node = dfa->nodes + nodes->elems[i];
1546	re_token_type_t type = node->type;
1547	if (type == CHARACTER && !node->constraint)
1548	continue;
1549	#ifdef RE_ENABLE_I18N
1550	newstate->accept_mb \|= node->accept_mb;
1551	#endif /* RE_ENABLE_I18N */
1552
1553	/* If the state has the halt node, the state is a halt state. */
1554	if (type == END_OF_RE)
1555	newstate->halt = 1;
1556	else if (type == OP_BACK_REF)
1557	newstate->has_backref = 1;
1558	else if (type == ANCHOR \|\| node->constraint)
1559	newstate->has_constraint = 1;
1560	}
1561	err = register_state (dfa, newstate, hash);
1562	if (BE (err != REG_NOERROR, 0))
1563	{
1564	free_state (newstate);
1565	newstate = NULL;
1566	}
1567	return newstate;
1568	}
1569
1570	/* Create the new state which is depend on the context CONTEXT.
1571	Return the new state if succeeded, otherwise return NULL. */
1572
1573	static re_dfastate_t *
1574	internal_function
1575	create_cd_newstate (const re_dfa_t dfa, const re_node_set nodes,
1576	unsigned int context, unsigned int hash)
1577	{
1578	int i, nctx_nodes = 0;
1579	reg_errcode_t err;
1580	re_dfastate_t *newstate;
1581
1582	newstate = (re_dfastate_t *) calloc (sizeof (re_dfastate_t), 1);
1583	if (BE (newstate == NULL, 0))
1584	return NULL;
1585	err = re_node_set_init_copy (&newstate->nodes, nodes);
1586	if (BE (err != REG_NOERROR, 0))
1587	{
1588	re_free (newstate);
1589	return NULL;
1590	}
1591
1592	newstate->context = context;
1593	newstate->entrance_nodes = &newstate->nodes;
1594
1595	for (i = 0 ; i < nodes->nelem ; i++)
1596	{
1597	unsigned int constraint = 0;
1598	re_token_t *node = dfa->nodes + nodes->elems[i];
1599	re_token_type_t type = node->type;
1600	if (node->constraint)
1601	constraint = node->constraint;
1602
1603	if (type == CHARACTER && !constraint)
1604	continue;
1605	#ifdef RE_ENABLE_I18N
1606	newstate->accept_mb \|= node->accept_mb;
1607	#endif /* RE_ENABLE_I18N */
1608
1609	/* If the state has the halt node, the state is a halt state. */
1610	if (type == END_OF_RE)
1611	newstate->halt = 1;
1612	else if (type == OP_BACK_REF)
1613	newstate->has_backref = 1;
1614	else if (type == ANCHOR)
1615	constraint = node->opr.ctx_type;
1616
1617	if (constraint)
1618	{
1619	if (newstate->entrance_nodes == &newstate->nodes)
1620	{
1621	newstate->entrance_nodes = re_malloc (re_node_set, 1);
1622	if (BE (newstate->entrance_nodes == NULL, 0))
1623	{
1624	free_state (newstate);
1625	return NULL;
1626	}
1627	re_node_set_init_copy (newstate->entrance_nodes, nodes);
1628	nctx_nodes = 0;
1629	newstate->has_constraint = 1;
1630	}
1631
1632	if (NOT_SATISFY_PREV_CONSTRAINT (constraint,context))
1633	{
1634	re_node_set_remove_at (&newstate->nodes, i - nctx_nodes);
1635	++nctx_nodes;
1636	}
1637	}
1638	}
1639	err = register_state (dfa, newstate, hash);
1640	if (BE (err != REG_NOERROR, 0))
1641	{
1642	free_state (newstate);
1643	newstate = NULL;
1644	}
1645	return newstate;
1646	}

Note: See TracBrowser for help on using the repository browser.

source: kBuild/vendor/sed/4.1.5/lib/regex_internal.c@ 3138

Download in other formats: