SPENCER.tests

Last change on this file was 599, checked in by bird, 18 years ago
GNU sed 4.1.5.
File size: 14.1 KB

Line
1	# regular expression test set
2	# Lines are at least three fields, separated by one or more tabs. "" stands
3	# for an empty field. First field is an RE. Second field is flags. If
4	# C flag given, regcomp() is expected to fail, and the third field is the
5	# error name (minus the leading REG_).
6	#
7	# Otherwise it is expected to succeed, and the third field is the string to
8	# try matching it against. If there is no fourth field, the match is
9	# expected to fail. If there is a fourth field, it is the substring that
10	# the RE is expected to match. If there is a fifth field, it is a comma-
11	# separated list of what the subexpressions should match, with - indicating
12	# no match for that one. In both the fourth and fifth fields, a (sub)field
13	# starting with @ indicates that the (sub)expression is expected to match
14	# a null string followed by the stuff after the @; this provides a way to
15	# test where null strings match. The character `N' in REs and strings
16	# is newline, `S' is space, `T' is tab, `Z' is NUL.
17	#
18	# The full list of flags:
19	# - placeholder, does nothing
20	# b RE is a BRE, not an ERE
21	# & try it as both an ERE and a BRE
22	# C regcomp() error expected, third field is error name
23	# i REG_ICASE
24	# m ("mundane") REG_NOSPEC
25	# s REG_NOSUB (not really testable)
26	# n REG_NEWLINE
27	# ^ REG_NOTBOL
28	# $ REG_NOTEOL
29	# # REG_STARTEND (see below)
30	# p REG_PEND
31	#
32	# For REG_STARTEND, the start/end offsets are those of the substring
33	# enclosed in ().
34
35	# basics
36	a & a a
37	abc & abc abc
38	abc\|de - abc abc
39	a\|b\|c - abc a
40
41	# parentheses and perversions thereof
42	a(b)c - abc abc
43	a$b$c b abc abc
44	a( C EPAREN
45	a( b a( a(
46	a\( - a( a(
47	a\( bC EPAREN
48	a\(b bC EPAREN
49	a(b C EPAREN
50	a(b b a(b a(b
51	# gag me with a right parenthesis -- 1003.2 goofed here (my fault, partly)
52	a) - a) a)
53	) - ) )
54	# end gagging (in a just world, those should give EPAREN)
55	a) b a) a)
56	a\) bC EPAREN
57	\) bC EPAREN
58	a()b - ab ab
59	ab b ab ab
60
61	# anchoring and REG_NEWLINE
62	^abc$ & abc abc
63	a^b - a^b
64	a^b b a^b a^b
65	a$b - a$b
66	a$b b a$b a$b
67	^ & abc @abc
68	$ & abc @
69	^$ & "" @
70	$^ - "" @
71	$$$$^$ b "" @
72	# stop retching, those are legitimate (although disgusting)
73	^^ - "" @
74	$$ - "" @
75	b$ & abNc
76	b$ &n abNc b
77	^b$ & aNbNc
78	^b$ &n aNbNc b
79	^$ &n aNNb @Nb
80	^$ n abc
81	^$ n abcN @
82	$^ n aNNb @Nb
83	$$$$^$ bn aNNb @Nb
84	^^ n^ aNNb @Nb
85	$$ n aNNb @NN
86	^a ^ a
87	a$ $ a
88	^a ^n aNb
89	^b ^n aNb b
90	a$ $n bNa
91	b$ $n bNa b
92	a(^b$)c - b b
93	a$^b$$c b b b
94
95	# certain syntax errors and non-errors
96	\| C EMPTY
97	\| b \| \|
98	* C BADRPT
99	* b * *
100	+ C BADRPT
101	? C BADRPT
102	"" &C EMPTY
103	() - abc @abc
104	b abc @abc
105	a\|\|b C EMPTY
106	\|ab C EMPTY
107	ab\| C EMPTY
108	(\|a)b C EMPTY
109	(a\|)b C EMPTY
110	(*a) C BADRPT
111	(+a) C BADRPT
112	(?a) C BADRPT
113	({1}a) C BADRPT
114	$\{1\}a$ bC BADRPT
115	(a\|*b) C BADRPT
116	(a\|+b) C BADRPT
117	(a\|?b) C BADRPT
118	(a\|{1}b) C BADRPT
119	^* C BADRPT
120	^* b * *
121	^+ C BADRPT
122	^? C BADRPT
123	^{1} C BADRPT
124	^\{1\} bC BADRPT
125
126	# metacharacters, backslashes
127	a.c & abc abc
128	a[bc]d & abd abd
129	a\c & ac a*c
130	a\\b & a\b a\b
131	a\\\b & a\b a\*b
132	# The following test is wrong. Using \b in a BRE or ERE is undefined.
133	# a\bc & abc abc
134	a\ &C EESCAPE
135	a\\bc & a\bc a\bc
136	\{ bC BADRPT
137	a\[b & a[b a[b
138	a[b &C EBRACK
139	# trailing $ is a peculiar special case for the BRE code
140	a$ & a a
141	a$ & a$
142	a\$ & a
143	a\$ & a$ a$
144	a\\$ & a
145	a\\$ & a$
146	a\\$ & a\$
147	a\\$ & a\ a\
148
149	# back references, ugh
150	a$b$\2c bC ESUBREG
151	a$b\1$c bC ESUBREG
152	a$b*$c\1d b abbcbbd abbcbbd bb
153	a$b*$c\1d b abbcbd
154	a$b*$c\1d b abbcbbbd
155	^$.$\1 b abc
156	a$[bc]$\1d b abcdabbd abbd b
157	a$\([bc]$\2\)*d b abbccd abbccd
158	a$\([bc]$\2\)*d b abbcbd
159	# actually, this next one probably ought to fail, but the spec is unclear
160	a$\(b$\2\)d b abbbd abbbd
161	# here is a case that no NFA implementation does right
162	$ab$[ab]\1 b ababaaa ababaaa a
163	# check out normal matching in the presence of back refs
164	$a$\1bcd b aabcd aabcd
165	$a$\1bc*d b aabcd aabcd
166	$a$\1bc*d b aabd aabd
167	$a$\1bc*d b aabcccd aabcccd
168	$a$\1bc*[ce]d b aabcccd aabcccd
169	^$a$\1b$c$*cd$ b aabcccd aabcccd
170
171	# ordinary repetitions
172	ab*c & abc abc
173	ab+c - abc abc
174	ab?c - abc abc
175	a$$b b ab a*b
176	a$**$b b ab ab
177	a$***$b bC BADRPT
178	a b a *a
179	**a b a a
180	***a bC BADRPT
181
182	# the dreaded bounded repetitions
183	# The following two tests are not correct:
184	#{ & { {
185	#{abc & {abc {abc
186	# '{' is always a special char outside bracket expressions. So test only BRE:
187	{ b { {
188	{abc b {abc {abc
189	{1 C BADRPT
190	{1} C BADRPT
191	# Same reason as for the two tests above:
192	#a{b & a{b a{b
193	a{b b a{b a{b
194	a{1}b - ab ab
195	a\{1\}b b ab ab
196	a{1,}b - ab ab
197	a\{1,\}b b ab ab
198	a{1,2}b - aab aab
199	a\{1,2\}b b aab aab
200	a{1 C EBRACE
201	a\{1 bC EBRACE
202	a{1a C EBRACE
203	a\{1a bC EBRACE
204	a{1a} C BADBR
205	a\{1a\} bC BADBR
206	# These four tests check for undefined behavior. Our implementation does
207	# something different.
208	#a{,2} - a{,2} a{,2}
209	#a\{,2\} bC BADBR
210	#a{,} - a{,} a{,}
211	#a\{,\} bC BADBR
212	a{1,x} C BADBR
213	a\{1,x\} bC BADBR
214	a{1,x C EBRACE
215	a\{1,x bC EBRACE
216	# These two tests probably fail due to an arbitrary limit on the number of
217	# repetitions in the other implementation.
218	#a{300} C BADBR
219	#a\{300\} bC BADBR
220	a{1,0} C BADBR
221	a\{1,0\} bC BADBR
222	ab{0,0}c - abcac ac
223	ab\{0,0\}c b abcac ac
224	ab{0,1}c - abcac abc
225	ab\{0,1\}c b abcac abc
226	ab{0,3}c - abbcac abbc
227	ab\{0,3\}c b abbcac abbc
228	ab{1,1}c - acabc abc
229	ab\{1,1\}c b acabc abc
230	ab{1,3}c - acabc abc
231	ab\{1,3\}c b acabc abc
232	ab{2,2}c - abcabbc abbc
233	ab\{2,2\}c b abcabbc abbc
234	ab{2,4}c - abcabbc abbc
235	ab\{2,4\}c b abcabbc abbc
236	((a{1,10}){1,10}){1,10} - a a a,a
237
238	# multiple repetitions
239	# Wow, there is a serious disconnect here. The ERE grammar is like this:
240	# ERE_expression : one_char_or_coll_elem_ERE
241	#                | '^'
242	#                | '$'
243	#                | '(' extended_reg_exp ')'
244	#                | ERE_expression ERE_dupl_symbol
245	#                ;
246	# where ERE_dupl_symbol is any of the repetition methods. It is clear from
247	# this that consecutive repetition is OK. On top of this, the one test not
248	# marked as failing must fail. For BREs the situation is different, so we
249	# use the four tests.
250	#a** &C BADRPT
251	a** bC BADRPT
252	#a++ C BADRPT
253	#a?? C BADRPT
254	#a*+ C BADRPT
255	#a*? C BADRPT
256	#a+* C BADRPT
257	#a+? C BADRPT
258	#a?* C BADRPT
259	#a?+ C BADRPT
260	#a{1}{1} C BADRPT
261	#a*{1} C BADRPT
262	#a+{1} C BADRPT
263	#a?{1} C BADRPT
264	#a{1}* C BADRPT
265	#a{1}+ C BADRPT
266	#a{1}? C BADRPT
267	#a*{b} - a{b} a{b}
268	a\{1\}\{1\} bC BADRPT
269	a*\{1\} bC BADRPT
270	a\{1\}* bC BADRPT
271
272	# brackets, and numerous perversions thereof
273	a[b]c & abc abc
274	a[ab]c & abc abc
275	a[^ab]c & adc adc
276	a[]b]c & a]c a]c
277	a[[b]c & a[c a[c
278	a[-b]c & a-c a-c
279	a[^]b]c & adc adc
280	a[^-b]c & adc adc
281	a[b-]c & a-c a-c
282	a[b &C EBRACK
283	a[] &C EBRACK
284	a[1-3]c & a2c a2c
285	a[3-1]c &C ERANGE
286	a[1-3-5]c &C ERANGE
287	a[[.-.]--]c & a-c a-c
288	# I don't think the error value should be ERANGE since a[1-] would be
289	# valid, too. Expect EBRACK.
290	#a[1- &C ERANGE
291	a[1- &C EBRACK
292	a[[. &C EBRACK
293	a[[.x &C EBRACK
294	a[[.x. &C EBRACK
295	a[[.x.] &C EBRACK
296	a[[.x.]] & ax ax
297	a[[.x,.]] &C ECOLLATE
298	# This test is invalid. "one" is not a collating symbol in any standardized
299	# locale.
300	# a[[.one.]]b & a1b a1b
301	a[[.notdef.]]b &C ECOLLATE
302	a[[.].]]b & a]b a]b
303	a[[:alpha:]]c & abc abc
304	a[[:notdef:]]c &C ECTYPE
305	a[[: &C EBRACK
306	a[[:alpha &C EBRACK
307	a[[:alpha:] &C EBRACK
308	a[[:alpha,:] &C ECTYPE
309	a[[:]:]]b &C ECTYPE
310	a[[:-:]]b &C ECTYPE
311	a[[:alph:]] &C ECTYPE
312	a[[:alphabet:]] &C ECTYPE
313	[[:alnum:]]+ - -%@a0X- a0X
314	[[:alpha:]]+ - -%@aX0- aX
315	[[:blank:]]+ - aSSTb SST
316	[[:cntrl:]]+ - aNTb NT
317	[[:digit:]]+ - a019b 019
318	[[:graph:]]+ - Sa%bS a%b
319	[[:lower:]]+ - AabC ab
320	[[:print:]]+ - NaSbN aSb
321	[[:punct:]]+ - S%-&T %-&
322	[[:space:]]+ - aSNTb SNT
323	[[:upper:]]+ - aBCd BC
324	[[:xdigit:]]+ - p0f3Cq 0f3C
325	a[[=b=]]c & abc abc
326	a[[= &C EBRACK
327	a[[=b &C EBRACK
328	a[[=b= &C EBRACK
329	a[[=b=] &C EBRACK
330	a[[=b,=]] &C ECOLLATE
331	# This test is invalid. "one" is not a collating symbol in any standardized
332	# locale.
333	#a[[=one=]]b & a1b a1b
334
335	# complexities
336	a(((b)))c - abc abc
337	a(b\|(c))d - abd abd
338	a(b*\|c)d - abbd abbd
339	# just gotta have one DFA-buster, of course
340	a[ab]{20} - aaaaabaaaabaaaabaaaab aaaaabaaaabaaaabaaaab
341	# and an inline expansion in case somebody gets tricky
342	a[ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab] - aaaaabaaaabaaaabaaaab aaaaabaaaabaaaabaaaab
343	# and in case somebody just slips in an NFA...
344	a[ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab][ab](wee\|week)(knights\|night) - aaaaabaaaabaaaabaaaabweeknights aaaaabaaaabaaaabaaaabweeknights
345	# fish for anomalies as the number of states passes 32
346	12345678901234567890123456789 - a12345678901234567890123456789b 12345678901234567890123456789
347	123456789012345678901234567890 - a123456789012345678901234567890b 123456789012345678901234567890
348	1234567890123456789012345678901 - a1234567890123456789012345678901b 1234567890123456789012345678901
349	12345678901234567890123456789012 - a12345678901234567890123456789012b 12345678901234567890123456789012
350	123456789012345678901234567890123 - a123456789012345678901234567890123b 123456789012345678901234567890123
351	# and one really big one, beyond any plausible word width
352	1234567890123456789012345678901234567890123456789012345678901234567890 - a1234567890123456789012345678901234567890123456789012345678901234567890b 1234567890123456789012345678901234567890123456789012345678901234567890
353	# fish for problems as brackets go past 8
354	[ab][cd][ef][gh][ij][kl][mn] - xacegikmoq acegikm
355	[ab][cd][ef][gh][ij][kl][mn][op] - xacegikmoq acegikmo
356	[ab][cd][ef][gh][ij][kl][mn][op][qr] - xacegikmoqy acegikmoq
357	[ab][cd][ef][gh][ij][kl][mn][op][q] - xacegikmoqy acegikmoq
358
359	# subtleties of matching
360	abc & xabcy abc
361	a$b$?c\1d b acd
362	aBc i Abc Abc
363	a[Bc]*d i abBCcd abBCcd
364	0[[:upper:]]1 &i 0a1 0a1
365	0[[:lower:]]1 &i 0A1 0A1
366	a[^b]c &i abc
367	a[^b]c &i aBc
368	a[^b]c &i adc adc
369	[a]b[c] - abc abc
370	[a]b[a] - aba aba
371	[abc]b[abc] - abc abc
372	[abc]b[abd] - abd abd
373	a(b?c)+d - accd accd
374	(wee\|week)(knights\|night) - weeknights weeknights
375	(we\|wee\|week\|frob)(knights\|night\|day) - weeknights weeknights
376	a[bc]d - xyzaaabcaababdacd abd
377	a[ab]c - aaabc abc
378	abc s abc abc
379	() s abc @abc
380	a* & b @b
381
382	# Let's have some fun -- try to match a C comment.
383	# first the obvious, which looks okay at first glance...
384	/\.\/ - /x/ /x*/
385	# but...
386	/\.\/ - /x/y/z/ /x/y/z*/
387	# okay, we must not match */ inside; try to do that...
388	/\([^]\|\[^/])\/ - /x/ /x*/
389	/\([^]\|\[^/])\/ - /x/y/z/ /x*/
390	# but...
391	/\([^]\|\[^/])\/ - /x*/y/z/ /x*/y/z*/
392	# and a still fancier version, which does it right (I think)...
393	/\([^]\|\+[^/])\+/ - /x/ /x/
394	/\([^]\|\+[^/])\+/ - /x/y/z/ /x/
395	/\([^]\|\+[^/])\+/ - /x/y/z/ /x**/
396	/\([^]\|\+[^/])\+/ - /x**/y/z/ /x****/
397	/\([^]\|\+[^/])\+/ - /xx/y/z/ /xx/
398	/\([^]\|\+[^/])\+/ - /x*x/y/z/ /x**x/y/z*/
399
400	# subexpressions
401	.* - abc abc -
402	a(b)(c)d - abcd abcd b,c
403	a(((b)))c - abc abc b,b,b
404	a(b\|(c))d - abd abd b,-
405	a(b*\|c\|e)d - abbd abbd bb
406	a(b*\|c\|e)d - acd acd c
407	a(b*\|c\|e)d - ad ad @d
408	a(b?)c - abc abc b
409	a(b?)c - ac ac @c
410	a(b+)c - abc abc b
411	a(b+)c - abbbc abbbc bbb
412	a(b*)c - ac ac @c
413	(a\|ab)(bc([de]+)f\|cde) - abcdef abcdef a,bcdef,de
414	# the regression tester only asks for 9 subexpressions
415	a(b)(c)(d)(e)(f)(g)(h)(i)(j)k - abcdefghijk abcdefghijk b,c,d,e,f,g,h,i,j
416	a(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)l - abcdefghijkl abcdefghijkl b,c,d,e,f,g,h,i,j,k
417	a([bc]?)c - abc abc b
418	a([bc]?)c - ac ac @c
419	a([bc]+)c - abc abc b
420	a([bc]+)c - abcc abcc bc
421	a([bc]+)bc - abcbc abcbc bc
422	a(bb+\|b)b - abb abb b
423	a(bbb+\|bb+\|b)b - abb abb b
424	a(bbb+\|bb+\|b)b - abbb abbb bb
425	a(bbb+\|bb+\|b)bb - abbb abbb b
426	(.). - abcdef abcdef abcdef
427	(a) - bc @b @b
428
429	# do we get the right subexpression when it is used more than once?
430	a(b\|c)*d - ad ad -
431	a(b\|c)*d - abcd abcd c
432	a(b\|c)+d - abd abd b
433	a(b\|c)+d - abcd abcd c
434	a(b\|c?)+d - ad ad @d
435	a(b\|c?)+d - abcd abcd c
436	a(b\|c){0,0}d - ad ad -
437	a(b\|c){0,1}d - ad ad -
438	a(b\|c){0,1}d - abd abd b
439	a(b\|c){0,2}d - ad ad -
440	a(b\|c){0,2}d - abcd abcd c
441	a(b\|c){0,}d - ad ad -
442	a(b\|c){0,}d - abcd abcd c
443	a(b\|c){1,1}d - abd abd b
444	a(b\|c){1,1}d - acd acd c
445	a(b\|c){1,2}d - abd abd b
446	a(b\|c){1,2}d - abcd abcd c
447	a(b\|c){1,}d - abd abd b
448	a(b\|c){1,}d - abcd abcd c
449	a(b\|c){2,2}d - acbd acbd b
450	a(b\|c){2,2}d - abcd abcd c
451	a(b\|c){2,4}d - abcd abcd c
452	a(b\|c){2,4}d - abcbd abcbd b
453	a(b\|c){2,4}d - abcbcd abcbcd c
454	a(b\|c){2,}d - abcd abcd c
455	a(b\|c){2,}d - abcbd abcbd b
456	a(b+\|((c)*))+d - abd abd b,-,-
457	a(b+\|((c)*))+d - abcd abcd c,c,c
458
459	# check out the STARTEND option
460	[abc] &# a(b)c b
461	[abc] &# a(d)c
462	[abc] &# a(bc)d b
463	[abc] &# a(dc)d c
464	. &# a()c
465	b.*c &# b(bc)c bc
466	b.* &# b(bc)c bc
467	.*c &# b(bc)c bc
468
469	# plain strings, with the NOSPEC flag
470	abc m abc abc
471	abc m xabcy abc
472	abc m xyz
473	ab m abab a*b
474	a*b m ab
475	"" mC EMPTY
476
477	# cases involving NULs
478	aZb & a a
479	aZb &p a
480	aZb &p# (aZb) aZb
481	aZ*b &p# (ab) ab
482	a.b &# (aZb) aZb
483	a.* &# (aZb)c aZb
484
485	# word boundaries (ick)
486	[[:<:]]a & a a
487	[[:<:]]a & ba
488	[[:<:]]a & -a a
489	a[[:>:]] & a a
490	a[[:>:]] & ab
491	a[[:>:]] & a- a
492	[[:<:]]a.c[[:>:]] & axcd-dayc-dazce-abc abc
493	[[:<:]]a.c[[:>:]] & axcd-dayc-dazce-abc-q abc
494	[[:<:]]a.c[[:>:]] & axc-dayc-dazce-abc axc
495	[[:<:]]b.c[[:>:]] & a_bxc-byc_d-bzc-q bzc
496	[[:<:]].x..[[:>:]] & y_xa_-_xb_y-_xc_-axdc _xc_
497	[[:<:]]a_b[[:>:]] & x_a_b
498
499	# past problems, and suspected problems
500	(A[1])\|(A[2])\|(A[3])\|(A[4])\|(A[5])\|(A[6])\|(A[7])\|(A[8])\|(A[9])\|(A[A]) - A1 A1
501	abcdefghijklmnop i abcdefghijklmnop abcdefghijklmnop
502	abcdefghijklmnopqrstuv i abcdefghijklmnopqrstuv abcdefghijklmnopqrstuv
503	(ALAK)\|(ALT[AB])\|(CC[123]1)\|(CM[123]1)\|(GAMC)\|(LC[23][EO ])\|(SEM[1234])\|(SL[ES][12])\|(SLWW)\|(SLF )\|(SLDT)\|(VWH[12])\|(WH[34][EW])\|(WP1[ESN]) - CC11 CC11
504	CC[13]1\|a{21}[23][EO][123][Es][12]a{15}aa[34][EW]aaaaaaa[X]a - CC11 CC11
505	Char $[a-z0-9_]$\[. b Char xyz[k Char xyz[k xyz
506	a?b - ab ab
507	-\{0,1\}[0-9]*$ b -5 -5
508	aaaaaaa* & aaaaaa aaaaaa
509	(\b){0} - x @x -
510	$\b$\{0,0\} b abc @abc -
511	a(\b){0}c - ac ac -
512	a(.*)b(\1){0}c - abc abc @bc,-
513	a(.*)b(\1){0}c - axbc axbc x,-
514
515	a$\(b*$\)c\1d b abbcbbd abbcbbd bb,bb
516	a$\([bc]$\)\2d b abcdabbd abbd b,b
517	a$\(\(\([bc]$\)\3\)\)*d b abbccd abbccd cc,cc,c,c
518	a(b)(c)d - abcd abcd b,c
519	a(((b)))c - abc abc b,b,b
520	a(((b\|(((c))))))d - abd abd b,b,b,-,-,-
521	a(((b*\|c\|e)))d - abbd abbd bb,bb,bb
522	a((b\|c)){0,0}d - ad ad -,-
523	a((b\|c)){0,1}d - abd abd b,b
524	a((b\|c)){0,2}d - abcd abcd c,c
525	a((b+\|((c)*)))+d - abd abd b,b,-,-
526	a((b+\|((c)*)))+d - abcd abcd c,c,c,c
527	(((\b))){0} - x @x -,-,-
528	a(((.*)))b((\2)){0}c - abc abc @bc,@bc,@bc,-,-
529	a(((.*)))b((\1)){0}c - axbc axbc x,x,x,-,-
530
531	\b & SaT @aT
532	\b & aT @aT
533	a.*\b & abT ab
534	\b & STSS
535	\B & abc @bc
536	\B & aSbTc
537	\B & SaT @SaT
538	\B & aSTSb @TSb

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

Original Format