VirtualBox

source: vbox/trunk/src/libs/libxml2-2.13.2/xmlregexp.c@ 105770

Last change on this file since 105770 was 105420, checked in by vboxsync, 4 months ago

libxml2-2.12.6: Applied and adjusted our libxml2 changes to 2.12.6. bugref:10730

  • Property svn:eol-style set to native
File size: 211.8 KB
Line 
1/*
2 * regexp.c: generic and extensible Regular Expression engine
3 *
4 * Basically designed with the purpose of compiling regexps for
5 * the variety of validation/schemas mechanisms now available in
6 * XML related specifications these include:
7 * - XML-1.0 DTD validation
8 * - XML Schemas structure part 1
9 * - XML Schemas Datatypes part 2 especially Appendix F
10 * - RELAX-NG/TREX i.e. the counter proposal
11 *
12 * See Copyright for the status of this software.
13 *
14 * Daniel Veillard <veillard@redhat.com>
15 */
16
17#define IN_LIBXML
18#include "libxml.h"
19
20#ifdef LIBXML_REGEXP_ENABLED
21
22#include <stdio.h>
23#include <string.h>
24#include <limits.h>
25
26#include <libxml/tree.h>
27#include <libxml/parserInternals.h>
28#include <libxml/xmlregexp.h>
29#include <libxml/xmlautomata.h>
30#include <libxml/xmlunicode.h>
31
32#include "private/error.h"
33#include "private/regexp.h"
34
35#ifndef SIZE_MAX
36#define SIZE_MAX ((size_t) -1)
37#endif
38
39#define MAX_PUSH 10000000
40
41#ifdef ERROR
42#undef ERROR
43#endif
44#define ERROR(str) \
45 ctxt->error = XML_REGEXP_COMPILE_ERROR; \
46 xmlRegexpErrCompile(ctxt, str);
47#define NEXT ctxt->cur++
48#define CUR (*(ctxt->cur))
49#define NXT(index) (ctxt->cur[index])
50
51#define NEXTL(l) ctxt->cur += l;
52#define XML_REG_STRING_SEPARATOR '|'
53/*
54 * Need PREV to check on a '-' within a Character Group. May only be used
55 * when it's guaranteed that cur is not at the beginning of ctxt->string!
56 */
57#define PREV (ctxt->cur[-1])
58
59/************************************************************************
60 * *
61 * Datatypes and structures *
62 * *
63 ************************************************************************/
64
65/*
66 * Note: the order of the enums below is significant, do not shuffle
67 */
68typedef enum {
69 XML_REGEXP_EPSILON = 1,
70 XML_REGEXP_CHARVAL,
71 XML_REGEXP_RANGES,
72 XML_REGEXP_SUBREG, /* used for () sub regexps */
73 XML_REGEXP_STRING,
74 XML_REGEXP_ANYCHAR, /* . */
75 XML_REGEXP_ANYSPACE, /* \s */
76 XML_REGEXP_NOTSPACE, /* \S */
77 XML_REGEXP_INITNAME, /* \l */
78 XML_REGEXP_NOTINITNAME, /* \L */
79 XML_REGEXP_NAMECHAR, /* \c */
80 XML_REGEXP_NOTNAMECHAR, /* \C */
81 XML_REGEXP_DECIMAL, /* \d */
82 XML_REGEXP_NOTDECIMAL, /* \D */
83 XML_REGEXP_REALCHAR, /* \w */
84 XML_REGEXP_NOTREALCHAR, /* \W */
85 XML_REGEXP_LETTER = 100,
86 XML_REGEXP_LETTER_UPPERCASE,
87 XML_REGEXP_LETTER_LOWERCASE,
88 XML_REGEXP_LETTER_TITLECASE,
89 XML_REGEXP_LETTER_MODIFIER,
90 XML_REGEXP_LETTER_OTHERS,
91 XML_REGEXP_MARK,
92 XML_REGEXP_MARK_NONSPACING,
93 XML_REGEXP_MARK_SPACECOMBINING,
94 XML_REGEXP_MARK_ENCLOSING,
95 XML_REGEXP_NUMBER,
96 XML_REGEXP_NUMBER_DECIMAL,
97 XML_REGEXP_NUMBER_LETTER,
98 XML_REGEXP_NUMBER_OTHERS,
99 XML_REGEXP_PUNCT,
100 XML_REGEXP_PUNCT_CONNECTOR,
101 XML_REGEXP_PUNCT_DASH,
102 XML_REGEXP_PUNCT_OPEN,
103 XML_REGEXP_PUNCT_CLOSE,
104 XML_REGEXP_PUNCT_INITQUOTE,
105 XML_REGEXP_PUNCT_FINQUOTE,
106 XML_REGEXP_PUNCT_OTHERS,
107 XML_REGEXP_SEPAR,
108 XML_REGEXP_SEPAR_SPACE,
109 XML_REGEXP_SEPAR_LINE,
110 XML_REGEXP_SEPAR_PARA,
111 XML_REGEXP_SYMBOL,
112 XML_REGEXP_SYMBOL_MATH,
113 XML_REGEXP_SYMBOL_CURRENCY,
114 XML_REGEXP_SYMBOL_MODIFIER,
115 XML_REGEXP_SYMBOL_OTHERS,
116 XML_REGEXP_OTHER,
117 XML_REGEXP_OTHER_CONTROL,
118 XML_REGEXP_OTHER_FORMAT,
119 XML_REGEXP_OTHER_PRIVATE,
120 XML_REGEXP_OTHER_NA,
121 XML_REGEXP_BLOCK_NAME
122} xmlRegAtomType;
123
124typedef enum {
125 XML_REGEXP_QUANT_EPSILON = 1,
126 XML_REGEXP_QUANT_ONCE,
127 XML_REGEXP_QUANT_OPT,
128 XML_REGEXP_QUANT_MULT,
129 XML_REGEXP_QUANT_PLUS,
130 XML_REGEXP_QUANT_ONCEONLY,
131 XML_REGEXP_QUANT_ALL,
132 XML_REGEXP_QUANT_RANGE
133} xmlRegQuantType;
134
135typedef enum {
136 XML_REGEXP_START_STATE = 1,
137 XML_REGEXP_FINAL_STATE,
138 XML_REGEXP_TRANS_STATE,
139 XML_REGEXP_SINK_STATE,
140 XML_REGEXP_UNREACH_STATE
141} xmlRegStateType;
142
143typedef enum {
144 XML_REGEXP_MARK_NORMAL = 0,
145 XML_REGEXP_MARK_START,
146 XML_REGEXP_MARK_VISITED
147} xmlRegMarkedType;
148
149typedef struct _xmlRegRange xmlRegRange;
150typedef xmlRegRange *xmlRegRangePtr;
151
152struct _xmlRegRange {
153 int neg; /* 0 normal, 1 not, 2 exclude */
154 xmlRegAtomType type;
155 int start;
156 int end;
157 xmlChar *blockName;
158};
159
160typedef struct _xmlRegAtom xmlRegAtom;
161typedef xmlRegAtom *xmlRegAtomPtr;
162
163typedef struct _xmlAutomataState xmlRegState;
164typedef xmlRegState *xmlRegStatePtr;
165
166struct _xmlRegAtom {
167 int no;
168 xmlRegAtomType type;
169 xmlRegQuantType quant;
170 int min;
171 int max;
172
173 void *valuep;
174 void *valuep2;
175 int neg;
176 int codepoint;
177 xmlRegStatePtr start;
178 xmlRegStatePtr start0;
179 xmlRegStatePtr stop;
180 int maxRanges;
181 int nbRanges;
182 xmlRegRangePtr *ranges;
183 void *data;
184};
185
186typedef struct _xmlRegCounter xmlRegCounter;
187typedef xmlRegCounter *xmlRegCounterPtr;
188
189struct _xmlRegCounter {
190 int min;
191 int max;
192};
193
194typedef struct _xmlRegTrans xmlRegTrans;
195typedef xmlRegTrans *xmlRegTransPtr;
196
197struct _xmlRegTrans {
198 xmlRegAtomPtr atom;
199 int to;
200 int counter;
201 int count;
202 int nd;
203};
204
205struct _xmlAutomataState {
206 xmlRegStateType type;
207 xmlRegMarkedType mark;
208 xmlRegMarkedType markd;
209 xmlRegMarkedType reached;
210 int no;
211 int maxTrans;
212 int nbTrans;
213 xmlRegTrans *trans;
214 /* knowing states pointing to us can speed things up */
215 int maxTransTo;
216 int nbTransTo;
217 int *transTo;
218};
219
220typedef struct _xmlAutomata xmlRegParserCtxt;
221typedef xmlRegParserCtxt *xmlRegParserCtxtPtr;
222
223#define AM_AUTOMATA_RNG 1
224
225struct _xmlAutomata {
226 xmlChar *string;
227 xmlChar *cur;
228
229 int error;
230 int neg;
231
232 xmlRegStatePtr start;
233 xmlRegStatePtr end;
234 xmlRegStatePtr state;
235
236 xmlRegAtomPtr atom;
237
238 int maxAtoms;
239 int nbAtoms;
240 xmlRegAtomPtr *atoms;
241
242 int maxStates;
243 int nbStates;
244 xmlRegStatePtr *states;
245
246 int maxCounters;
247 int nbCounters;
248 xmlRegCounter *counters;
249
250 int determinist;
251 int negs;
252 int flags;
253
254 int depth;
255};
256
257struct _xmlRegexp {
258 xmlChar *string;
259 int nbStates;
260 xmlRegStatePtr *states;
261 int nbAtoms;
262 xmlRegAtomPtr *atoms;
263 int nbCounters;
264 xmlRegCounter *counters;
265 int determinist;
266 int flags;
267 /*
268 * That's the compact form for determinists automatas
269 */
270 int nbstates;
271 int *compact;
272 void **transdata;
273 int nbstrings;
274 xmlChar **stringMap;
275};
276
277typedef struct _xmlRegExecRollback xmlRegExecRollback;
278typedef xmlRegExecRollback *xmlRegExecRollbackPtr;
279
280struct _xmlRegExecRollback {
281 xmlRegStatePtr state;/* the current state */
282 int index; /* the index in the input stack */
283 int nextbranch; /* the next transition to explore in that state */
284 int *counts; /* save the automata state if it has some */
285};
286
287typedef struct _xmlRegInputToken xmlRegInputToken;
288typedef xmlRegInputToken *xmlRegInputTokenPtr;
289
290struct _xmlRegInputToken {
291 xmlChar *value;
292 void *data;
293};
294
295struct _xmlRegExecCtxt {
296 int status; /* execution status != 0 indicate an error */
297 int determinist; /* did we find an indeterministic behaviour */
298 xmlRegexpPtr comp; /* the compiled regexp */
299 xmlRegExecCallbacks callback;
300 void *data;
301
302 xmlRegStatePtr state;/* the current state */
303 int transno; /* the current transition on that state */
304 int transcount; /* the number of chars in char counted transitions */
305
306 /*
307 * A stack of rollback states
308 */
309 int maxRollbacks;
310 int nbRollbacks;
311 xmlRegExecRollback *rollbacks;
312
313 /*
314 * The state of the automata if any
315 */
316 int *counts;
317
318 /*
319 * The input stack
320 */
321 int inputStackMax;
322 int inputStackNr;
323 int index;
324 int *charStack;
325 const xmlChar *inputString; /* when operating on characters */
326 xmlRegInputTokenPtr inputStack;/* when operating on strings */
327
328 /*
329 * error handling
330 */
331 int errStateNo; /* the error state number */
332 xmlRegStatePtr errState; /* the error state */
333 xmlChar *errString; /* the string raising the error */
334 int *errCounts; /* counters at the error state */
335 int nbPush;
336};
337
338#define REGEXP_ALL_COUNTER 0x123456
339#define REGEXP_ALL_LAX_COUNTER 0x123457
340
341static void xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top);
342static void xmlRegFreeState(xmlRegStatePtr state);
343static void xmlRegFreeAtom(xmlRegAtomPtr atom);
344static int xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr);
345static int xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint);
346static int xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint,
347 int neg, int start, int end, const xmlChar *blockName);
348
349/************************************************************************
350 * *
351 * Regexp memory error handler *
352 * *
353 ************************************************************************/
354/**
355 * xmlRegexpErrMemory:
356 * @extra: extra information
357 *
358 * Handle an out of memory condition
359 */
360static void
361xmlRegexpErrMemory(xmlRegParserCtxtPtr ctxt)
362{
363 if (ctxt != NULL)
364 ctxt->error = XML_ERR_NO_MEMORY;
365
366 xmlRaiseMemoryError(NULL, NULL, NULL, XML_FROM_REGEXP, NULL);
367}
368
369/**
370 * xmlRegexpErrCompile:
371 * @extra: extra information
372 *
373 * Handle a compilation failure
374 */
375static void
376xmlRegexpErrCompile(xmlRegParserCtxtPtr ctxt, const char *extra)
377{
378 const char *regexp = NULL;
379 int idx = 0;
380 int res;
381
382 if (ctxt != NULL) {
383 regexp = (const char *) ctxt->string;
384 idx = ctxt->cur - ctxt->string;
385 ctxt->error = XML_REGEXP_COMPILE_ERROR;
386 }
387
388 res = __xmlRaiseError(NULL, NULL, NULL, NULL, NULL, XML_FROM_REGEXP,
389 XML_REGEXP_COMPILE_ERROR, XML_ERR_FATAL,
390 NULL, 0, extra, regexp, NULL, idx, 0,
391 "failed to compile: %s\n", extra);
392 if (res < 0)
393 xmlRegexpErrMemory(ctxt);
394}
395
396/************************************************************************
397 * *
398 * Allocation/Deallocation *
399 * *
400 ************************************************************************/
401
402static int xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt);
403
404/**
405 * xmlRegCalloc2:
406 * @dim1: size of first dimension
407 * @dim2: size of second dimension
408 * @elemSize: size of element
409 *
410 * Allocate a two-dimensional array and set all elements to zero.
411 *
412 * Returns the new array or NULL in case of error.
413 */
414static void*
415xmlRegCalloc2(size_t dim1, size_t dim2, size_t elemSize) {
416 size_t totalSize;
417 void *ret;
418
419 /* Check for overflow */
420 if ((dim2 == 0) || (elemSize == 0) ||
421 (dim1 > SIZE_MAX / dim2 / elemSize))
422 return (NULL);
423 totalSize = dim1 * dim2 * elemSize;
424 ret = xmlMalloc(totalSize);
425 if (ret != NULL)
426 memset(ret, 0, totalSize);
427 return (ret);
428}
429
430/**
431 * xmlRegEpxFromParse:
432 * @ctxt: the parser context used to build it
433 *
434 * Allocate a new regexp and fill it with the result from the parser
435 *
436 * Returns the new regexp or NULL in case of error
437 */
438static xmlRegexpPtr
439xmlRegEpxFromParse(xmlRegParserCtxtPtr ctxt) {
440 xmlRegexpPtr ret;
441
442 ret = (xmlRegexpPtr) xmlMalloc(sizeof(xmlRegexp));
443 if (ret == NULL) {
444 xmlRegexpErrMemory(ctxt);
445 return(NULL);
446 }
447 memset(ret, 0, sizeof(xmlRegexp));
448 ret->string = ctxt->string;
449 ret->nbStates = ctxt->nbStates;
450 ret->states = ctxt->states;
451 ret->nbAtoms = ctxt->nbAtoms;
452 ret->atoms = ctxt->atoms;
453 ret->nbCounters = ctxt->nbCounters;
454 ret->counters = ctxt->counters;
455 ret->determinist = ctxt->determinist;
456 ret->flags = ctxt->flags;
457 if (ret->determinist == -1) {
458 if (xmlRegexpIsDeterminist(ret) < 0) {
459 xmlRegexpErrMemory(ctxt);
460 xmlFree(ret);
461 return(NULL);
462 }
463 }
464
465 if ((ret->determinist != 0) &&
466 (ret->nbCounters == 0) &&
467 (ctxt->negs == 0) &&
468 (ret->atoms != NULL) &&
469 (ret->atoms[0] != NULL) &&
470 (ret->atoms[0]->type == XML_REGEXP_STRING)) {
471 int i, j, nbstates = 0, nbatoms = 0;
472 int *stateRemap;
473 int *stringRemap;
474 int *transitions;
475 void **transdata;
476 xmlChar **stringMap;
477 xmlChar *value;
478
479 /*
480 * Switch to a compact representation
481 * 1/ counting the effective number of states left
482 * 2/ counting the unique number of atoms, and check that
483 * they are all of the string type
484 * 3/ build a table state x atom for the transitions
485 */
486
487 stateRemap = xmlMalloc(ret->nbStates * sizeof(int));
488 if (stateRemap == NULL) {
489 xmlRegexpErrMemory(ctxt);
490 xmlFree(ret);
491 return(NULL);
492 }
493 for (i = 0;i < ret->nbStates;i++) {
494 if (ret->states[i] != NULL) {
495 stateRemap[i] = nbstates;
496 nbstates++;
497 } else {
498 stateRemap[i] = -1;
499 }
500 }
501 stringMap = xmlMalloc(ret->nbAtoms * sizeof(char *));
502 if (stringMap == NULL) {
503 xmlRegexpErrMemory(ctxt);
504 xmlFree(stateRemap);
505 xmlFree(ret);
506 return(NULL);
507 }
508 stringRemap = xmlMalloc(ret->nbAtoms * sizeof(int));
509 if (stringRemap == NULL) {
510 xmlRegexpErrMemory(ctxt);
511 xmlFree(stringMap);
512 xmlFree(stateRemap);
513 xmlFree(ret);
514 return(NULL);
515 }
516 for (i = 0;i < ret->nbAtoms;i++) {
517 if ((ret->atoms[i]->type == XML_REGEXP_STRING) &&
518 (ret->atoms[i]->quant == XML_REGEXP_QUANT_ONCE)) {
519 value = ret->atoms[i]->valuep;
520 for (j = 0;j < nbatoms;j++) {
521 if (xmlStrEqual(stringMap[j], value)) {
522 stringRemap[i] = j;
523 break;
524 }
525 }
526 if (j >= nbatoms) {
527 stringRemap[i] = nbatoms;
528 stringMap[nbatoms] = xmlStrdup(value);
529 if (stringMap[nbatoms] == NULL) {
530 for (i = 0;i < nbatoms;i++)
531 xmlFree(stringMap[i]);
532 xmlFree(stringRemap);
533 xmlFree(stringMap);
534 xmlFree(stateRemap);
535 xmlFree(ret);
536 return(NULL);
537 }
538 nbatoms++;
539 }
540 } else {
541 xmlFree(stateRemap);
542 xmlFree(stringRemap);
543 for (i = 0;i < nbatoms;i++)
544 xmlFree(stringMap[i]);
545 xmlFree(stringMap);
546 xmlFree(ret);
547 return(NULL);
548 }
549 }
550 transitions = (int *) xmlRegCalloc2(nbstates + 1, nbatoms + 1,
551 sizeof(int));
552 if (transitions == NULL) {
553 xmlFree(stateRemap);
554 xmlFree(stringRemap);
555 for (i = 0;i < nbatoms;i++)
556 xmlFree(stringMap[i]);
557 xmlFree(stringMap);
558 xmlFree(ret);
559 return(NULL);
560 }
561
562 /*
563 * Allocate the transition table. The first entry for each
564 * state corresponds to the state type.
565 */
566 transdata = NULL;
567
568 for (i = 0;i < ret->nbStates;i++) {
569 int stateno, atomno, targetno, prev;
570 xmlRegStatePtr state;
571 xmlRegTransPtr trans;
572
573 stateno = stateRemap[i];
574 if (stateno == -1)
575 continue;
576 state = ret->states[i];
577
578 transitions[stateno * (nbatoms + 1)] = state->type;
579
580 for (j = 0;j < state->nbTrans;j++) {
581 trans = &(state->trans[j]);
582 if ((trans->to < 0) || (trans->atom == NULL))
583 continue;
584 atomno = stringRemap[trans->atom->no];
585 if ((trans->atom->data != NULL) && (transdata == NULL)) {
586 transdata = (void **) xmlRegCalloc2(nbstates, nbatoms,
587 sizeof(void *));
588 if (transdata == NULL) {
589 xmlRegexpErrMemory(ctxt);
590 break;
591 }
592 }
593 targetno = stateRemap[trans->to];
594 /*
595 * if the same atom can generate transitions to 2 different
596 * states then it means the automata is not deterministic and
597 * the compact form can't be used !
598 */
599 prev = transitions[stateno * (nbatoms + 1) + atomno + 1];
600 if (prev != 0) {
601 if (prev != targetno + 1) {
602 ret->determinist = 0;
603 if (transdata != NULL)
604 xmlFree(transdata);
605 xmlFree(transitions);
606 xmlFree(stateRemap);
607 xmlFree(stringRemap);
608 for (i = 0;i < nbatoms;i++)
609 xmlFree(stringMap[i]);
610 xmlFree(stringMap);
611 goto not_determ;
612 }
613 } else {
614#if 0
615 printf("State %d trans %d: atom %d to %d : %d to %d\n",
616 i, j, trans->atom->no, trans->to, atomno, targetno);
617#endif
618 transitions[stateno * (nbatoms + 1) + atomno + 1] =
619 targetno + 1; /* to avoid 0 */
620 if (transdata != NULL)
621 transdata[stateno * nbatoms + atomno] =
622 trans->atom->data;
623 }
624 }
625 }
626 ret->determinist = 1;
627 /*
628 * Cleanup of the old data
629 */
630 if (ret->states != NULL) {
631 for (i = 0;i < ret->nbStates;i++)
632 xmlRegFreeState(ret->states[i]);
633 xmlFree(ret->states);
634 }
635 ret->states = NULL;
636 ret->nbStates = 0;
637 if (ret->atoms != NULL) {
638 for (i = 0;i < ret->nbAtoms;i++)
639 xmlRegFreeAtom(ret->atoms[i]);
640 xmlFree(ret->atoms);
641 }
642 ret->atoms = NULL;
643 ret->nbAtoms = 0;
644
645 ret->compact = transitions;
646 ret->transdata = transdata;
647 ret->stringMap = stringMap;
648 ret->nbstrings = nbatoms;
649 ret->nbstates = nbstates;
650 xmlFree(stateRemap);
651 xmlFree(stringRemap);
652 }
653not_determ:
654 ctxt->string = NULL;
655 ctxt->nbStates = 0;
656 ctxt->states = NULL;
657 ctxt->nbAtoms = 0;
658 ctxt->atoms = NULL;
659 ctxt->nbCounters = 0;
660 ctxt->counters = NULL;
661 return(ret);
662}
663
664/**
665 * xmlRegNewParserCtxt:
666 * @string: the string to parse
667 *
668 * Allocate a new regexp parser context
669 *
670 * Returns the new context or NULL in case of error
671 */
672static xmlRegParserCtxtPtr
673xmlRegNewParserCtxt(const xmlChar *string) {
674 xmlRegParserCtxtPtr ret;
675
676 ret = (xmlRegParserCtxtPtr) xmlMalloc(sizeof(xmlRegParserCtxt));
677 if (ret == NULL)
678 return(NULL);
679 memset(ret, 0, sizeof(xmlRegParserCtxt));
680 if (string != NULL) {
681 ret->string = xmlStrdup(string);
682 if (ret->string == NULL) {
683 xmlFree(ret);
684 return(NULL);
685 }
686 }
687 ret->cur = ret->string;
688 ret->neg = 0;
689 ret->negs = 0;
690 ret->error = 0;
691 ret->determinist = -1;
692 return(ret);
693}
694
695/**
696 * xmlRegNewRange:
697 * @ctxt: the regexp parser context
698 * @neg: is that negative
699 * @type: the type of range
700 * @start: the start codepoint
701 * @end: the end codepoint
702 *
703 * Allocate a new regexp range
704 *
705 * Returns the new range or NULL in case of error
706 */
707static xmlRegRangePtr
708xmlRegNewRange(xmlRegParserCtxtPtr ctxt,
709 int neg, xmlRegAtomType type, int start, int end) {
710 xmlRegRangePtr ret;
711
712 ret = (xmlRegRangePtr) xmlMalloc(sizeof(xmlRegRange));
713 if (ret == NULL) {
714 xmlRegexpErrMemory(ctxt);
715 return(NULL);
716 }
717 ret->neg = neg;
718 ret->type = type;
719 ret->start = start;
720 ret->end = end;
721 return(ret);
722}
723
724/**
725 * xmlRegFreeRange:
726 * @range: the regexp range
727 *
728 * Free a regexp range
729 */
730static void
731xmlRegFreeRange(xmlRegRangePtr range) {
732 if (range == NULL)
733 return;
734
735 if (range->blockName != NULL)
736 xmlFree(range->blockName);
737 xmlFree(range);
738}
739
740/**
741 * xmlRegCopyRange:
742 * @range: the regexp range
743 *
744 * Copy a regexp range
745 *
746 * Returns the new copy or NULL in case of error.
747 */
748static xmlRegRangePtr
749xmlRegCopyRange(xmlRegParserCtxtPtr ctxt, xmlRegRangePtr range) {
750 xmlRegRangePtr ret;
751
752 if (range == NULL)
753 return(NULL);
754
755 ret = xmlRegNewRange(ctxt, range->neg, range->type, range->start,
756 range->end);
757 if (ret == NULL)
758 return(NULL);
759 if (range->blockName != NULL) {
760 ret->blockName = xmlStrdup(range->blockName);
761 if (ret->blockName == NULL) {
762 xmlRegexpErrMemory(ctxt);
763 xmlRegFreeRange(ret);
764 return(NULL);
765 }
766 }
767 return(ret);
768}
769
770/**
771 * xmlRegNewAtom:
772 * @ctxt: the regexp parser context
773 * @type: the type of atom
774 *
775 * Allocate a new atom
776 *
777 * Returns the new atom or NULL in case of error
778 */
779static xmlRegAtomPtr
780xmlRegNewAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomType type) {
781 xmlRegAtomPtr ret;
782
783 ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom));
784 if (ret == NULL) {
785 xmlRegexpErrMemory(ctxt);
786 return(NULL);
787 }
788 memset(ret, 0, sizeof(xmlRegAtom));
789 ret->type = type;
790 ret->quant = XML_REGEXP_QUANT_ONCE;
791 ret->min = 0;
792 ret->max = 0;
793 return(ret);
794}
795
796/**
797 * xmlRegFreeAtom:
798 * @atom: the regexp atom
799 *
800 * Free a regexp atom
801 */
802static void
803xmlRegFreeAtom(xmlRegAtomPtr atom) {
804 int i;
805
806 if (atom == NULL)
807 return;
808
809 for (i = 0;i < atom->nbRanges;i++)
810 xmlRegFreeRange(atom->ranges[i]);
811 if (atom->ranges != NULL)
812 xmlFree(atom->ranges);
813 if ((atom->type == XML_REGEXP_STRING) && (atom->valuep != NULL))
814 xmlFree(atom->valuep);
815 if ((atom->type == XML_REGEXP_STRING) && (atom->valuep2 != NULL))
816 xmlFree(atom->valuep2);
817 if ((atom->type == XML_REGEXP_BLOCK_NAME) && (atom->valuep != NULL))
818 xmlFree(atom->valuep);
819 xmlFree(atom);
820}
821
822/**
823 * xmlRegCopyAtom:
824 * @ctxt: the regexp parser context
825 * @atom: the original atom
826 *
827 * Allocate a new regexp range
828 *
829 * Returns the new atom or NULL in case of error
830 */
831static xmlRegAtomPtr
832xmlRegCopyAtom(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) {
833 xmlRegAtomPtr ret;
834
835 ret = (xmlRegAtomPtr) xmlMalloc(sizeof(xmlRegAtom));
836 if (ret == NULL) {
837 xmlRegexpErrMemory(ctxt);
838 return(NULL);
839 }
840 memset(ret, 0, sizeof(xmlRegAtom));
841 ret->type = atom->type;
842 ret->quant = atom->quant;
843 ret->min = atom->min;
844 ret->max = atom->max;
845 if (atom->nbRanges > 0) {
846 int i;
847
848 ret->ranges = (xmlRegRangePtr *) xmlMalloc(sizeof(xmlRegRangePtr) *
849 atom->nbRanges);
850 if (ret->ranges == NULL) {
851 xmlRegexpErrMemory(ctxt);
852 goto error;
853 }
854 for (i = 0;i < atom->nbRanges;i++) {
855 ret->ranges[i] = xmlRegCopyRange(ctxt, atom->ranges[i]);
856 if (ret->ranges[i] == NULL)
857 goto error;
858 ret->nbRanges = i + 1;
859 }
860 }
861 return(ret);
862
863error:
864 xmlRegFreeAtom(ret);
865 return(NULL);
866}
867
868static xmlRegStatePtr
869xmlRegNewState(xmlRegParserCtxtPtr ctxt) {
870 xmlRegStatePtr ret;
871
872 ret = (xmlRegStatePtr) xmlMalloc(sizeof(xmlRegState));
873 if (ret == NULL) {
874 xmlRegexpErrMemory(ctxt);
875 return(NULL);
876 }
877 memset(ret, 0, sizeof(xmlRegState));
878 ret->type = XML_REGEXP_TRANS_STATE;
879 ret->mark = XML_REGEXP_MARK_NORMAL;
880 return(ret);
881}
882
883/**
884 * xmlRegFreeState:
885 * @state: the regexp state
886 *
887 * Free a regexp state
888 */
889static void
890xmlRegFreeState(xmlRegStatePtr state) {
891 if (state == NULL)
892 return;
893
894 if (state->trans != NULL)
895 xmlFree(state->trans);
896 if (state->transTo != NULL)
897 xmlFree(state->transTo);
898 xmlFree(state);
899}
900
901/**
902 * xmlRegFreeParserCtxt:
903 * @ctxt: the regexp parser context
904 *
905 * Free a regexp parser context
906 */
907static void
908xmlRegFreeParserCtxt(xmlRegParserCtxtPtr ctxt) {
909 int i;
910 if (ctxt == NULL)
911 return;
912
913 if (ctxt->string != NULL)
914 xmlFree(ctxt->string);
915 if (ctxt->states != NULL) {
916 for (i = 0;i < ctxt->nbStates;i++)
917 xmlRegFreeState(ctxt->states[i]);
918 xmlFree(ctxt->states);
919 }
920 if (ctxt->atoms != NULL) {
921 for (i = 0;i < ctxt->nbAtoms;i++)
922 xmlRegFreeAtom(ctxt->atoms[i]);
923 xmlFree(ctxt->atoms);
924 }
925 if (ctxt->counters != NULL)
926 xmlFree(ctxt->counters);
927 xmlFree(ctxt);
928}
929
930/************************************************************************
931 * *
932 * Display of Data structures *
933 * *
934 ************************************************************************/
935
936static void
937xmlRegPrintAtomType(FILE *output, xmlRegAtomType type) {
938 switch (type) {
939 case XML_REGEXP_EPSILON:
940 fprintf(output, "epsilon "); break;
941 case XML_REGEXP_CHARVAL:
942 fprintf(output, "charval "); break;
943 case XML_REGEXP_RANGES:
944 fprintf(output, "ranges "); break;
945 case XML_REGEXP_SUBREG:
946 fprintf(output, "subexpr "); break;
947 case XML_REGEXP_STRING:
948 fprintf(output, "string "); break;
949 case XML_REGEXP_ANYCHAR:
950 fprintf(output, "anychar "); break;
951 case XML_REGEXP_ANYSPACE:
952 fprintf(output, "anyspace "); break;
953 case XML_REGEXP_NOTSPACE:
954 fprintf(output, "notspace "); break;
955 case XML_REGEXP_INITNAME:
956 fprintf(output, "initname "); break;
957 case XML_REGEXP_NOTINITNAME:
958 fprintf(output, "notinitname "); break;
959 case XML_REGEXP_NAMECHAR:
960 fprintf(output, "namechar "); break;
961 case XML_REGEXP_NOTNAMECHAR:
962 fprintf(output, "notnamechar "); break;
963 case XML_REGEXP_DECIMAL:
964 fprintf(output, "decimal "); break;
965 case XML_REGEXP_NOTDECIMAL:
966 fprintf(output, "notdecimal "); break;
967 case XML_REGEXP_REALCHAR:
968 fprintf(output, "realchar "); break;
969 case XML_REGEXP_NOTREALCHAR:
970 fprintf(output, "notrealchar "); break;
971 case XML_REGEXP_LETTER:
972 fprintf(output, "LETTER "); break;
973 case XML_REGEXP_LETTER_UPPERCASE:
974 fprintf(output, "LETTER_UPPERCASE "); break;
975 case XML_REGEXP_LETTER_LOWERCASE:
976 fprintf(output, "LETTER_LOWERCASE "); break;
977 case XML_REGEXP_LETTER_TITLECASE:
978 fprintf(output, "LETTER_TITLECASE "); break;
979 case XML_REGEXP_LETTER_MODIFIER:
980 fprintf(output, "LETTER_MODIFIER "); break;
981 case XML_REGEXP_LETTER_OTHERS:
982 fprintf(output, "LETTER_OTHERS "); break;
983 case XML_REGEXP_MARK:
984 fprintf(output, "MARK "); break;
985 case XML_REGEXP_MARK_NONSPACING:
986 fprintf(output, "MARK_NONSPACING "); break;
987 case XML_REGEXP_MARK_SPACECOMBINING:
988 fprintf(output, "MARK_SPACECOMBINING "); break;
989 case XML_REGEXP_MARK_ENCLOSING:
990 fprintf(output, "MARK_ENCLOSING "); break;
991 case XML_REGEXP_NUMBER:
992 fprintf(output, "NUMBER "); break;
993 case XML_REGEXP_NUMBER_DECIMAL:
994 fprintf(output, "NUMBER_DECIMAL "); break;
995 case XML_REGEXP_NUMBER_LETTER:
996 fprintf(output, "NUMBER_LETTER "); break;
997 case XML_REGEXP_NUMBER_OTHERS:
998 fprintf(output, "NUMBER_OTHERS "); break;
999 case XML_REGEXP_PUNCT:
1000 fprintf(output, "PUNCT "); break;
1001 case XML_REGEXP_PUNCT_CONNECTOR:
1002 fprintf(output, "PUNCT_CONNECTOR "); break;
1003 case XML_REGEXP_PUNCT_DASH:
1004 fprintf(output, "PUNCT_DASH "); break;
1005 case XML_REGEXP_PUNCT_OPEN:
1006 fprintf(output, "PUNCT_OPEN "); break;
1007 case XML_REGEXP_PUNCT_CLOSE:
1008 fprintf(output, "PUNCT_CLOSE "); break;
1009 case XML_REGEXP_PUNCT_INITQUOTE:
1010 fprintf(output, "PUNCT_INITQUOTE "); break;
1011 case XML_REGEXP_PUNCT_FINQUOTE:
1012 fprintf(output, "PUNCT_FINQUOTE "); break;
1013 case XML_REGEXP_PUNCT_OTHERS:
1014 fprintf(output, "PUNCT_OTHERS "); break;
1015 case XML_REGEXP_SEPAR:
1016 fprintf(output, "SEPAR "); break;
1017 case XML_REGEXP_SEPAR_SPACE:
1018 fprintf(output, "SEPAR_SPACE "); break;
1019 case XML_REGEXP_SEPAR_LINE:
1020 fprintf(output, "SEPAR_LINE "); break;
1021 case XML_REGEXP_SEPAR_PARA:
1022 fprintf(output, "SEPAR_PARA "); break;
1023 case XML_REGEXP_SYMBOL:
1024 fprintf(output, "SYMBOL "); break;
1025 case XML_REGEXP_SYMBOL_MATH:
1026 fprintf(output, "SYMBOL_MATH "); break;
1027 case XML_REGEXP_SYMBOL_CURRENCY:
1028 fprintf(output, "SYMBOL_CURRENCY "); break;
1029 case XML_REGEXP_SYMBOL_MODIFIER:
1030 fprintf(output, "SYMBOL_MODIFIER "); break;
1031 case XML_REGEXP_SYMBOL_OTHERS:
1032 fprintf(output, "SYMBOL_OTHERS "); break;
1033 case XML_REGEXP_OTHER:
1034 fprintf(output, "OTHER "); break;
1035 case XML_REGEXP_OTHER_CONTROL:
1036 fprintf(output, "OTHER_CONTROL "); break;
1037 case XML_REGEXP_OTHER_FORMAT:
1038 fprintf(output, "OTHER_FORMAT "); break;
1039 case XML_REGEXP_OTHER_PRIVATE:
1040 fprintf(output, "OTHER_PRIVATE "); break;
1041 case XML_REGEXP_OTHER_NA:
1042 fprintf(output, "OTHER_NA "); break;
1043 case XML_REGEXP_BLOCK_NAME:
1044 fprintf(output, "BLOCK "); break;
1045 }
1046}
1047
1048static void
1049xmlRegPrintQuantType(FILE *output, xmlRegQuantType type) {
1050 switch (type) {
1051 case XML_REGEXP_QUANT_EPSILON:
1052 fprintf(output, "epsilon "); break;
1053 case XML_REGEXP_QUANT_ONCE:
1054 fprintf(output, "once "); break;
1055 case XML_REGEXP_QUANT_OPT:
1056 fprintf(output, "? "); break;
1057 case XML_REGEXP_QUANT_MULT:
1058 fprintf(output, "* "); break;
1059 case XML_REGEXP_QUANT_PLUS:
1060 fprintf(output, "+ "); break;
1061 case XML_REGEXP_QUANT_RANGE:
1062 fprintf(output, "range "); break;
1063 case XML_REGEXP_QUANT_ONCEONLY:
1064 fprintf(output, "onceonly "); break;
1065 case XML_REGEXP_QUANT_ALL:
1066 fprintf(output, "all "); break;
1067 }
1068}
1069static void
1070xmlRegPrintRange(FILE *output, xmlRegRangePtr range) {
1071 fprintf(output, " range: ");
1072 if (range->neg)
1073 fprintf(output, "negative ");
1074 xmlRegPrintAtomType(output, range->type);
1075 fprintf(output, "%c - %c\n", range->start, range->end);
1076}
1077
1078static void
1079xmlRegPrintAtom(FILE *output, xmlRegAtomPtr atom) {
1080 fprintf(output, " atom: ");
1081 if (atom == NULL) {
1082 fprintf(output, "NULL\n");
1083 return;
1084 }
1085 if (atom->neg)
1086 fprintf(output, "not ");
1087 xmlRegPrintAtomType(output, atom->type);
1088 xmlRegPrintQuantType(output, atom->quant);
1089 if (atom->quant == XML_REGEXP_QUANT_RANGE)
1090 fprintf(output, "%d-%d ", atom->min, atom->max);
1091 if (atom->type == XML_REGEXP_STRING)
1092 fprintf(output, "'%s' ", (char *) atom->valuep);
1093 if (atom->type == XML_REGEXP_CHARVAL)
1094 fprintf(output, "char %c\n", atom->codepoint);
1095 else if (atom->type == XML_REGEXP_RANGES) {
1096 int i;
1097 fprintf(output, "%d entries\n", atom->nbRanges);
1098 for (i = 0; i < atom->nbRanges;i++)
1099 xmlRegPrintRange(output, atom->ranges[i]);
1100 } else if (atom->type == XML_REGEXP_SUBREG) {
1101 fprintf(output, "start %d end %d\n", atom->start->no, atom->stop->no);
1102 } else {
1103 fprintf(output, "\n");
1104 }
1105}
1106
1107static void
1108xmlRegPrintTrans(FILE *output, xmlRegTransPtr trans) {
1109 fprintf(output, " trans: ");
1110 if (trans == NULL) {
1111 fprintf(output, "NULL\n");
1112 return;
1113 }
1114 if (trans->to < 0) {
1115 fprintf(output, "removed\n");
1116 return;
1117 }
1118 if (trans->nd != 0) {
1119 if (trans->nd == 2)
1120 fprintf(output, "last not determinist, ");
1121 else
1122 fprintf(output, "not determinist, ");
1123 }
1124 if (trans->counter >= 0) {
1125 fprintf(output, "counted %d, ", trans->counter);
1126 }
1127 if (trans->count == REGEXP_ALL_COUNTER) {
1128 fprintf(output, "all transition, ");
1129 } else if (trans->count >= 0) {
1130 fprintf(output, "count based %d, ", trans->count);
1131 }
1132 if (trans->atom == NULL) {
1133 fprintf(output, "epsilon to %d\n", trans->to);
1134 return;
1135 }
1136 if (trans->atom->type == XML_REGEXP_CHARVAL)
1137 fprintf(output, "char %c ", trans->atom->codepoint);
1138 fprintf(output, "atom %d, to %d\n", trans->atom->no, trans->to);
1139}
1140
1141static void
1142xmlRegPrintState(FILE *output, xmlRegStatePtr state) {
1143 int i;
1144
1145 fprintf(output, " state: ");
1146 if (state == NULL) {
1147 fprintf(output, "NULL\n");
1148 return;
1149 }
1150 if (state->type == XML_REGEXP_START_STATE)
1151 fprintf(output, "START ");
1152 if (state->type == XML_REGEXP_FINAL_STATE)
1153 fprintf(output, "FINAL ");
1154
1155 fprintf(output, "%d, %d transitions:\n", state->no, state->nbTrans);
1156 for (i = 0;i < state->nbTrans; i++) {
1157 xmlRegPrintTrans(output, &(state->trans[i]));
1158 }
1159}
1160
1161/************************************************************************
1162 * *
1163 * Finite Automata structures manipulations *
1164 * *
1165 ************************************************************************/
1166
1167static xmlRegRangePtr
1168xmlRegAtomAddRange(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom,
1169 int neg, xmlRegAtomType type, int start, int end,
1170 xmlChar *blockName) {
1171 xmlRegRangePtr range;
1172
1173 if (atom == NULL) {
1174 ERROR("add range: atom is NULL");
1175 return(NULL);
1176 }
1177 if (atom->type != XML_REGEXP_RANGES) {
1178 ERROR("add range: atom is not ranges");
1179 return(NULL);
1180 }
1181 if (atom->maxRanges == 0) {
1182 atom->maxRanges = 4;
1183 atom->ranges = (xmlRegRangePtr *) xmlMalloc(atom->maxRanges *
1184 sizeof(xmlRegRangePtr));
1185 if (atom->ranges == NULL) {
1186 xmlRegexpErrMemory(ctxt);
1187 atom->maxRanges = 0;
1188 return(NULL);
1189 }
1190 } else if (atom->nbRanges >= atom->maxRanges) {
1191 xmlRegRangePtr *tmp;
1192 atom->maxRanges *= 2;
1193 tmp = (xmlRegRangePtr *) xmlRealloc(atom->ranges, atom->maxRanges *
1194 sizeof(xmlRegRangePtr));
1195 if (tmp == NULL) {
1196 xmlRegexpErrMemory(ctxt);
1197 atom->maxRanges /= 2;
1198 return(NULL);
1199 }
1200 atom->ranges = tmp;
1201 }
1202 range = xmlRegNewRange(ctxt, neg, type, start, end);
1203 if (range == NULL)
1204 return(NULL);
1205 range->blockName = blockName;
1206 atom->ranges[atom->nbRanges++] = range;
1207
1208 return(range);
1209}
1210
1211static int
1212xmlRegGetCounter(xmlRegParserCtxtPtr ctxt) {
1213 if (ctxt->maxCounters == 0) {
1214 ctxt->maxCounters = 4;
1215 ctxt->counters = (xmlRegCounter *) xmlMalloc(ctxt->maxCounters *
1216 sizeof(xmlRegCounter));
1217 if (ctxt->counters == NULL) {
1218 xmlRegexpErrMemory(ctxt);
1219 ctxt->maxCounters = 0;
1220 return(-1);
1221 }
1222 } else if (ctxt->nbCounters >= ctxt->maxCounters) {
1223 xmlRegCounter *tmp;
1224 ctxt->maxCounters *= 2;
1225 tmp = (xmlRegCounter *) xmlRealloc(ctxt->counters, ctxt->maxCounters *
1226 sizeof(xmlRegCounter));
1227 if (tmp == NULL) {
1228 xmlRegexpErrMemory(ctxt);
1229 ctxt->maxCounters /= 2;
1230 return(-1);
1231 }
1232 ctxt->counters = tmp;
1233 }
1234 ctxt->counters[ctxt->nbCounters].min = -1;
1235 ctxt->counters[ctxt->nbCounters].max = -1;
1236 return(ctxt->nbCounters++);
1237}
1238
1239static int
1240xmlRegAtomPush(xmlRegParserCtxtPtr ctxt, xmlRegAtomPtr atom) {
1241 if (atom == NULL) {
1242 ERROR("atom push: atom is NULL");
1243 return(-1);
1244 }
1245 if (ctxt->nbAtoms >= ctxt->maxAtoms) {
1246 size_t newSize = ctxt->maxAtoms ? ctxt->maxAtoms * 2 : 4;
1247 xmlRegAtomPtr *tmp;
1248
1249 tmp = xmlRealloc(ctxt->atoms, newSize * sizeof(xmlRegAtomPtr));
1250 if (tmp == NULL) {
1251 xmlRegexpErrMemory(ctxt);
1252 return(-1);
1253 }
1254 ctxt->atoms = tmp;
1255 ctxt->maxAtoms = newSize;
1256 }
1257 atom->no = ctxt->nbAtoms;
1258 ctxt->atoms[ctxt->nbAtoms++] = atom;
1259 return(0);
1260}
1261
1262static void
1263xmlRegStateAddTransTo(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr target,
1264 int from) {
1265 if (target->maxTransTo == 0) {
1266 target->maxTransTo = 8;
1267 target->transTo = (int *) xmlMalloc(target->maxTransTo *
1268 sizeof(int));
1269 if (target->transTo == NULL) {
1270 xmlRegexpErrMemory(ctxt);
1271 target->maxTransTo = 0;
1272 return;
1273 }
1274 } else if (target->nbTransTo >= target->maxTransTo) {
1275 int *tmp;
1276 target->maxTransTo *= 2;
1277 tmp = (int *) xmlRealloc(target->transTo, target->maxTransTo *
1278 sizeof(int));
1279 if (tmp == NULL) {
1280 xmlRegexpErrMemory(ctxt);
1281 target->maxTransTo /= 2;
1282 return;
1283 }
1284 target->transTo = tmp;
1285 }
1286 target->transTo[target->nbTransTo] = from;
1287 target->nbTransTo++;
1288}
1289
1290static void
1291xmlRegStateAddTrans(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
1292 xmlRegAtomPtr atom, xmlRegStatePtr target,
1293 int counter, int count) {
1294
1295 int nrtrans;
1296
1297 if (state == NULL) {
1298 ERROR("add state: state is NULL");
1299 return;
1300 }
1301 if (target == NULL) {
1302 ERROR("add state: target is NULL");
1303 return;
1304 }
1305 /*
1306 * Other routines follow the philosophy 'When in doubt, add a transition'
1307 * so we check here whether such a transition is already present and, if
1308 * so, silently ignore this request.
1309 */
1310
1311 for (nrtrans = state->nbTrans - 1; nrtrans >= 0; nrtrans--) {
1312 xmlRegTransPtr trans = &(state->trans[nrtrans]);
1313 if ((trans->atom == atom) &&
1314 (trans->to == target->no) &&
1315 (trans->counter == counter) &&
1316 (trans->count == count)) {
1317 return;
1318 }
1319 }
1320
1321 if (state->maxTrans == 0) {
1322 state->maxTrans = 8;
1323 state->trans = (xmlRegTrans *) xmlMalloc(state->maxTrans *
1324 sizeof(xmlRegTrans));
1325 if (state->trans == NULL) {
1326 xmlRegexpErrMemory(ctxt);
1327 state->maxTrans = 0;
1328 return;
1329 }
1330 } else if (state->nbTrans >= state->maxTrans) {
1331 xmlRegTrans *tmp;
1332 state->maxTrans *= 2;
1333 tmp = (xmlRegTrans *) xmlRealloc(state->trans, state->maxTrans *
1334 sizeof(xmlRegTrans));
1335 if (tmp == NULL) {
1336 xmlRegexpErrMemory(ctxt);
1337 state->maxTrans /= 2;
1338 return;
1339 }
1340 state->trans = tmp;
1341 }
1342
1343 state->trans[state->nbTrans].atom = atom;
1344 state->trans[state->nbTrans].to = target->no;
1345 state->trans[state->nbTrans].counter = counter;
1346 state->trans[state->nbTrans].count = count;
1347 state->trans[state->nbTrans].nd = 0;
1348 state->nbTrans++;
1349 xmlRegStateAddTransTo(ctxt, target, state->no);
1350}
1351
1352static xmlRegStatePtr
1353xmlRegStatePush(xmlRegParserCtxtPtr ctxt) {
1354 xmlRegStatePtr state;
1355
1356 if (ctxt->nbStates >= ctxt->maxStates) {
1357 size_t newSize = ctxt->maxStates ? ctxt->maxStates * 2 : 4;
1358 xmlRegStatePtr *tmp;
1359
1360 tmp = xmlRealloc(ctxt->states, newSize * sizeof(tmp[0]));
1361 if (tmp == NULL) {
1362 xmlRegexpErrMemory(ctxt);
1363 return(NULL);
1364 }
1365 ctxt->states = tmp;
1366 ctxt->maxStates = newSize;
1367 }
1368
1369 state = xmlRegNewState(ctxt);
1370 if (state == NULL)
1371 return(NULL);
1372
1373 state->no = ctxt->nbStates;
1374 ctxt->states[ctxt->nbStates++] = state;
1375
1376 return(state);
1377}
1378
1379/**
1380 * xmlFAGenerateAllTransition:
1381 * @ctxt: a regexp parser context
1382 * @from: the from state
1383 * @to: the target state or NULL for building a new one
1384 * @lax:
1385 *
1386 */
1387static int
1388xmlFAGenerateAllTransition(xmlRegParserCtxtPtr ctxt,
1389 xmlRegStatePtr from, xmlRegStatePtr to,
1390 int lax) {
1391 if (to == NULL) {
1392 to = xmlRegStatePush(ctxt);
1393 if (to == NULL)
1394 return(-1);
1395 ctxt->state = to;
1396 }
1397 if (lax)
1398 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_LAX_COUNTER);
1399 else
1400 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, REGEXP_ALL_COUNTER);
1401 return(0);
1402}
1403
1404/**
1405 * xmlFAGenerateEpsilonTransition:
1406 * @ctxt: a regexp parser context
1407 * @from: the from state
1408 * @to: the target state or NULL for building a new one
1409 *
1410 */
1411static int
1412xmlFAGenerateEpsilonTransition(xmlRegParserCtxtPtr ctxt,
1413 xmlRegStatePtr from, xmlRegStatePtr to) {
1414 if (to == NULL) {
1415 to = xmlRegStatePush(ctxt);
1416 if (to == NULL)
1417 return(-1);
1418 ctxt->state = to;
1419 }
1420 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, -1);
1421 return(0);
1422}
1423
1424/**
1425 * xmlFAGenerateCountedEpsilonTransition:
1426 * @ctxt: a regexp parser context
1427 * @from: the from state
1428 * @to: the target state or NULL for building a new one
1429 * counter: the counter for that transition
1430 *
1431 */
1432static int
1433xmlFAGenerateCountedEpsilonTransition(xmlRegParserCtxtPtr ctxt,
1434 xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
1435 if (to == NULL) {
1436 to = xmlRegStatePush(ctxt);
1437 if (to == NULL)
1438 return(-1);
1439 ctxt->state = to;
1440 }
1441 xmlRegStateAddTrans(ctxt, from, NULL, to, counter, -1);
1442 return(0);
1443}
1444
1445/**
1446 * xmlFAGenerateCountedTransition:
1447 * @ctxt: a regexp parser context
1448 * @from: the from state
1449 * @to: the target state or NULL for building a new one
1450 * counter: the counter for that transition
1451 *
1452 */
1453static int
1454xmlFAGenerateCountedTransition(xmlRegParserCtxtPtr ctxt,
1455 xmlRegStatePtr from, xmlRegStatePtr to, int counter) {
1456 if (to == NULL) {
1457 to = xmlRegStatePush(ctxt);
1458 if (to == NULL)
1459 return(-1);
1460 ctxt->state = to;
1461 }
1462 xmlRegStateAddTrans(ctxt, from, NULL, to, -1, counter);
1463 return(0);
1464}
1465
1466/**
1467 * xmlFAGenerateTransitions:
1468 * @ctxt: a regexp parser context
1469 * @from: the from state
1470 * @to: the target state or NULL for building a new one
1471 * @atom: the atom generating the transition
1472 *
1473 * Returns 0 if success and -1 in case of error.
1474 */
1475static int
1476xmlFAGenerateTransitions(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr from,
1477 xmlRegStatePtr to, xmlRegAtomPtr atom) {
1478 xmlRegStatePtr end;
1479 int nullable = 0;
1480
1481 if (atom == NULL) {
1482 ERROR("generate transition: atom == NULL");
1483 return(-1);
1484 }
1485 if (atom->type == XML_REGEXP_SUBREG) {
1486 /*
1487 * this is a subexpression handling one should not need to
1488 * create a new node except for XML_REGEXP_QUANT_RANGE.
1489 */
1490 if ((to != NULL) && (atom->stop != to) &&
1491 (atom->quant != XML_REGEXP_QUANT_RANGE)) {
1492 /*
1493 * Generate an epsilon transition to link to the target
1494 */
1495 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to);
1496#ifdef DV
1497 } else if ((to == NULL) && (atom->quant != XML_REGEXP_QUANT_RANGE) &&
1498 (atom->quant != XML_REGEXP_QUANT_ONCE)) {
1499 to = xmlRegStatePush(ctxt, to);
1500 if (to == NULL)
1501 return(-1);
1502 ctxt->state = to;
1503 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, to);
1504#endif
1505 }
1506 switch (atom->quant) {
1507 case XML_REGEXP_QUANT_OPT:
1508 atom->quant = XML_REGEXP_QUANT_ONCE;
1509 /*
1510 * transition done to the state after end of atom.
1511 * 1. set transition from atom start to new state
1512 * 2. set transition from atom end to this state.
1513 */
1514 if (to == NULL) {
1515 xmlFAGenerateEpsilonTransition(ctxt, atom->start, 0);
1516 xmlFAGenerateEpsilonTransition(ctxt, atom->stop,
1517 ctxt->state);
1518 } else {
1519 xmlFAGenerateEpsilonTransition(ctxt, atom->start, to);
1520 }
1521 break;
1522 case XML_REGEXP_QUANT_MULT:
1523 atom->quant = XML_REGEXP_QUANT_ONCE;
1524 xmlFAGenerateEpsilonTransition(ctxt, atom->start, atom->stop);
1525 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
1526 break;
1527 case XML_REGEXP_QUANT_PLUS:
1528 atom->quant = XML_REGEXP_QUANT_ONCE;
1529 xmlFAGenerateEpsilonTransition(ctxt, atom->stop, atom->start);
1530 break;
1531 case XML_REGEXP_QUANT_RANGE: {
1532 int counter;
1533 xmlRegStatePtr inter, newstate;
1534
1535 /*
1536 * create the final state now if needed
1537 */
1538 if (to != NULL) {
1539 newstate = to;
1540 } else {
1541 newstate = xmlRegStatePush(ctxt);
1542 if (newstate == NULL)
1543 return(-1);
1544 }
1545
1546 /*
1547 * The principle here is to use counted transition
1548 * to avoid explosion in the number of states in the
1549 * graph. This is clearly more complex but should not
1550 * be exploitable at runtime.
1551 */
1552 if ((atom->min == 0) && (atom->start0 == NULL)) {
1553 xmlRegAtomPtr copy;
1554 /*
1555 * duplicate a transition based on atom to count next
1556 * occurrences after 1. We cannot loop to atom->start
1557 * directly because we need an epsilon transition to
1558 * newstate.
1559 */
1560 /* ???? For some reason it seems we never reach that
1561 case, I suppose this got optimized out before when
1562 building the automata */
1563 copy = xmlRegCopyAtom(ctxt, atom);
1564 if (copy == NULL)
1565 return(-1);
1566 copy->quant = XML_REGEXP_QUANT_ONCE;
1567 copy->min = 0;
1568 copy->max = 0;
1569
1570 if (xmlFAGenerateTransitions(ctxt, atom->start, NULL, copy)
1571 < 0) {
1572 xmlRegFreeAtom(copy);
1573 return(-1);
1574 }
1575 inter = ctxt->state;
1576 counter = xmlRegGetCounter(ctxt);
1577 if (counter < 0)
1578 return(-1);
1579 ctxt->counters[counter].min = atom->min - 1;
1580 ctxt->counters[counter].max = atom->max - 1;
1581 /* count the number of times we see it again */
1582 xmlFAGenerateCountedEpsilonTransition(ctxt, inter,
1583 atom->stop, counter);
1584 /* allow a way out based on the count */
1585 xmlFAGenerateCountedTransition(ctxt, inter,
1586 newstate, counter);
1587 /* and also allow a direct exit for 0 */
1588 xmlFAGenerateEpsilonTransition(ctxt, atom->start,
1589 newstate);
1590 } else {
1591 /*
1592 * either we need the atom at least once or there
1593 * is an atom->start0 allowing to easily plug the
1594 * epsilon transition.
1595 */
1596 counter = xmlRegGetCounter(ctxt);
1597 if (counter < 0)
1598 return(-1);
1599 ctxt->counters[counter].min = atom->min - 1;
1600 ctxt->counters[counter].max = atom->max - 1;
1601 /* allow a way out based on the count */
1602 xmlFAGenerateCountedTransition(ctxt, atom->stop,
1603 newstate, counter);
1604 /* count the number of times we see it again */
1605 xmlFAGenerateCountedEpsilonTransition(ctxt, atom->stop,
1606 atom->start, counter);
1607 /* and if needed allow a direct exit for 0 */
1608 if (atom->min == 0)
1609 xmlFAGenerateEpsilonTransition(ctxt, atom->start0,
1610 newstate);
1611
1612 }
1613 atom->min = 0;
1614 atom->max = 0;
1615 atom->quant = XML_REGEXP_QUANT_ONCE;
1616 ctxt->state = newstate;
1617 }
1618 default:
1619 break;
1620 }
1621 if (xmlRegAtomPush(ctxt, atom) < 0)
1622 return(-1);
1623 return(0);
1624 }
1625 if ((atom->min == 0) && (atom->max == 0) &&
1626 (atom->quant == XML_REGEXP_QUANT_RANGE)) {
1627 /*
1628 * we can discard the atom and generate an epsilon transition instead
1629 */
1630 if (to == NULL) {
1631 to = xmlRegStatePush(ctxt);
1632 if (to == NULL)
1633 return(-1);
1634 }
1635 xmlFAGenerateEpsilonTransition(ctxt, from, to);
1636 ctxt->state = to;
1637 xmlRegFreeAtom(atom);
1638 return(0);
1639 }
1640 if (to == NULL) {
1641 to = xmlRegStatePush(ctxt);
1642 if (to == NULL)
1643 return(-1);
1644 }
1645 end = to;
1646 if ((atom->quant == XML_REGEXP_QUANT_MULT) ||
1647 (atom->quant == XML_REGEXP_QUANT_PLUS)) {
1648 /*
1649 * Do not pollute the target state by adding transitions from
1650 * it as it is likely to be the shared target of multiple branches.
1651 * So isolate with an epsilon transition.
1652 */
1653 xmlRegStatePtr tmp;
1654
1655 tmp = xmlRegStatePush(ctxt);
1656 if (tmp == NULL)
1657 return(-1);
1658 xmlFAGenerateEpsilonTransition(ctxt, tmp, to);
1659 to = tmp;
1660 }
1661 if ((atom->quant == XML_REGEXP_QUANT_RANGE) &&
1662 (atom->min == 0) && (atom->max > 0)) {
1663 nullable = 1;
1664 atom->min = 1;
1665 if (atom->max == 1)
1666 atom->quant = XML_REGEXP_QUANT_OPT;
1667 }
1668 xmlRegStateAddTrans(ctxt, from, atom, to, -1, -1);
1669 ctxt->state = end;
1670 switch (atom->quant) {
1671 case XML_REGEXP_QUANT_OPT:
1672 atom->quant = XML_REGEXP_QUANT_ONCE;
1673 xmlFAGenerateEpsilonTransition(ctxt, from, to);
1674 break;
1675 case XML_REGEXP_QUANT_MULT:
1676 atom->quant = XML_REGEXP_QUANT_ONCE;
1677 xmlFAGenerateEpsilonTransition(ctxt, from, to);
1678 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
1679 break;
1680 case XML_REGEXP_QUANT_PLUS:
1681 atom->quant = XML_REGEXP_QUANT_ONCE;
1682 xmlRegStateAddTrans(ctxt, to, atom, to, -1, -1);
1683 break;
1684 case XML_REGEXP_QUANT_RANGE:
1685 if (nullable)
1686 xmlFAGenerateEpsilonTransition(ctxt, from, to);
1687 break;
1688 default:
1689 break;
1690 }
1691 if (xmlRegAtomPush(ctxt, atom) < 0)
1692 return(-1);
1693 return(0);
1694}
1695
1696/**
1697 * xmlFAReduceEpsilonTransitions:
1698 * @ctxt: a regexp parser context
1699 * @fromnr: the from state
1700 * @tonr: the to state
1701 * @counter: should that transition be associated to a counted
1702 *
1703 */
1704static void
1705xmlFAReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int fromnr,
1706 int tonr, int counter) {
1707 int transnr;
1708 xmlRegStatePtr from;
1709 xmlRegStatePtr to;
1710
1711 from = ctxt->states[fromnr];
1712 if (from == NULL)
1713 return;
1714 to = ctxt->states[tonr];
1715 if (to == NULL)
1716 return;
1717 if ((to->mark == XML_REGEXP_MARK_START) ||
1718 (to->mark == XML_REGEXP_MARK_VISITED))
1719 return;
1720
1721 to->mark = XML_REGEXP_MARK_VISITED;
1722 if (to->type == XML_REGEXP_FINAL_STATE) {
1723 from->type = XML_REGEXP_FINAL_STATE;
1724 }
1725 for (transnr = 0;transnr < to->nbTrans;transnr++) {
1726 xmlRegTransPtr t1 = &to->trans[transnr];
1727 int tcounter;
1728
1729 if (t1->to < 0)
1730 continue;
1731 if (t1->counter >= 0) {
1732 /* assert(counter < 0); */
1733 tcounter = t1->counter;
1734 } else {
1735 tcounter = counter;
1736 }
1737 if (t1->atom == NULL) {
1738 /*
1739 * Don't remove counted transitions
1740 * Don't loop either
1741 */
1742 if (t1->to != fromnr) {
1743 if (t1->count >= 0) {
1744 xmlRegStateAddTrans(ctxt, from, NULL, ctxt->states[t1->to],
1745 -1, t1->count);
1746 } else {
1747 xmlFAReduceEpsilonTransitions(ctxt, fromnr, t1->to,
1748 tcounter);
1749 }
1750 }
1751 } else {
1752 xmlRegStateAddTrans(ctxt, from, t1->atom,
1753 ctxt->states[t1->to], tcounter, -1);
1754 }
1755 }
1756}
1757
1758/**
1759 * xmlFAFinishReduceEpsilonTransitions:
1760 * @ctxt: a regexp parser context
1761 * @fromnr: the from state
1762 * @tonr: the to state
1763 * @counter: should that transition be associated to a counted
1764 *
1765 */
1766static void
1767xmlFAFinishReduceEpsilonTransitions(xmlRegParserCtxtPtr ctxt, int tonr) {
1768 int transnr;
1769 xmlRegStatePtr to;
1770
1771 to = ctxt->states[tonr];
1772 if (to == NULL)
1773 return;
1774 if ((to->mark == XML_REGEXP_MARK_START) ||
1775 (to->mark == XML_REGEXP_MARK_NORMAL))
1776 return;
1777
1778 to->mark = XML_REGEXP_MARK_NORMAL;
1779 for (transnr = 0;transnr < to->nbTrans;transnr++) {
1780 xmlRegTransPtr t1 = &to->trans[transnr];
1781 if ((t1->to >= 0) && (t1->atom == NULL))
1782 xmlFAFinishReduceEpsilonTransitions(ctxt, t1->to);
1783 }
1784}
1785
1786/**
1787 * xmlFAEliminateSimpleEpsilonTransitions:
1788 * @ctxt: a regexp parser context
1789 *
1790 * Eliminating general epsilon transitions can get costly in the general
1791 * algorithm due to the large amount of generated new transitions and
1792 * associated comparisons. However for simple epsilon transition used just
1793 * to separate building blocks when generating the automata this can be
1794 * reduced to state elimination:
1795 * - if there exists an epsilon from X to Y
1796 * - if there is no other transition from X
1797 * then X and Y are semantically equivalent and X can be eliminated
1798 * If X is the start state then make Y the start state, else replace the
1799 * target of all transitions to X by transitions to Y.
1800 *
1801 * If X is a final state, skip it.
1802 * Otherwise it would be necessary to manipulate counters for this case when
1803 * eliminating state 2:
1804 * State 1 has a transition with an atom to state 2.
1805 * State 2 is final and has an epsilon transition to state 1.
1806 */
1807static void
1808xmlFAEliminateSimpleEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
1809 int statenr, i, j, newto;
1810 xmlRegStatePtr state, tmp;
1811
1812 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1813 state = ctxt->states[statenr];
1814 if (state == NULL)
1815 continue;
1816 if (state->nbTrans != 1)
1817 continue;
1818 if (state->type == XML_REGEXP_UNREACH_STATE ||
1819 state->type == XML_REGEXP_FINAL_STATE)
1820 continue;
1821 /* is the only transition out a basic transition */
1822 if ((state->trans[0].atom == NULL) &&
1823 (state->trans[0].to >= 0) &&
1824 (state->trans[0].to != statenr) &&
1825 (state->trans[0].counter < 0) &&
1826 (state->trans[0].count < 0)) {
1827 newto = state->trans[0].to;
1828
1829 if (state->type == XML_REGEXP_START_STATE) {
1830 } else {
1831 for (i = 0;i < state->nbTransTo;i++) {
1832 tmp = ctxt->states[state->transTo[i]];
1833 for (j = 0;j < tmp->nbTrans;j++) {
1834 if (tmp->trans[j].to == statenr) {
1835 tmp->trans[j].to = -1;
1836 xmlRegStateAddTrans(ctxt, tmp, tmp->trans[j].atom,
1837 ctxt->states[newto],
1838 tmp->trans[j].counter,
1839 tmp->trans[j].count);
1840 }
1841 }
1842 }
1843 if (state->type == XML_REGEXP_FINAL_STATE)
1844 ctxt->states[newto]->type = XML_REGEXP_FINAL_STATE;
1845 /* eliminate the transition completely */
1846 state->nbTrans = 0;
1847
1848 state->type = XML_REGEXP_UNREACH_STATE;
1849
1850 }
1851
1852 }
1853 }
1854}
1855/**
1856 * xmlFAEliminateEpsilonTransitions:
1857 * @ctxt: a regexp parser context
1858 *
1859 */
1860static void
1861xmlFAEliminateEpsilonTransitions(xmlRegParserCtxtPtr ctxt) {
1862 int statenr, transnr;
1863 xmlRegStatePtr state;
1864 int has_epsilon;
1865
1866 if (ctxt->states == NULL) return;
1867
1868 /*
1869 * Eliminate simple epsilon transition and the associated unreachable
1870 * states.
1871 */
1872 xmlFAEliminateSimpleEpsilonTransitions(ctxt);
1873 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1874 state = ctxt->states[statenr];
1875 if ((state != NULL) && (state->type == XML_REGEXP_UNREACH_STATE)) {
1876 xmlRegFreeState(state);
1877 ctxt->states[statenr] = NULL;
1878 }
1879 }
1880
1881 has_epsilon = 0;
1882
1883 /*
1884 * Build the completed transitions bypassing the epsilons
1885 * Use a marking algorithm to avoid loops
1886 * Mark sink states too.
1887 * Process from the latest states backward to the start when
1888 * there is long cascading epsilon chains this minimize the
1889 * recursions and transition compares when adding the new ones
1890 */
1891 for (statenr = ctxt->nbStates - 1;statenr >= 0;statenr--) {
1892 state = ctxt->states[statenr];
1893 if (state == NULL)
1894 continue;
1895 if ((state->nbTrans == 0) &&
1896 (state->type != XML_REGEXP_FINAL_STATE)) {
1897 state->type = XML_REGEXP_SINK_STATE;
1898 }
1899 for (transnr = 0;transnr < state->nbTrans;transnr++) {
1900 if ((state->trans[transnr].atom == NULL) &&
1901 (state->trans[transnr].to >= 0)) {
1902 if (state->trans[transnr].to == statenr) {
1903 state->trans[transnr].to = -1;
1904 } else if (state->trans[transnr].count < 0) {
1905 int newto = state->trans[transnr].to;
1906
1907 has_epsilon = 1;
1908 state->trans[transnr].to = -2;
1909 state->mark = XML_REGEXP_MARK_START;
1910 xmlFAReduceEpsilonTransitions(ctxt, statenr,
1911 newto, state->trans[transnr].counter);
1912 xmlFAFinishReduceEpsilonTransitions(ctxt, newto);
1913 state->mark = XML_REGEXP_MARK_NORMAL;
1914 }
1915 }
1916 }
1917 }
1918 /*
1919 * Eliminate the epsilon transitions
1920 */
1921 if (has_epsilon) {
1922 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1923 state = ctxt->states[statenr];
1924 if (state == NULL)
1925 continue;
1926 for (transnr = 0;transnr < state->nbTrans;transnr++) {
1927 xmlRegTransPtr trans = &(state->trans[transnr]);
1928 if ((trans->atom == NULL) &&
1929 (trans->count < 0) &&
1930 (trans->to >= 0)) {
1931 trans->to = -1;
1932 }
1933 }
1934 }
1935 }
1936
1937 /*
1938 * Use this pass to detect unreachable states too
1939 */
1940 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1941 state = ctxt->states[statenr];
1942 if (state != NULL)
1943 state->reached = XML_REGEXP_MARK_NORMAL;
1944 }
1945 state = ctxt->states[0];
1946 if (state != NULL)
1947 state->reached = XML_REGEXP_MARK_START;
1948 while (state != NULL) {
1949 xmlRegStatePtr target = NULL;
1950 state->reached = XML_REGEXP_MARK_VISITED;
1951 /*
1952 * Mark all states reachable from the current reachable state
1953 */
1954 for (transnr = 0;transnr < state->nbTrans;transnr++) {
1955 if ((state->trans[transnr].to >= 0) &&
1956 ((state->trans[transnr].atom != NULL) ||
1957 (state->trans[transnr].count >= 0))) {
1958 int newto = state->trans[transnr].to;
1959
1960 if (ctxt->states[newto] == NULL)
1961 continue;
1962 if (ctxt->states[newto]->reached == XML_REGEXP_MARK_NORMAL) {
1963 ctxt->states[newto]->reached = XML_REGEXP_MARK_START;
1964 target = ctxt->states[newto];
1965 }
1966 }
1967 }
1968
1969 /*
1970 * find the next accessible state not explored
1971 */
1972 if (target == NULL) {
1973 for (statenr = 1;statenr < ctxt->nbStates;statenr++) {
1974 state = ctxt->states[statenr];
1975 if ((state != NULL) && (state->reached ==
1976 XML_REGEXP_MARK_START)) {
1977 target = state;
1978 break;
1979 }
1980 }
1981 }
1982 state = target;
1983 }
1984 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
1985 state = ctxt->states[statenr];
1986 if ((state != NULL) && (state->reached == XML_REGEXP_MARK_NORMAL)) {
1987 xmlRegFreeState(state);
1988 ctxt->states[statenr] = NULL;
1989 }
1990 }
1991
1992}
1993
1994static int
1995xmlFACompareRanges(xmlRegRangePtr range1, xmlRegRangePtr range2) {
1996 int ret = 0;
1997
1998 if ((range1->type == XML_REGEXP_RANGES) ||
1999 (range2->type == XML_REGEXP_RANGES) ||
2000 (range2->type == XML_REGEXP_SUBREG) ||
2001 (range1->type == XML_REGEXP_SUBREG) ||
2002 (range1->type == XML_REGEXP_STRING) ||
2003 (range2->type == XML_REGEXP_STRING))
2004 return(-1);
2005
2006 /* put them in order */
2007 if (range1->type > range2->type) {
2008 xmlRegRangePtr tmp;
2009
2010 tmp = range1;
2011 range1 = range2;
2012 range2 = tmp;
2013 }
2014 if ((range1->type == XML_REGEXP_ANYCHAR) ||
2015 (range2->type == XML_REGEXP_ANYCHAR)) {
2016 ret = 1;
2017 } else if ((range1->type == XML_REGEXP_EPSILON) ||
2018 (range2->type == XML_REGEXP_EPSILON)) {
2019 return(0);
2020 } else if (range1->type == range2->type) {
2021 if (range1->type != XML_REGEXP_CHARVAL)
2022 ret = 1;
2023 else if ((range1->end < range2->start) ||
2024 (range2->end < range1->start))
2025 ret = 0;
2026 else
2027 ret = 1;
2028 } else if (range1->type == XML_REGEXP_CHARVAL) {
2029 int codepoint;
2030 int neg = 0;
2031
2032 /*
2033 * just check all codepoints in the range for acceptance,
2034 * this is usually way cheaper since done only once at
2035 * compilation than testing over and over at runtime or
2036 * pushing too many states when evaluating.
2037 */
2038 if (((range1->neg == 0) && (range2->neg != 0)) ||
2039 ((range1->neg != 0) && (range2->neg == 0)))
2040 neg = 1;
2041
2042 for (codepoint = range1->start;codepoint <= range1->end ;codepoint++) {
2043 ret = xmlRegCheckCharacterRange(range2->type, codepoint,
2044 0, range2->start, range2->end,
2045 range2->blockName);
2046 if (ret < 0)
2047 return(-1);
2048 if (((neg == 1) && (ret == 0)) ||
2049 ((neg == 0) && (ret == 1)))
2050 return(1);
2051 }
2052 return(0);
2053 } else if ((range1->type == XML_REGEXP_BLOCK_NAME) ||
2054 (range2->type == XML_REGEXP_BLOCK_NAME)) {
2055 if (range1->type == range2->type) {
2056 ret = xmlStrEqual(range1->blockName, range2->blockName);
2057 } else {
2058 /*
2059 * comparing a block range with anything else is way
2060 * too costly, and maintaining the table is like too much
2061 * memory too, so let's force the automata to save state
2062 * here.
2063 */
2064 return(1);
2065 }
2066 } else if ((range1->type < XML_REGEXP_LETTER) ||
2067 (range2->type < XML_REGEXP_LETTER)) {
2068 if ((range1->type == XML_REGEXP_ANYSPACE) &&
2069 (range2->type == XML_REGEXP_NOTSPACE))
2070 ret = 0;
2071 else if ((range1->type == XML_REGEXP_INITNAME) &&
2072 (range2->type == XML_REGEXP_NOTINITNAME))
2073 ret = 0;
2074 else if ((range1->type == XML_REGEXP_NAMECHAR) &&
2075 (range2->type == XML_REGEXP_NOTNAMECHAR))
2076 ret = 0;
2077 else if ((range1->type == XML_REGEXP_DECIMAL) &&
2078 (range2->type == XML_REGEXP_NOTDECIMAL))
2079 ret = 0;
2080 else if ((range1->type == XML_REGEXP_REALCHAR) &&
2081 (range2->type == XML_REGEXP_NOTREALCHAR))
2082 ret = 0;
2083 else {
2084 /* same thing to limit complexity */
2085 return(1);
2086 }
2087 } else {
2088 ret = 0;
2089 /* range1->type < range2->type here */
2090 switch (range1->type) {
2091 case XML_REGEXP_LETTER:
2092 /* all disjoint except in the subgroups */
2093 if ((range2->type == XML_REGEXP_LETTER_UPPERCASE) ||
2094 (range2->type == XML_REGEXP_LETTER_LOWERCASE) ||
2095 (range2->type == XML_REGEXP_LETTER_TITLECASE) ||
2096 (range2->type == XML_REGEXP_LETTER_MODIFIER) ||
2097 (range2->type == XML_REGEXP_LETTER_OTHERS))
2098 ret = 1;
2099 break;
2100 case XML_REGEXP_MARK:
2101 if ((range2->type == XML_REGEXP_MARK_NONSPACING) ||
2102 (range2->type == XML_REGEXP_MARK_SPACECOMBINING) ||
2103 (range2->type == XML_REGEXP_MARK_ENCLOSING))
2104 ret = 1;
2105 break;
2106 case XML_REGEXP_NUMBER:
2107 if ((range2->type == XML_REGEXP_NUMBER_DECIMAL) ||
2108 (range2->type == XML_REGEXP_NUMBER_LETTER) ||
2109 (range2->type == XML_REGEXP_NUMBER_OTHERS))
2110 ret = 1;
2111 break;
2112 case XML_REGEXP_PUNCT:
2113 if ((range2->type == XML_REGEXP_PUNCT_CONNECTOR) ||
2114 (range2->type == XML_REGEXP_PUNCT_DASH) ||
2115 (range2->type == XML_REGEXP_PUNCT_OPEN) ||
2116 (range2->type == XML_REGEXP_PUNCT_CLOSE) ||
2117 (range2->type == XML_REGEXP_PUNCT_INITQUOTE) ||
2118 (range2->type == XML_REGEXP_PUNCT_FINQUOTE) ||
2119 (range2->type == XML_REGEXP_PUNCT_OTHERS))
2120 ret = 1;
2121 break;
2122 case XML_REGEXP_SEPAR:
2123 if ((range2->type == XML_REGEXP_SEPAR_SPACE) ||
2124 (range2->type == XML_REGEXP_SEPAR_LINE) ||
2125 (range2->type == XML_REGEXP_SEPAR_PARA))
2126 ret = 1;
2127 break;
2128 case XML_REGEXP_SYMBOL:
2129 if ((range2->type == XML_REGEXP_SYMBOL_MATH) ||
2130 (range2->type == XML_REGEXP_SYMBOL_CURRENCY) ||
2131 (range2->type == XML_REGEXP_SYMBOL_MODIFIER) ||
2132 (range2->type == XML_REGEXP_SYMBOL_OTHERS))
2133 ret = 1;
2134 break;
2135 case XML_REGEXP_OTHER:
2136 if ((range2->type == XML_REGEXP_OTHER_CONTROL) ||
2137 (range2->type == XML_REGEXP_OTHER_FORMAT) ||
2138 (range2->type == XML_REGEXP_OTHER_PRIVATE))
2139 ret = 1;
2140 break;
2141 default:
2142 if ((range2->type >= XML_REGEXP_LETTER) &&
2143 (range2->type < XML_REGEXP_BLOCK_NAME))
2144 ret = 0;
2145 else {
2146 /* safety net ! */
2147 return(1);
2148 }
2149 }
2150 }
2151 if (((range1->neg == 0) && (range2->neg != 0)) ||
2152 ((range1->neg != 0) && (range2->neg == 0)))
2153 ret = !ret;
2154 return(ret);
2155}
2156
2157/**
2158 * xmlFACompareAtomTypes:
2159 * @type1: an atom type
2160 * @type2: an atom type
2161 *
2162 * Compares two atoms type to check whether they intersect in some ways,
2163 * this is used by xmlFACompareAtoms only
2164 *
2165 * Returns 1 if they may intersect and 0 otherwise
2166 */
2167static int
2168xmlFACompareAtomTypes(xmlRegAtomType type1, xmlRegAtomType type2) {
2169 if ((type1 == XML_REGEXP_EPSILON) ||
2170 (type1 == XML_REGEXP_CHARVAL) ||
2171 (type1 == XML_REGEXP_RANGES) ||
2172 (type1 == XML_REGEXP_SUBREG) ||
2173 (type1 == XML_REGEXP_STRING) ||
2174 (type1 == XML_REGEXP_ANYCHAR))
2175 return(1);
2176 if ((type2 == XML_REGEXP_EPSILON) ||
2177 (type2 == XML_REGEXP_CHARVAL) ||
2178 (type2 == XML_REGEXP_RANGES) ||
2179 (type2 == XML_REGEXP_SUBREG) ||
2180 (type2 == XML_REGEXP_STRING) ||
2181 (type2 == XML_REGEXP_ANYCHAR))
2182 return(1);
2183
2184 if (type1 == type2) return(1);
2185
2186 /* simplify subsequent compares by making sure type1 < type2 */
2187 if (type1 > type2) {
2188 xmlRegAtomType tmp = type1;
2189 type1 = type2;
2190 type2 = tmp;
2191 }
2192 switch (type1) {
2193 case XML_REGEXP_ANYSPACE: /* \s */
2194 /* can't be a letter, number, mark, punctuation, symbol */
2195 if ((type2 == XML_REGEXP_NOTSPACE) ||
2196 ((type2 >= XML_REGEXP_LETTER) &&
2197 (type2 <= XML_REGEXP_LETTER_OTHERS)) ||
2198 ((type2 >= XML_REGEXP_NUMBER) &&
2199 (type2 <= XML_REGEXP_NUMBER_OTHERS)) ||
2200 ((type2 >= XML_REGEXP_MARK) &&
2201 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2202 ((type2 >= XML_REGEXP_PUNCT) &&
2203 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2204 ((type2 >= XML_REGEXP_SYMBOL) &&
2205 (type2 <= XML_REGEXP_SYMBOL_OTHERS))
2206 ) return(0);
2207 break;
2208 case XML_REGEXP_NOTSPACE: /* \S */
2209 break;
2210 case XML_REGEXP_INITNAME: /* \l */
2211 /* can't be a number, mark, separator, punctuation, symbol or other */
2212 if ((type2 == XML_REGEXP_NOTINITNAME) ||
2213 ((type2 >= XML_REGEXP_NUMBER) &&
2214 (type2 <= XML_REGEXP_NUMBER_OTHERS)) ||
2215 ((type2 >= XML_REGEXP_MARK) &&
2216 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2217 ((type2 >= XML_REGEXP_SEPAR) &&
2218 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2219 ((type2 >= XML_REGEXP_PUNCT) &&
2220 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2221 ((type2 >= XML_REGEXP_SYMBOL) &&
2222 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2223 ((type2 >= XML_REGEXP_OTHER) &&
2224 (type2 <= XML_REGEXP_OTHER_NA))
2225 ) return(0);
2226 break;
2227 case XML_REGEXP_NOTINITNAME: /* \L */
2228 break;
2229 case XML_REGEXP_NAMECHAR: /* \c */
2230 /* can't be a mark, separator, punctuation, symbol or other */
2231 if ((type2 == XML_REGEXP_NOTNAMECHAR) ||
2232 ((type2 >= XML_REGEXP_MARK) &&
2233 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2234 ((type2 >= XML_REGEXP_PUNCT) &&
2235 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2236 ((type2 >= XML_REGEXP_SEPAR) &&
2237 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2238 ((type2 >= XML_REGEXP_SYMBOL) &&
2239 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2240 ((type2 >= XML_REGEXP_OTHER) &&
2241 (type2 <= XML_REGEXP_OTHER_NA))
2242 ) return(0);
2243 break;
2244 case XML_REGEXP_NOTNAMECHAR: /* \C */
2245 break;
2246 case XML_REGEXP_DECIMAL: /* \d */
2247 /* can't be a letter, mark, separator, punctuation, symbol or other */
2248 if ((type2 == XML_REGEXP_NOTDECIMAL) ||
2249 (type2 == XML_REGEXP_REALCHAR) ||
2250 ((type2 >= XML_REGEXP_LETTER) &&
2251 (type2 <= XML_REGEXP_LETTER_OTHERS)) ||
2252 ((type2 >= XML_REGEXP_MARK) &&
2253 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2254 ((type2 >= XML_REGEXP_PUNCT) &&
2255 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2256 ((type2 >= XML_REGEXP_SEPAR) &&
2257 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2258 ((type2 >= XML_REGEXP_SYMBOL) &&
2259 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2260 ((type2 >= XML_REGEXP_OTHER) &&
2261 (type2 <= XML_REGEXP_OTHER_NA))
2262 )return(0);
2263 break;
2264 case XML_REGEXP_NOTDECIMAL: /* \D */
2265 break;
2266 case XML_REGEXP_REALCHAR: /* \w */
2267 /* can't be a mark, separator, punctuation, symbol or other */
2268 if ((type2 == XML_REGEXP_NOTDECIMAL) ||
2269 ((type2 >= XML_REGEXP_MARK) &&
2270 (type2 <= XML_REGEXP_MARK_ENCLOSING)) ||
2271 ((type2 >= XML_REGEXP_PUNCT) &&
2272 (type2 <= XML_REGEXP_PUNCT_OTHERS)) ||
2273 ((type2 >= XML_REGEXP_SEPAR) &&
2274 (type2 <= XML_REGEXP_SEPAR_PARA)) ||
2275 ((type2 >= XML_REGEXP_SYMBOL) &&
2276 (type2 <= XML_REGEXP_SYMBOL_OTHERS)) ||
2277 ((type2 >= XML_REGEXP_OTHER) &&
2278 (type2 <= XML_REGEXP_OTHER_NA))
2279 )return(0);
2280 break;
2281 case XML_REGEXP_NOTREALCHAR: /* \W */
2282 break;
2283 /*
2284 * at that point we know both type 1 and type2 are from
2285 * character categories are ordered and are different,
2286 * it becomes simple because this is a partition
2287 */
2288 case XML_REGEXP_LETTER:
2289 if (type2 <= XML_REGEXP_LETTER_OTHERS)
2290 return(1);
2291 return(0);
2292 case XML_REGEXP_LETTER_UPPERCASE:
2293 case XML_REGEXP_LETTER_LOWERCASE:
2294 case XML_REGEXP_LETTER_TITLECASE:
2295 case XML_REGEXP_LETTER_MODIFIER:
2296 case XML_REGEXP_LETTER_OTHERS:
2297 return(0);
2298 case XML_REGEXP_MARK:
2299 if (type2 <= XML_REGEXP_MARK_ENCLOSING)
2300 return(1);
2301 return(0);
2302 case XML_REGEXP_MARK_NONSPACING:
2303 case XML_REGEXP_MARK_SPACECOMBINING:
2304 case XML_REGEXP_MARK_ENCLOSING:
2305 return(0);
2306 case XML_REGEXP_NUMBER:
2307 if (type2 <= XML_REGEXP_NUMBER_OTHERS)
2308 return(1);
2309 return(0);
2310 case XML_REGEXP_NUMBER_DECIMAL:
2311 case XML_REGEXP_NUMBER_LETTER:
2312 case XML_REGEXP_NUMBER_OTHERS:
2313 return(0);
2314 case XML_REGEXP_PUNCT:
2315 if (type2 <= XML_REGEXP_PUNCT_OTHERS)
2316 return(1);
2317 return(0);
2318 case XML_REGEXP_PUNCT_CONNECTOR:
2319 case XML_REGEXP_PUNCT_DASH:
2320 case XML_REGEXP_PUNCT_OPEN:
2321 case XML_REGEXP_PUNCT_CLOSE:
2322 case XML_REGEXP_PUNCT_INITQUOTE:
2323 case XML_REGEXP_PUNCT_FINQUOTE:
2324 case XML_REGEXP_PUNCT_OTHERS:
2325 return(0);
2326 case XML_REGEXP_SEPAR:
2327 if (type2 <= XML_REGEXP_SEPAR_PARA)
2328 return(1);
2329 return(0);
2330 case XML_REGEXP_SEPAR_SPACE:
2331 case XML_REGEXP_SEPAR_LINE:
2332 case XML_REGEXP_SEPAR_PARA:
2333 return(0);
2334 case XML_REGEXP_SYMBOL:
2335 if (type2 <= XML_REGEXP_SYMBOL_OTHERS)
2336 return(1);
2337 return(0);
2338 case XML_REGEXP_SYMBOL_MATH:
2339 case XML_REGEXP_SYMBOL_CURRENCY:
2340 case XML_REGEXP_SYMBOL_MODIFIER:
2341 case XML_REGEXP_SYMBOL_OTHERS:
2342 return(0);
2343 case XML_REGEXP_OTHER:
2344 if (type2 <= XML_REGEXP_OTHER_NA)
2345 return(1);
2346 return(0);
2347 case XML_REGEXP_OTHER_CONTROL:
2348 case XML_REGEXP_OTHER_FORMAT:
2349 case XML_REGEXP_OTHER_PRIVATE:
2350 case XML_REGEXP_OTHER_NA:
2351 return(0);
2352 default:
2353 break;
2354 }
2355 return(1);
2356}
2357
2358/**
2359 * xmlFAEqualAtoms:
2360 * @atom1: an atom
2361 * @atom2: an atom
2362 * @deep: if not set only compare string pointers
2363 *
2364 * Compares two atoms to check whether they are the same exactly
2365 * this is used to remove equivalent transitions
2366 *
2367 * Returns 1 if same and 0 otherwise
2368 */
2369static int
2370xmlFAEqualAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2, int deep) {
2371 int ret = 0;
2372
2373 if (atom1 == atom2)
2374 return(1);
2375 if ((atom1 == NULL) || (atom2 == NULL))
2376 return(0);
2377
2378 if (atom1->type != atom2->type)
2379 return(0);
2380 switch (atom1->type) {
2381 case XML_REGEXP_EPSILON:
2382 ret = 0;
2383 break;
2384 case XML_REGEXP_STRING:
2385 if (!deep)
2386 ret = (atom1->valuep == atom2->valuep);
2387 else
2388 ret = xmlStrEqual((xmlChar *)atom1->valuep,
2389 (xmlChar *)atom2->valuep);
2390 break;
2391 case XML_REGEXP_CHARVAL:
2392 ret = (atom1->codepoint == atom2->codepoint);
2393 break;
2394 case XML_REGEXP_RANGES:
2395 /* too hard to do in the general case */
2396 ret = 0;
2397 default:
2398 break;
2399 }
2400 return(ret);
2401}
2402
2403/**
2404 * xmlFACompareAtoms:
2405 * @atom1: an atom
2406 * @atom2: an atom
2407 * @deep: if not set only compare string pointers
2408 *
2409 * Compares two atoms to check whether they intersect in some ways,
2410 * this is used by xmlFAComputesDeterminism and xmlFARecurseDeterminism only
2411 *
2412 * Returns 1 if yes and 0 otherwise
2413 */
2414static int
2415xmlFACompareAtoms(xmlRegAtomPtr atom1, xmlRegAtomPtr atom2, int deep) {
2416 int ret = 1;
2417
2418 if (atom1 == atom2)
2419 return(1);
2420 if ((atom1 == NULL) || (atom2 == NULL))
2421 return(0);
2422
2423 if ((atom1->type == XML_REGEXP_ANYCHAR) ||
2424 (atom2->type == XML_REGEXP_ANYCHAR))
2425 return(1);
2426
2427 if (atom1->type > atom2->type) {
2428 xmlRegAtomPtr tmp;
2429 tmp = atom1;
2430 atom1 = atom2;
2431 atom2 = tmp;
2432 }
2433 if (atom1->type != atom2->type) {
2434 ret = xmlFACompareAtomTypes(atom1->type, atom2->type);
2435 /* if they can't intersect at the type level break now */
2436 if (ret == 0)
2437 return(0);
2438 }
2439 switch (atom1->type) {
2440 case XML_REGEXP_STRING:
2441 if (!deep)
2442 ret = (atom1->valuep != atom2->valuep);
2443 else {
2444 xmlChar *val1 = (xmlChar *)atom1->valuep;
2445 xmlChar *val2 = (xmlChar *)atom2->valuep;
2446 int compound1 = (xmlStrchr(val1, '|') != NULL);
2447 int compound2 = (xmlStrchr(val2, '|') != NULL);
2448
2449 /* Ignore negative match flag for ##other namespaces */
2450 if (compound1 != compound2)
2451 return(0);
2452
2453 ret = xmlRegStrEqualWildcard(val1, val2);
2454 }
2455 break;
2456 case XML_REGEXP_EPSILON:
2457 goto not_determinist;
2458 case XML_REGEXP_CHARVAL:
2459 if (atom2->type == XML_REGEXP_CHARVAL) {
2460 ret = (atom1->codepoint == atom2->codepoint);
2461 } else {
2462 ret = xmlRegCheckCharacter(atom2, atom1->codepoint);
2463 if (ret < 0)
2464 ret = 1;
2465 }
2466 break;
2467 case XML_REGEXP_RANGES:
2468 if (atom2->type == XML_REGEXP_RANGES) {
2469 int i, j, res;
2470 xmlRegRangePtr r1, r2;
2471
2472 /*
2473 * need to check that none of the ranges eventually matches
2474 */
2475 for (i = 0;i < atom1->nbRanges;i++) {
2476 for (j = 0;j < atom2->nbRanges;j++) {
2477 r1 = atom1->ranges[i];
2478 r2 = atom2->ranges[j];
2479 res = xmlFACompareRanges(r1, r2);
2480 if (res == 1) {
2481 ret = 1;
2482 goto done;
2483 }
2484 }
2485 }
2486 ret = 0;
2487 }
2488 break;
2489 default:
2490 goto not_determinist;
2491 }
2492done:
2493 if (atom1->neg != atom2->neg) {
2494 ret = !ret;
2495 }
2496 if (ret == 0)
2497 return(0);
2498not_determinist:
2499 return(1);
2500}
2501
2502/**
2503 * xmlFARecurseDeterminism:
2504 * @ctxt: a regexp parser context
2505 *
2506 * Check whether the associated regexp is determinist,
2507 * should be called after xmlFAEliminateEpsilonTransitions()
2508 *
2509 */
2510static int
2511xmlFARecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state,
2512 int fromnr, int tonr, xmlRegAtomPtr atom) {
2513 int ret = 1;
2514 int res;
2515 int transnr, nbTrans;
2516 xmlRegTransPtr t1;
2517 int deep = 1;
2518
2519 if (state == NULL)
2520 return(ret);
2521 if (state->markd == XML_REGEXP_MARK_VISITED)
2522 return(ret);
2523
2524 if (ctxt->flags & AM_AUTOMATA_RNG)
2525 deep = 0;
2526
2527 /*
2528 * don't recurse on transitions potentially added in the course of
2529 * the elimination.
2530 */
2531 nbTrans = state->nbTrans;
2532 for (transnr = 0;transnr < nbTrans;transnr++) {
2533 t1 = &(state->trans[transnr]);
2534 /*
2535 * check transitions conflicting with the one looked at
2536 */
2537 if ((t1->to < 0) || (t1->to == fromnr))
2538 continue;
2539 if (t1->atom == NULL) {
2540 state->markd = XML_REGEXP_MARK_VISITED;
2541 res = xmlFARecurseDeterminism(ctxt, ctxt->states[t1->to],
2542 fromnr, tonr, atom);
2543 if (res == 0) {
2544 ret = 0;
2545 /* t1->nd = 1; */
2546 }
2547 continue;
2548 }
2549 if (xmlFACompareAtoms(t1->atom, atom, deep)) {
2550 /* Treat equal transitions as deterministic. */
2551 if ((t1->to != tonr) ||
2552 (!xmlFAEqualAtoms(t1->atom, atom, deep)))
2553 ret = 0;
2554 /* mark the transition as non-deterministic */
2555 t1->nd = 1;
2556 }
2557 }
2558 return(ret);
2559}
2560
2561/**
2562 * xmlFAFinishRecurseDeterminism:
2563 * @ctxt: a regexp parser context
2564 *
2565 * Reset flags after checking determinism.
2566 */
2567static void
2568xmlFAFinishRecurseDeterminism(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr state) {
2569 int transnr, nbTrans;
2570
2571 if (state == NULL)
2572 return;
2573 if (state->markd != XML_REGEXP_MARK_VISITED)
2574 return;
2575 state->markd = 0;
2576
2577 nbTrans = state->nbTrans;
2578 for (transnr = 0; transnr < nbTrans; transnr++) {
2579 xmlRegTransPtr t1 = &state->trans[transnr];
2580 if ((t1->atom == NULL) && (t1->to >= 0))
2581 xmlFAFinishRecurseDeterminism(ctxt, ctxt->states[t1->to]);
2582 }
2583}
2584
2585/**
2586 * xmlFAComputesDeterminism:
2587 * @ctxt: a regexp parser context
2588 *
2589 * Check whether the associated regexp is determinist,
2590 * should be called after xmlFAEliminateEpsilonTransitions()
2591 *
2592 */
2593static int
2594xmlFAComputesDeterminism(xmlRegParserCtxtPtr ctxt) {
2595 int statenr, transnr;
2596 xmlRegStatePtr state;
2597 xmlRegTransPtr t1, t2, last;
2598 int i;
2599 int ret = 1;
2600 int deep = 1;
2601
2602 if (ctxt->determinist != -1)
2603 return(ctxt->determinist);
2604
2605 if (ctxt->flags & AM_AUTOMATA_RNG)
2606 deep = 0;
2607
2608 /*
2609 * First cleanup the automata removing cancelled transitions
2610 */
2611 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2612 state = ctxt->states[statenr];
2613 if (state == NULL)
2614 continue;
2615 if (state->nbTrans < 2)
2616 continue;
2617 for (transnr = 0;transnr < state->nbTrans;transnr++) {
2618 t1 = &(state->trans[transnr]);
2619 /*
2620 * Determinism checks in case of counted or all transitions
2621 * will have to be handled separately
2622 */
2623 if (t1->atom == NULL) {
2624 /* t1->nd = 1; */
2625 continue;
2626 }
2627 if (t1->to < 0) /* eliminated */
2628 continue;
2629 for (i = 0;i < transnr;i++) {
2630 t2 = &(state->trans[i]);
2631 if (t2->to < 0) /* eliminated */
2632 continue;
2633 if (t2->atom != NULL) {
2634 if (t1->to == t2->to) {
2635 /*
2636 * Here we use deep because we want to keep the
2637 * transitions which indicate a conflict
2638 */
2639 if (xmlFAEqualAtoms(t1->atom, t2->atom, deep) &&
2640 (t1->counter == t2->counter) &&
2641 (t1->count == t2->count))
2642 t2->to = -1; /* eliminated */
2643 }
2644 }
2645 }
2646 }
2647 }
2648
2649 /*
2650 * Check for all states that there aren't 2 transitions
2651 * with the same atom and a different target.
2652 */
2653 for (statenr = 0;statenr < ctxt->nbStates;statenr++) {
2654 state = ctxt->states[statenr];
2655 if (state == NULL)
2656 continue;
2657 if (state->nbTrans < 2)
2658 continue;
2659 last = NULL;
2660 for (transnr = 0;transnr < state->nbTrans;transnr++) {
2661 t1 = &(state->trans[transnr]);
2662 /*
2663 * Determinism checks in case of counted or all transitions
2664 * will have to be handled separately
2665 */
2666 if (t1->atom == NULL) {
2667 continue;
2668 }
2669 if (t1->to < 0) /* eliminated */
2670 continue;
2671 for (i = 0;i < transnr;i++) {
2672 t2 = &(state->trans[i]);
2673 if (t2->to < 0) /* eliminated */
2674 continue;
2675 if (t2->atom != NULL) {
2676 /*
2677 * But here we don't use deep because we want to
2678 * find transitions which indicate a conflict
2679 */
2680 if (xmlFACompareAtoms(t1->atom, t2->atom, 1)) {
2681 /*
2682 * Treat equal counter transitions that couldn't be
2683 * eliminated as deterministic.
2684 */
2685 if ((t1->to != t2->to) ||
2686 (t1->counter == t2->counter) ||
2687 (!xmlFAEqualAtoms(t1->atom, t2->atom, deep)))
2688 ret = 0;
2689 /* mark the transitions as non-deterministic ones */
2690 t1->nd = 1;
2691 t2->nd = 1;
2692 last = t1;
2693 }
2694 } else {
2695 int res;
2696
2697 /*
2698 * do the closure in case of remaining specific
2699 * epsilon transitions like choices or all
2700 */
2701 res = xmlFARecurseDeterminism(ctxt, ctxt->states[t2->to],
2702 statenr, t1->to, t1->atom);
2703 xmlFAFinishRecurseDeterminism(ctxt, ctxt->states[t2->to]);
2704 /* don't shortcut the computation so all non deterministic
2705 transition get marked down
2706 if (ret == 0)
2707 return(0);
2708 */
2709 if (res == 0) {
2710 t1->nd = 1;
2711 /* t2->nd = 1; */
2712 last = t1;
2713 ret = 0;
2714 }
2715 }
2716 }
2717 /* don't shortcut the computation so all non deterministic
2718 transition get marked down
2719 if (ret == 0)
2720 break; */
2721 }
2722
2723 /*
2724 * mark specifically the last non-deterministic transition
2725 * from a state since there is no need to set-up rollback
2726 * from it
2727 */
2728 if (last != NULL) {
2729 last->nd = 2;
2730 }
2731
2732 /* don't shortcut the computation so all non deterministic
2733 transition get marked down
2734 if (ret == 0)
2735 break; */
2736 }
2737
2738 ctxt->determinist = ret;
2739 return(ret);
2740}
2741
2742/************************************************************************
2743 * *
2744 * Routines to check input against transition atoms *
2745 * *
2746 ************************************************************************/
2747
2748static int
2749xmlRegCheckCharacterRange(xmlRegAtomType type, int codepoint, int neg,
2750 int start, int end, const xmlChar *blockName) {
2751 int ret = 0;
2752
2753 switch (type) {
2754 case XML_REGEXP_STRING:
2755 case XML_REGEXP_SUBREG:
2756 case XML_REGEXP_RANGES:
2757 case XML_REGEXP_EPSILON:
2758 return(-1);
2759 case XML_REGEXP_ANYCHAR:
2760 ret = ((codepoint != '\n') && (codepoint != '\r'));
2761 break;
2762 case XML_REGEXP_CHARVAL:
2763 ret = ((codepoint >= start) && (codepoint <= end));
2764 break;
2765 case XML_REGEXP_NOTSPACE:
2766 neg = !neg;
2767 /* Falls through. */
2768 case XML_REGEXP_ANYSPACE:
2769 ret = ((codepoint == '\n') || (codepoint == '\r') ||
2770 (codepoint == '\t') || (codepoint == ' '));
2771 break;
2772 case XML_REGEXP_NOTINITNAME:
2773 neg = !neg;
2774 /* Falls through. */
2775 case XML_REGEXP_INITNAME:
2776 ret = (IS_LETTER(codepoint) ||
2777 (codepoint == '_') || (codepoint == ':'));
2778 break;
2779 case XML_REGEXP_NOTNAMECHAR:
2780 neg = !neg;
2781 /* Falls through. */
2782 case XML_REGEXP_NAMECHAR:
2783 ret = (IS_LETTER(codepoint) || IS_DIGIT(codepoint) ||
2784 (codepoint == '.') || (codepoint == '-') ||
2785 (codepoint == '_') || (codepoint == ':') ||
2786 IS_COMBINING(codepoint) || IS_EXTENDER(codepoint));
2787 break;
2788 case XML_REGEXP_NOTDECIMAL:
2789 neg = !neg;
2790 /* Falls through. */
2791 case XML_REGEXP_DECIMAL:
2792 ret = xmlUCSIsCatNd(codepoint);
2793 break;
2794 case XML_REGEXP_REALCHAR:
2795 neg = !neg;
2796 /* Falls through. */
2797 case XML_REGEXP_NOTREALCHAR:
2798 ret = xmlUCSIsCatP(codepoint);
2799 if (ret == 0)
2800 ret = xmlUCSIsCatZ(codepoint);
2801 if (ret == 0)
2802 ret = xmlUCSIsCatC(codepoint);
2803 break;
2804 case XML_REGEXP_LETTER:
2805 ret = xmlUCSIsCatL(codepoint);
2806 break;
2807 case XML_REGEXP_LETTER_UPPERCASE:
2808 ret = xmlUCSIsCatLu(codepoint);
2809 break;
2810 case XML_REGEXP_LETTER_LOWERCASE:
2811 ret = xmlUCSIsCatLl(codepoint);
2812 break;
2813 case XML_REGEXP_LETTER_TITLECASE:
2814 ret = xmlUCSIsCatLt(codepoint);
2815 break;
2816 case XML_REGEXP_LETTER_MODIFIER:
2817 ret = xmlUCSIsCatLm(codepoint);
2818 break;
2819 case XML_REGEXP_LETTER_OTHERS:
2820 ret = xmlUCSIsCatLo(codepoint);
2821 break;
2822 case XML_REGEXP_MARK:
2823 ret = xmlUCSIsCatM(codepoint);
2824 break;
2825 case XML_REGEXP_MARK_NONSPACING:
2826 ret = xmlUCSIsCatMn(codepoint);
2827 break;
2828 case XML_REGEXP_MARK_SPACECOMBINING:
2829 ret = xmlUCSIsCatMc(codepoint);
2830 break;
2831 case XML_REGEXP_MARK_ENCLOSING:
2832 ret = xmlUCSIsCatMe(codepoint);
2833 break;
2834 case XML_REGEXP_NUMBER:
2835 ret = xmlUCSIsCatN(codepoint);
2836 break;
2837 case XML_REGEXP_NUMBER_DECIMAL:
2838 ret = xmlUCSIsCatNd(codepoint);
2839 break;
2840 case XML_REGEXP_NUMBER_LETTER:
2841 ret = xmlUCSIsCatNl(codepoint);
2842 break;
2843 case XML_REGEXP_NUMBER_OTHERS:
2844 ret = xmlUCSIsCatNo(codepoint);
2845 break;
2846 case XML_REGEXP_PUNCT:
2847 ret = xmlUCSIsCatP(codepoint);
2848 break;
2849 case XML_REGEXP_PUNCT_CONNECTOR:
2850 ret = xmlUCSIsCatPc(codepoint);
2851 break;
2852 case XML_REGEXP_PUNCT_DASH:
2853 ret = xmlUCSIsCatPd(codepoint);
2854 break;
2855 case XML_REGEXP_PUNCT_OPEN:
2856 ret = xmlUCSIsCatPs(codepoint);
2857 break;
2858 case XML_REGEXP_PUNCT_CLOSE:
2859 ret = xmlUCSIsCatPe(codepoint);
2860 break;
2861 case XML_REGEXP_PUNCT_INITQUOTE:
2862 ret = xmlUCSIsCatPi(codepoint);
2863 break;
2864 case XML_REGEXP_PUNCT_FINQUOTE:
2865 ret = xmlUCSIsCatPf(codepoint);
2866 break;
2867 case XML_REGEXP_PUNCT_OTHERS:
2868 ret = xmlUCSIsCatPo(codepoint);
2869 break;
2870 case XML_REGEXP_SEPAR:
2871 ret = xmlUCSIsCatZ(codepoint);
2872 break;
2873 case XML_REGEXP_SEPAR_SPACE:
2874 ret = xmlUCSIsCatZs(codepoint);
2875 break;
2876 case XML_REGEXP_SEPAR_LINE:
2877 ret = xmlUCSIsCatZl(codepoint);
2878 break;
2879 case XML_REGEXP_SEPAR_PARA:
2880 ret = xmlUCSIsCatZp(codepoint);
2881 break;
2882 case XML_REGEXP_SYMBOL:
2883 ret = xmlUCSIsCatS(codepoint);
2884 break;
2885 case XML_REGEXP_SYMBOL_MATH:
2886 ret = xmlUCSIsCatSm(codepoint);
2887 break;
2888 case XML_REGEXP_SYMBOL_CURRENCY:
2889 ret = xmlUCSIsCatSc(codepoint);
2890 break;
2891 case XML_REGEXP_SYMBOL_MODIFIER:
2892 ret = xmlUCSIsCatSk(codepoint);
2893 break;
2894 case XML_REGEXP_SYMBOL_OTHERS:
2895 ret = xmlUCSIsCatSo(codepoint);
2896 break;
2897 case XML_REGEXP_OTHER:
2898 ret = xmlUCSIsCatC(codepoint);
2899 break;
2900 case XML_REGEXP_OTHER_CONTROL:
2901 ret = xmlUCSIsCatCc(codepoint);
2902 break;
2903 case XML_REGEXP_OTHER_FORMAT:
2904 ret = xmlUCSIsCatCf(codepoint);
2905 break;
2906 case XML_REGEXP_OTHER_PRIVATE:
2907 ret = xmlUCSIsCatCo(codepoint);
2908 break;
2909 case XML_REGEXP_OTHER_NA:
2910 /* ret = xmlUCSIsCatCn(codepoint); */
2911 /* Seems it doesn't exist anymore in recent Unicode releases */
2912 ret = 0;
2913 break;
2914 case XML_REGEXP_BLOCK_NAME:
2915 ret = xmlUCSIsBlock(codepoint, (const char *) blockName);
2916 break;
2917 }
2918 if (neg)
2919 return(!ret);
2920 return(ret);
2921}
2922
2923static int
2924xmlRegCheckCharacter(xmlRegAtomPtr atom, int codepoint) {
2925 int i, ret = 0;
2926 xmlRegRangePtr range;
2927
2928 if ((atom == NULL) || (!IS_CHAR(codepoint)))
2929 return(-1);
2930
2931 switch (atom->type) {
2932 case XML_REGEXP_SUBREG:
2933 case XML_REGEXP_EPSILON:
2934 return(-1);
2935 case XML_REGEXP_CHARVAL:
2936 return(codepoint == atom->codepoint);
2937 case XML_REGEXP_RANGES: {
2938 int accept = 0;
2939
2940 for (i = 0;i < atom->nbRanges;i++) {
2941 range = atom->ranges[i];
2942 if (range->neg == 2) {
2943 ret = xmlRegCheckCharacterRange(range->type, codepoint,
2944 0, range->start, range->end,
2945 range->blockName);
2946 if (ret != 0)
2947 return(0); /* excluded char */
2948 } else if (range->neg) {
2949 ret = xmlRegCheckCharacterRange(range->type, codepoint,
2950 0, range->start, range->end,
2951 range->blockName);
2952 if (ret == 0)
2953 accept = 1;
2954 else
2955 return(0);
2956 } else {
2957 ret = xmlRegCheckCharacterRange(range->type, codepoint,
2958 0, range->start, range->end,
2959 range->blockName);
2960 if (ret != 0)
2961 accept = 1; /* might still be excluded */
2962 }
2963 }
2964 return(accept);
2965 }
2966 case XML_REGEXP_STRING:
2967 printf("TODO: XML_REGEXP_STRING\n");
2968 return(-1);
2969 case XML_REGEXP_ANYCHAR:
2970 case XML_REGEXP_ANYSPACE:
2971 case XML_REGEXP_NOTSPACE:
2972 case XML_REGEXP_INITNAME:
2973 case XML_REGEXP_NOTINITNAME:
2974 case XML_REGEXP_NAMECHAR:
2975 case XML_REGEXP_NOTNAMECHAR:
2976 case XML_REGEXP_DECIMAL:
2977 case XML_REGEXP_NOTDECIMAL:
2978 case XML_REGEXP_REALCHAR:
2979 case XML_REGEXP_NOTREALCHAR:
2980 case XML_REGEXP_LETTER:
2981 case XML_REGEXP_LETTER_UPPERCASE:
2982 case XML_REGEXP_LETTER_LOWERCASE:
2983 case XML_REGEXP_LETTER_TITLECASE:
2984 case XML_REGEXP_LETTER_MODIFIER:
2985 case XML_REGEXP_LETTER_OTHERS:
2986 case XML_REGEXP_MARK:
2987 case XML_REGEXP_MARK_NONSPACING:
2988 case XML_REGEXP_MARK_SPACECOMBINING:
2989 case XML_REGEXP_MARK_ENCLOSING:
2990 case XML_REGEXP_NUMBER:
2991 case XML_REGEXP_NUMBER_DECIMAL:
2992 case XML_REGEXP_NUMBER_LETTER:
2993 case XML_REGEXP_NUMBER_OTHERS:
2994 case XML_REGEXP_PUNCT:
2995 case XML_REGEXP_PUNCT_CONNECTOR:
2996 case XML_REGEXP_PUNCT_DASH:
2997 case XML_REGEXP_PUNCT_OPEN:
2998 case XML_REGEXP_PUNCT_CLOSE:
2999 case XML_REGEXP_PUNCT_INITQUOTE:
3000 case XML_REGEXP_PUNCT_FINQUOTE:
3001 case XML_REGEXP_PUNCT_OTHERS:
3002 case XML_REGEXP_SEPAR:
3003 case XML_REGEXP_SEPAR_SPACE:
3004 case XML_REGEXP_SEPAR_LINE:
3005 case XML_REGEXP_SEPAR_PARA:
3006 case XML_REGEXP_SYMBOL:
3007 case XML_REGEXP_SYMBOL_MATH:
3008 case XML_REGEXP_SYMBOL_CURRENCY:
3009 case XML_REGEXP_SYMBOL_MODIFIER:
3010 case XML_REGEXP_SYMBOL_OTHERS:
3011 case XML_REGEXP_OTHER:
3012 case XML_REGEXP_OTHER_CONTROL:
3013 case XML_REGEXP_OTHER_FORMAT:
3014 case XML_REGEXP_OTHER_PRIVATE:
3015 case XML_REGEXP_OTHER_NA:
3016 case XML_REGEXP_BLOCK_NAME:
3017 ret = xmlRegCheckCharacterRange(atom->type, codepoint, 0, 0, 0,
3018 (const xmlChar *)atom->valuep);
3019 if (atom->neg)
3020 ret = !ret;
3021 break;
3022 }
3023 return(ret);
3024}
3025
3026/************************************************************************
3027 * *
3028 * Saving and restoring state of an execution context *
3029 * *
3030 ************************************************************************/
3031
3032static void
3033xmlFARegExecSave(xmlRegExecCtxtPtr exec) {
3034#ifdef MAX_PUSH
3035 if (exec->nbPush > MAX_PUSH) {
3036 exec->status = XML_REGEXP_INTERNAL_LIMIT;
3037 return;
3038 }
3039 exec->nbPush++;
3040#endif
3041
3042 if (exec->maxRollbacks == 0) {
3043 exec->maxRollbacks = 4;
3044 exec->rollbacks = (xmlRegExecRollback *) xmlMalloc(exec->maxRollbacks *
3045 sizeof(xmlRegExecRollback));
3046 if (exec->rollbacks == NULL) {
3047 exec->maxRollbacks = 0;
3048 exec->status = XML_REGEXP_OUT_OF_MEMORY;
3049 return;
3050 }
3051 memset(exec->rollbacks, 0,
3052 exec->maxRollbacks * sizeof(xmlRegExecRollback));
3053 } else if (exec->nbRollbacks >= exec->maxRollbacks) {
3054 xmlRegExecRollback *tmp;
3055 int len = exec->maxRollbacks;
3056
3057 exec->maxRollbacks *= 2;
3058 tmp = (xmlRegExecRollback *) xmlRealloc(exec->rollbacks,
3059 exec->maxRollbacks * sizeof(xmlRegExecRollback));
3060 if (tmp == NULL) {
3061 exec->maxRollbacks /= 2;
3062 exec->status = XML_REGEXP_OUT_OF_MEMORY;
3063 return;
3064 }
3065 exec->rollbacks = tmp;
3066 tmp = &exec->rollbacks[len];
3067 memset(tmp, 0, (exec->maxRollbacks - len) * sizeof(xmlRegExecRollback));
3068 }
3069 exec->rollbacks[exec->nbRollbacks].state = exec->state;
3070 exec->rollbacks[exec->nbRollbacks].index = exec->index;
3071 exec->rollbacks[exec->nbRollbacks].nextbranch = exec->transno + 1;
3072 if (exec->comp->nbCounters > 0) {
3073 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
3074 exec->rollbacks[exec->nbRollbacks].counts = (int *)
3075 xmlMalloc(exec->comp->nbCounters * sizeof(int));
3076 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
3077 exec->status = XML_REGEXP_OUT_OF_MEMORY;
3078 return;
3079 }
3080 }
3081 memcpy(exec->rollbacks[exec->nbRollbacks].counts, exec->counts,
3082 exec->comp->nbCounters * sizeof(int));
3083 }
3084 exec->nbRollbacks++;
3085}
3086
3087static void
3088xmlFARegExecRollBack(xmlRegExecCtxtPtr exec) {
3089 if (exec->status != XML_REGEXP_OK)
3090 return;
3091 if (exec->nbRollbacks <= 0) {
3092 exec->status = XML_REGEXP_NOT_FOUND;
3093 return;
3094 }
3095 exec->nbRollbacks--;
3096 exec->state = exec->rollbacks[exec->nbRollbacks].state;
3097 exec->index = exec->rollbacks[exec->nbRollbacks].index;
3098 exec->transno = exec->rollbacks[exec->nbRollbacks].nextbranch;
3099 if (exec->comp->nbCounters > 0) {
3100 if (exec->rollbacks[exec->nbRollbacks].counts == NULL) {
3101 fprintf(stderr, "exec save: allocation failed");
3102 exec->status = XML_REGEXP_INTERNAL_ERROR;
3103 return;
3104 }
3105 if (exec->counts) {
3106 memcpy(exec->counts, exec->rollbacks[exec->nbRollbacks].counts,
3107 exec->comp->nbCounters * sizeof(int));
3108 }
3109 }
3110}
3111
3112/************************************************************************
3113 * *
3114 * Verifier, running an input against a compiled regexp *
3115 * *
3116 ************************************************************************/
3117
3118static int
3119xmlFARegExec(xmlRegexpPtr comp, const xmlChar *content) {
3120 xmlRegExecCtxt execval;
3121 xmlRegExecCtxtPtr exec = &execval;
3122 int ret, codepoint = 0, len, deter;
3123
3124 exec->inputString = content;
3125 exec->index = 0;
3126 exec->nbPush = 0;
3127 exec->determinist = 1;
3128 exec->maxRollbacks = 0;
3129 exec->nbRollbacks = 0;
3130 exec->rollbacks = NULL;
3131 exec->status = XML_REGEXP_OK;
3132 exec->comp = comp;
3133 exec->state = comp->states[0];
3134 exec->transno = 0;
3135 exec->transcount = 0;
3136 exec->inputStack = NULL;
3137 exec->inputStackMax = 0;
3138 if (comp->nbCounters > 0) {
3139 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int));
3140 if (exec->counts == NULL) {
3141 return(XML_REGEXP_OUT_OF_MEMORY);
3142 }
3143 memset(exec->counts, 0, comp->nbCounters * sizeof(int));
3144 } else
3145 exec->counts = NULL;
3146 while ((exec->status == XML_REGEXP_OK) && (exec->state != NULL) &&
3147 ((exec->inputString[exec->index] != 0) ||
3148 ((exec->state != NULL) &&
3149 (exec->state->type != XML_REGEXP_FINAL_STATE)))) {
3150 xmlRegTransPtr trans;
3151 xmlRegAtomPtr atom;
3152
3153 /*
3154 * If end of input on non-terminal state, rollback, however we may
3155 * still have epsilon like transition for counted transitions
3156 * on counters, in that case don't break too early. Additionally,
3157 * if we are working on a range like "AB{0,2}", where B is not present,
3158 * we don't want to break.
3159 */
3160 len = 1;
3161 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL)) {
3162 /*
3163 * if there is a transition, we must check if
3164 * atom allows minOccurs of 0
3165 */
3166 if (exec->transno < exec->state->nbTrans) {
3167 trans = &exec->state->trans[exec->transno];
3168 if (trans->to >=0) {
3169 atom = trans->atom;
3170 if (!((atom->min == 0) && (atom->max > 0)))
3171 goto rollback;
3172 }
3173 } else
3174 goto rollback;
3175 }
3176
3177 exec->transcount = 0;
3178 for (;exec->transno < exec->state->nbTrans;exec->transno++) {
3179 trans = &exec->state->trans[exec->transno];
3180 if (trans->to < 0)
3181 continue;
3182 atom = trans->atom;
3183 ret = 0;
3184 deter = 1;
3185 if (trans->count >= 0) {
3186 int count;
3187 xmlRegCounterPtr counter;
3188
3189 if (exec->counts == NULL) {
3190 exec->status = XML_REGEXP_INTERNAL_ERROR;
3191 goto error;
3192 }
3193 /*
3194 * A counted transition.
3195 */
3196
3197 count = exec->counts[trans->count];
3198 counter = &exec->comp->counters[trans->count];
3199 ret = ((count >= counter->min) && (count <= counter->max));
3200 if ((ret) && (counter->min != counter->max))
3201 deter = 0;
3202 } else if (atom == NULL) {
3203 fprintf(stderr, "epsilon transition left at runtime\n");
3204 exec->status = XML_REGEXP_INTERNAL_ERROR;
3205 break;
3206 } else if (exec->inputString[exec->index] != 0) {
3207 len = 4;
3208 codepoint = xmlGetUTF8Char(&exec->inputString[exec->index],
3209 &len);
3210 if (codepoint < 0) {
3211 exec->status = XML_REGEXP_INVALID_UTF8;
3212 goto error;
3213 }
3214 ret = xmlRegCheckCharacter(atom, codepoint);
3215 if ((ret == 1) && (atom->min >= 0) && (atom->max > 0)) {
3216 xmlRegStatePtr to = comp->states[trans->to];
3217
3218 /*
3219 * this is a multiple input sequence
3220 * If there is a counter associated increment it now.
3221 * do not increment if the counter is already over the
3222 * maximum limit in which case get to next transition
3223 */
3224 if (trans->counter >= 0) {
3225 xmlRegCounterPtr counter;
3226
3227 if ((exec->counts == NULL) ||
3228 (exec->comp == NULL) ||
3229 (exec->comp->counters == NULL)) {
3230 exec->status = XML_REGEXP_INTERNAL_ERROR;
3231 goto error;
3232 }
3233 counter = &exec->comp->counters[trans->counter];
3234 if (exec->counts[trans->counter] >= counter->max)
3235 continue; /* for loop on transitions */
3236 }
3237 /* Save before incrementing */
3238 if (exec->state->nbTrans > exec->transno + 1) {
3239 xmlFARegExecSave(exec);
3240 if (exec->status != XML_REGEXP_OK)
3241 goto error;
3242 }
3243 if (trans->counter >= 0) {
3244 exec->counts[trans->counter]++;
3245 }
3246 exec->transcount = 1;
3247 do {
3248 /*
3249 * Try to progress as much as possible on the input
3250 */
3251 if (exec->transcount == atom->max) {
3252 break;
3253 }
3254 exec->index += len;
3255 /*
3256 * End of input: stop here
3257 */
3258 if (exec->inputString[exec->index] == 0) {
3259 exec->index -= len;
3260 break;
3261 }
3262 if (exec->transcount >= atom->min) {
3263 int transno = exec->transno;
3264 xmlRegStatePtr state = exec->state;
3265
3266 /*
3267 * The transition is acceptable save it
3268 */
3269 exec->transno = -1; /* trick */
3270 exec->state = to;
3271 xmlFARegExecSave(exec);
3272 if (exec->status != XML_REGEXP_OK)
3273 goto error;
3274 exec->transno = transno;
3275 exec->state = state;
3276 }
3277 len = 4;
3278 codepoint = xmlGetUTF8Char(
3279 &exec->inputString[exec->index], &len);
3280 if (codepoint < 0) {
3281 exec->status = XML_REGEXP_INVALID_UTF8;
3282 goto error;
3283 }
3284 ret = xmlRegCheckCharacter(atom, codepoint);
3285 exec->transcount++;
3286 } while (ret == 1);
3287 if (exec->transcount < atom->min)
3288 ret = 0;
3289
3290 /*
3291 * If the last check failed but one transition was found
3292 * possible, rollback
3293 */
3294 if (ret < 0)
3295 ret = 0;
3296 if (ret == 0) {
3297 goto rollback;
3298 }
3299 if (trans->counter >= 0) {
3300 if (exec->counts == NULL) {
3301 exec->status = XML_REGEXP_INTERNAL_ERROR;
3302 goto error;
3303 }
3304 exec->counts[trans->counter]--;
3305 }
3306 } else if ((ret == 0) && (atom->min == 0) && (atom->max > 0)) {
3307 /*
3308 * we don't match on the codepoint, but minOccurs of 0
3309 * says that's ok. Setting len to 0 inhibits stepping
3310 * over the codepoint.
3311 */
3312 exec->transcount = 1;
3313 len = 0;
3314 ret = 1;
3315 }
3316 } else if ((atom->min == 0) && (atom->max > 0)) {
3317 /* another spot to match when minOccurs is 0 */
3318 exec->transcount = 1;
3319 len = 0;
3320 ret = 1;
3321 }
3322 if (ret == 1) {
3323 if ((trans->nd == 1) ||
3324 ((trans->count >= 0) && (deter == 0) &&
3325 (exec->state->nbTrans > exec->transno + 1))) {
3326 xmlFARegExecSave(exec);
3327 if (exec->status != XML_REGEXP_OK)
3328 goto error;
3329 }
3330 if (trans->counter >= 0) {
3331 xmlRegCounterPtr counter;
3332
3333 /* make sure we don't go over the counter maximum value */
3334 if ((exec->counts == NULL) ||
3335 (exec->comp == NULL) ||
3336 (exec->comp->counters == NULL)) {
3337 exec->status = XML_REGEXP_INTERNAL_ERROR;
3338 goto error;
3339 }
3340 counter = &exec->comp->counters[trans->counter];
3341 if (exec->counts[trans->counter] >= counter->max)
3342 continue; /* for loop on transitions */
3343 exec->counts[trans->counter]++;
3344 }
3345 if ((trans->count >= 0) &&
3346 (trans->count < REGEXP_ALL_COUNTER)) {
3347 if (exec->counts == NULL) {
3348 exec->status = XML_REGEXP_INTERNAL_ERROR;
3349 goto error;
3350 }
3351 exec->counts[trans->count] = 0;
3352 }
3353 exec->state = comp->states[trans->to];
3354 exec->transno = 0;
3355 if (trans->atom != NULL) {
3356 exec->index += len;
3357 }
3358 goto progress;
3359 } else if (ret < 0) {
3360 exec->status = XML_REGEXP_INTERNAL_ERROR;
3361 break;
3362 }
3363 }
3364 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
3365rollback:
3366 /*
3367 * Failed to find a way out
3368 */
3369 exec->determinist = 0;
3370 xmlFARegExecRollBack(exec);
3371 }
3372progress:
3373 continue;
3374 }
3375error:
3376 if (exec->rollbacks != NULL) {
3377 if (exec->counts != NULL) {
3378 int i;
3379
3380 for (i = 0;i < exec->maxRollbacks;i++)
3381 if (exec->rollbacks[i].counts != NULL)
3382 xmlFree(exec->rollbacks[i].counts);
3383 }
3384 xmlFree(exec->rollbacks);
3385 }
3386 if (exec->state == NULL)
3387 return(XML_REGEXP_INTERNAL_ERROR);
3388 if (exec->counts != NULL)
3389 xmlFree(exec->counts);
3390 if (exec->status == XML_REGEXP_OK)
3391 return(1);
3392 if (exec->status == XML_REGEXP_NOT_FOUND)
3393 return(0);
3394 return(exec->status);
3395}
3396
3397/************************************************************************
3398 * *
3399 * Progressive interface to the verifier one atom at a time *
3400 * *
3401 ************************************************************************/
3402
3403/**
3404 * xmlRegNewExecCtxt:
3405 * @comp: a precompiled regular expression
3406 * @callback: a callback function used for handling progresses in the
3407 * automata matching phase
3408 * @data: the context data associated to the callback in this context
3409 *
3410 * Build a context used for progressive evaluation of a regexp.
3411 *
3412 * Returns the new context
3413 */
3414xmlRegExecCtxtPtr
3415xmlRegNewExecCtxt(xmlRegexpPtr comp, xmlRegExecCallbacks callback, void *data) {
3416 xmlRegExecCtxtPtr exec;
3417
3418 if (comp == NULL)
3419 return(NULL);
3420 if ((comp->compact == NULL) && (comp->states == NULL))
3421 return(NULL);
3422 exec = (xmlRegExecCtxtPtr) xmlMalloc(sizeof(xmlRegExecCtxt));
3423 if (exec == NULL)
3424 return(NULL);
3425 memset(exec, 0, sizeof(xmlRegExecCtxt));
3426 exec->inputString = NULL;
3427 exec->index = 0;
3428 exec->determinist = 1;
3429 exec->maxRollbacks = 0;
3430 exec->nbRollbacks = 0;
3431 exec->rollbacks = NULL;
3432 exec->status = XML_REGEXP_OK;
3433 exec->comp = comp;
3434 if (comp->compact == NULL)
3435 exec->state = comp->states[0];
3436 exec->transno = 0;
3437 exec->transcount = 0;
3438 exec->callback = callback;
3439 exec->data = data;
3440 if (comp->nbCounters > 0) {
3441 /*
3442 * For error handling, exec->counts is allocated twice the size
3443 * the second half is used to store the data in case of rollback
3444 */
3445 exec->counts = (int *) xmlMalloc(comp->nbCounters * sizeof(int)
3446 * 2);
3447 if (exec->counts == NULL) {
3448 xmlFree(exec);
3449 return(NULL);
3450 }
3451 memset(exec->counts, 0, comp->nbCounters * sizeof(int) * 2);
3452 exec->errCounts = &exec->counts[comp->nbCounters];
3453 } else {
3454 exec->counts = NULL;
3455 exec->errCounts = NULL;
3456 }
3457 exec->inputStackMax = 0;
3458 exec->inputStackNr = 0;
3459 exec->inputStack = NULL;
3460 exec->errStateNo = -1;
3461 exec->errString = NULL;
3462 exec->nbPush = 0;
3463 return(exec);
3464}
3465
3466/**
3467 * xmlRegFreeExecCtxt:
3468 * @exec: a regular expression evaluation context
3469 *
3470 * Free the structures associated to a regular expression evaluation context.
3471 */
3472void
3473xmlRegFreeExecCtxt(xmlRegExecCtxtPtr exec) {
3474 if (exec == NULL)
3475 return;
3476
3477 if (exec->rollbacks != NULL) {
3478 if (exec->counts != NULL) {
3479 int i;
3480
3481 for (i = 0;i < exec->maxRollbacks;i++)
3482 if (exec->rollbacks[i].counts != NULL)
3483 xmlFree(exec->rollbacks[i].counts);
3484 }
3485 xmlFree(exec->rollbacks);
3486 }
3487 if (exec->counts != NULL)
3488 xmlFree(exec->counts);
3489 if (exec->inputStack != NULL) {
3490 int i;
3491
3492 for (i = 0;i < exec->inputStackNr;i++) {
3493 if (exec->inputStack[i].value != NULL)
3494 xmlFree(exec->inputStack[i].value);
3495 }
3496 xmlFree(exec->inputStack);
3497 }
3498 if (exec->errString != NULL)
3499 xmlFree(exec->errString);
3500 xmlFree(exec);
3501}
3502
3503static int
3504xmlRegExecSetErrString(xmlRegExecCtxtPtr exec, const xmlChar *value) {
3505 if (exec->errString != NULL)
3506 xmlFree(exec->errString);
3507 if (value == NULL) {
3508 exec->errString = NULL;
3509 } else {
3510 exec->errString = xmlStrdup(value);
3511 if (exec->errString == NULL) {
3512 exec->status = XML_REGEXP_OUT_OF_MEMORY;
3513 return(-1);
3514 }
3515 }
3516 return(0);
3517}
3518
3519static void
3520xmlFARegExecSaveInputString(xmlRegExecCtxtPtr exec, const xmlChar *value,
3521 void *data) {
3522 if (exec->inputStackMax == 0) {
3523 exec->inputStackMax = 4;
3524 exec->inputStack = (xmlRegInputTokenPtr)
3525 xmlMalloc(exec->inputStackMax * sizeof(xmlRegInputToken));
3526 if (exec->inputStack == NULL) {
3527 exec->inputStackMax = 0;
3528 exec->status = XML_REGEXP_OUT_OF_MEMORY;
3529 return;
3530 }
3531 } else if (exec->inputStackNr + 1 >= exec->inputStackMax) {
3532 xmlRegInputTokenPtr tmp;
3533
3534 exec->inputStackMax *= 2;
3535 tmp = (xmlRegInputTokenPtr) xmlRealloc(exec->inputStack,
3536 exec->inputStackMax * sizeof(xmlRegInputToken));
3537 if (tmp == NULL) {
3538 exec->inputStackMax /= 2;
3539 exec->status = XML_REGEXP_OUT_OF_MEMORY;
3540 return;
3541 }
3542 exec->inputStack = tmp;
3543 }
3544 if (value == NULL) {
3545 exec->inputStack[exec->inputStackNr].value = NULL;
3546 } else {
3547 exec->inputStack[exec->inputStackNr].value = xmlStrdup(value);
3548 if (exec->inputStack[exec->inputStackNr].value == NULL) {
3549 exec->status = XML_REGEXP_OUT_OF_MEMORY;
3550 return;
3551 }
3552 }
3553 exec->inputStack[exec->inputStackNr].data = data;
3554 exec->inputStackNr++;
3555 exec->inputStack[exec->inputStackNr].value = NULL;
3556 exec->inputStack[exec->inputStackNr].data = NULL;
3557}
3558
3559/**
3560 * xmlRegStrEqualWildcard:
3561 * @expStr: the string to be evaluated
3562 * @valStr: the validation string
3563 *
3564 * Checks if both strings are equal or have the same content. "*"
3565 * can be used as a wildcard in @valStr; "|" is used as a separator of
3566 * substrings in both @expStr and @valStr.
3567 *
3568 * Returns 1 if the comparison is satisfied and the number of substrings
3569 * is equal, 0 otherwise.
3570 */
3571
3572static int
3573xmlRegStrEqualWildcard(const xmlChar *expStr, const xmlChar *valStr) {
3574 if (expStr == valStr) return(1);
3575 if (expStr == NULL) return(0);
3576 if (valStr == NULL) return(0);
3577 do {
3578 /*
3579 * Eval if we have a wildcard for the current item.
3580 */
3581 if (*expStr != *valStr) {
3582 /* if one of them starts with a wildcard make valStr be it */
3583 if (*valStr == '*') {
3584 const xmlChar *tmp;
3585
3586 tmp = valStr;
3587 valStr = expStr;
3588 expStr = tmp;
3589 }
3590 if ((*valStr != 0) && (*expStr != 0) && (*expStr++ == '*')) {
3591 do {
3592 if (*valStr == XML_REG_STRING_SEPARATOR)
3593 break;
3594 valStr++;
3595 } while (*valStr != 0);
3596 continue;
3597 } else
3598 return(0);
3599 }
3600 expStr++;
3601 valStr++;
3602 } while (*valStr != 0);
3603 if (*expStr != 0)
3604 return (0);
3605 else
3606 return (1);
3607}
3608
3609/**
3610 * xmlRegCompactPushString:
3611 * @exec: a regexp execution context
3612 * @comp: the precompiled exec with a compact table
3613 * @value: a string token input
3614 * @data: data associated to the token to reuse in callbacks
3615 *
3616 * Push one input token in the execution context
3617 *
3618 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
3619 * a negative value in case of error.
3620 */
3621static int
3622xmlRegCompactPushString(xmlRegExecCtxtPtr exec,
3623 xmlRegexpPtr comp,
3624 const xmlChar *value,
3625 void *data) {
3626 int state = exec->index;
3627 int i, target;
3628
3629 if ((comp == NULL) || (comp->compact == NULL) || (comp->stringMap == NULL))
3630 return(-1);
3631
3632 if (value == NULL) {
3633 /*
3634 * are we at a final state ?
3635 */
3636 if (comp->compact[state * (comp->nbstrings + 1)] ==
3637 XML_REGEXP_FINAL_STATE)
3638 return(1);
3639 return(0);
3640 }
3641
3642 /*
3643 * Examine all outside transitions from current state
3644 */
3645 for (i = 0;i < comp->nbstrings;i++) {
3646 target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
3647 if ((target > 0) && (target <= comp->nbstates)) {
3648 target--; /* to avoid 0 */
3649 if (xmlRegStrEqualWildcard(comp->stringMap[i], value)) {
3650 exec->index = target;
3651 if ((exec->callback != NULL) && (comp->transdata != NULL)) {
3652 exec->callback(exec->data, value,
3653 comp->transdata[state * comp->nbstrings + i], data);
3654 }
3655 if (comp->compact[target * (comp->nbstrings + 1)] ==
3656 XML_REGEXP_SINK_STATE)
3657 goto error;
3658
3659 if (comp->compact[target * (comp->nbstrings + 1)] ==
3660 XML_REGEXP_FINAL_STATE)
3661 return(1);
3662 return(0);
3663 }
3664 }
3665 }
3666 /*
3667 * Failed to find an exit transition out from current state for the
3668 * current token
3669 */
3670error:
3671 exec->errStateNo = state;
3672 exec->status = XML_REGEXP_NOT_FOUND;
3673 xmlRegExecSetErrString(exec, value);
3674 return(exec->status);
3675}
3676
3677/**
3678 * xmlRegExecPushStringInternal:
3679 * @exec: a regexp execution context or NULL to indicate the end
3680 * @value: a string token input
3681 * @data: data associated to the token to reuse in callbacks
3682 * @compound: value was assembled from 2 strings
3683 *
3684 * Push one input token in the execution context
3685 *
3686 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
3687 * a negative value in case of error.
3688 */
3689static int
3690xmlRegExecPushStringInternal(xmlRegExecCtxtPtr exec, const xmlChar *value,
3691 void *data, int compound) {
3692 xmlRegTransPtr trans;
3693 xmlRegAtomPtr atom;
3694 int ret;
3695 int final = 0;
3696 int progress = 1;
3697
3698 if (exec == NULL)
3699 return(-1);
3700 if (exec->comp == NULL)
3701 return(-1);
3702 if (exec->status != XML_REGEXP_OK)
3703 return(exec->status);
3704
3705 if (exec->comp->compact != NULL)
3706 return(xmlRegCompactPushString(exec, exec->comp, value, data));
3707
3708 if (value == NULL) {
3709 if (exec->state->type == XML_REGEXP_FINAL_STATE)
3710 return(1);
3711 final = 1;
3712 }
3713
3714 /*
3715 * If we have an active rollback stack push the new value there
3716 * and get back to where we were left
3717 */
3718 if ((value != NULL) && (exec->inputStackNr > 0)) {
3719 xmlFARegExecSaveInputString(exec, value, data);
3720 value = exec->inputStack[exec->index].value;
3721 data = exec->inputStack[exec->index].data;
3722 }
3723
3724 while ((exec->status == XML_REGEXP_OK) &&
3725 ((value != NULL) ||
3726 ((final == 1) &&
3727 (exec->state->type != XML_REGEXP_FINAL_STATE)))) {
3728
3729 /*
3730 * End of input on non-terminal state, rollback, however we may
3731 * still have epsilon like transition for counted transitions
3732 * on counters, in that case don't break too early.
3733 */
3734 if ((value == NULL) && (exec->counts == NULL))
3735 goto rollback;
3736
3737 exec->transcount = 0;
3738 for (;exec->transno < exec->state->nbTrans;exec->transno++) {
3739 trans = &exec->state->trans[exec->transno];
3740 if (trans->to < 0)
3741 continue;
3742 atom = trans->atom;
3743 ret = 0;
3744 if (trans->count == REGEXP_ALL_LAX_COUNTER) {
3745 int i;
3746 int count;
3747 xmlRegTransPtr t;
3748 xmlRegCounterPtr counter;
3749
3750 ret = 0;
3751
3752 /*
3753 * Check all counted transitions from the current state
3754 */
3755 if ((value == NULL) && (final)) {
3756 ret = 1;
3757 } else if (value != NULL) {
3758 for (i = 0;i < exec->state->nbTrans;i++) {
3759 t = &exec->state->trans[i];
3760 if ((t->counter < 0) || (t == trans))
3761 continue;
3762 counter = &exec->comp->counters[t->counter];
3763 count = exec->counts[t->counter];
3764 if ((count < counter->max) &&
3765 (t->atom != NULL) &&
3766 (xmlStrEqual(value, t->atom->valuep))) {
3767 ret = 0;
3768 break;
3769 }
3770 if ((count >= counter->min) &&
3771 (count < counter->max) &&
3772 (t->atom != NULL) &&
3773 (xmlStrEqual(value, t->atom->valuep))) {
3774 ret = 1;
3775 break;
3776 }
3777 }
3778 }
3779 } else if (trans->count == REGEXP_ALL_COUNTER) {
3780 int i;
3781 int count;
3782 xmlRegTransPtr t;
3783 xmlRegCounterPtr counter;
3784
3785 ret = 1;
3786
3787 /*
3788 * Check all counted transitions from the current state
3789 */
3790 for (i = 0;i < exec->state->nbTrans;i++) {
3791 t = &exec->state->trans[i];
3792 if ((t->counter < 0) || (t == trans))
3793 continue;
3794 counter = &exec->comp->counters[t->counter];
3795 count = exec->counts[t->counter];
3796 if ((count < counter->min) || (count > counter->max)) {
3797 ret = 0;
3798 break;
3799 }
3800 }
3801 } else if (trans->count >= 0) {
3802 int count;
3803 xmlRegCounterPtr counter;
3804
3805 /*
3806 * A counted transition.
3807 */
3808
3809 count = exec->counts[trans->count];
3810 counter = &exec->comp->counters[trans->count];
3811 ret = ((count >= counter->min) && (count <= counter->max));
3812 } else if (atom == NULL) {
3813 fprintf(stderr, "epsilon transition left at runtime\n");
3814 exec->status = XML_REGEXP_INTERNAL_ERROR;
3815 break;
3816 } else if (value != NULL) {
3817 ret = xmlRegStrEqualWildcard(atom->valuep, value);
3818 if (atom->neg) {
3819 ret = !ret;
3820 if (!compound)
3821 ret = 0;
3822 }
3823 if ((ret == 1) && (trans->counter >= 0)) {
3824 xmlRegCounterPtr counter;
3825 int count;
3826
3827 count = exec->counts[trans->counter];
3828 counter = &exec->comp->counters[trans->counter];
3829 if (count >= counter->max)
3830 ret = 0;
3831 }
3832
3833 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
3834 xmlRegStatePtr to = exec->comp->states[trans->to];
3835
3836 /*
3837 * this is a multiple input sequence
3838 */
3839 if (exec->state->nbTrans > exec->transno + 1) {
3840 if (exec->inputStackNr <= 0) {
3841 xmlFARegExecSaveInputString(exec, value, data);
3842 }
3843 xmlFARegExecSave(exec);
3844 }
3845 exec->transcount = 1;
3846 do {
3847 /*
3848 * Try to progress as much as possible on the input
3849 */
3850 if (exec->transcount == atom->max) {
3851 break;
3852 }
3853 exec->index++;
3854 value = exec->inputStack[exec->index].value;
3855 data = exec->inputStack[exec->index].data;
3856
3857 /*
3858 * End of input: stop here
3859 */
3860 if (value == NULL) {
3861 exec->index --;
3862 break;
3863 }
3864 if (exec->transcount >= atom->min) {
3865 int transno = exec->transno;
3866 xmlRegStatePtr state = exec->state;
3867
3868 /*
3869 * The transition is acceptable save it
3870 */
3871 exec->transno = -1; /* trick */
3872 exec->state = to;
3873 if (exec->inputStackNr <= 0) {
3874 xmlFARegExecSaveInputString(exec, value, data);
3875 }
3876 xmlFARegExecSave(exec);
3877 exec->transno = transno;
3878 exec->state = state;
3879 }
3880 ret = xmlStrEqual(value, atom->valuep);
3881 exec->transcount++;
3882 } while (ret == 1);
3883 if (exec->transcount < atom->min)
3884 ret = 0;
3885
3886 /*
3887 * If the last check failed but one transition was found
3888 * possible, rollback
3889 */
3890 if (ret < 0)
3891 ret = 0;
3892 if (ret == 0) {
3893 goto rollback;
3894 }
3895 }
3896 }
3897 if (ret == 1) {
3898 if ((exec->callback != NULL) && (atom != NULL) &&
3899 (data != NULL)) {
3900 exec->callback(exec->data, atom->valuep,
3901 atom->data, data);
3902 }
3903 if (exec->state->nbTrans > exec->transno + 1) {
3904 if (exec->inputStackNr <= 0) {
3905 xmlFARegExecSaveInputString(exec, value, data);
3906 }
3907 xmlFARegExecSave(exec);
3908 }
3909 if (trans->counter >= 0) {
3910 exec->counts[trans->counter]++;
3911 }
3912 if ((trans->count >= 0) &&
3913 (trans->count < REGEXP_ALL_COUNTER)) {
3914 exec->counts[trans->count] = 0;
3915 }
3916 if ((exec->comp->states[trans->to] != NULL) &&
3917 (exec->comp->states[trans->to]->type ==
3918 XML_REGEXP_SINK_STATE)) {
3919 /*
3920 * entering a sink state, save the current state as error
3921 * state.
3922 */
3923 if (xmlRegExecSetErrString(exec, value) < 0)
3924 break;
3925 exec->errState = exec->state;
3926 memcpy(exec->errCounts, exec->counts,
3927 exec->comp->nbCounters * sizeof(int));
3928 }
3929 exec->state = exec->comp->states[trans->to];
3930 exec->transno = 0;
3931 if (trans->atom != NULL) {
3932 if (exec->inputStack != NULL) {
3933 exec->index++;
3934 if (exec->index < exec->inputStackNr) {
3935 value = exec->inputStack[exec->index].value;
3936 data = exec->inputStack[exec->index].data;
3937 } else {
3938 value = NULL;
3939 data = NULL;
3940 }
3941 } else {
3942 value = NULL;
3943 data = NULL;
3944 }
3945 }
3946 goto progress;
3947 } else if (ret < 0) {
3948 exec->status = XML_REGEXP_INTERNAL_ERROR;
3949 break;
3950 }
3951 }
3952 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
3953rollback:
3954 /*
3955 * if we didn't yet rollback on the current input
3956 * store the current state as the error state.
3957 */
3958 if ((progress) && (exec->state != NULL) &&
3959 (exec->state->type != XML_REGEXP_SINK_STATE)) {
3960 progress = 0;
3961 if (xmlRegExecSetErrString(exec, value) < 0)
3962 break;
3963 exec->errState = exec->state;
3964 if (exec->comp->nbCounters)
3965 memcpy(exec->errCounts, exec->counts,
3966 exec->comp->nbCounters * sizeof(int));
3967 }
3968
3969 /*
3970 * Failed to find a way out
3971 */
3972 exec->determinist = 0;
3973 xmlFARegExecRollBack(exec);
3974 if ((exec->inputStack != NULL ) &&
3975 (exec->status == XML_REGEXP_OK)) {
3976 value = exec->inputStack[exec->index].value;
3977 data = exec->inputStack[exec->index].data;
3978 }
3979 }
3980 continue;
3981progress:
3982 progress = 1;
3983 continue;
3984 }
3985 if (exec->status == XML_REGEXP_OK) {
3986 return(exec->state->type == XML_REGEXP_FINAL_STATE);
3987 }
3988 return(exec->status);
3989}
3990
3991/**
3992 * xmlRegExecPushString:
3993 * @exec: a regexp execution context or NULL to indicate the end
3994 * @value: a string token input
3995 * @data: data associated to the token to reuse in callbacks
3996 *
3997 * Push one input token in the execution context
3998 *
3999 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
4000 * a negative value in case of error.
4001 */
4002int
4003xmlRegExecPushString(xmlRegExecCtxtPtr exec, const xmlChar *value,
4004 void *data) {
4005 return(xmlRegExecPushStringInternal(exec, value, data, 0));
4006}
4007
4008/**
4009 * xmlRegExecPushString2:
4010 * @exec: a regexp execution context or NULL to indicate the end
4011 * @value: the first string token input
4012 * @value2: the second string token input
4013 * @data: data associated to the token to reuse in callbacks
4014 *
4015 * Push one input token in the execution context
4016 *
4017 * Returns: 1 if the regexp reached a final state, 0 if non-final, and
4018 * a negative value in case of error.
4019 */
4020int
4021xmlRegExecPushString2(xmlRegExecCtxtPtr exec, const xmlChar *value,
4022 const xmlChar *value2, void *data) {
4023 xmlChar buf[150];
4024 int lenn, lenp, ret;
4025 xmlChar *str;
4026
4027 if (exec == NULL)
4028 return(-1);
4029 if (exec->comp == NULL)
4030 return(-1);
4031 if (exec->status != XML_REGEXP_OK)
4032 return(exec->status);
4033
4034 if (value2 == NULL)
4035 return(xmlRegExecPushString(exec, value, data));
4036
4037 lenn = strlen((char *) value2);
4038 lenp = strlen((char *) value);
4039
4040 if (150 < lenn + lenp + 2) {
4041 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
4042 if (str == NULL) {
4043 exec->status = XML_REGEXP_OUT_OF_MEMORY;
4044 return(-1);
4045 }
4046 } else {
4047 str = buf;
4048 }
4049 memcpy(&str[0], value, lenp);
4050 str[lenp] = XML_REG_STRING_SEPARATOR;
4051 memcpy(&str[lenp + 1], value2, lenn);
4052 str[lenn + lenp + 1] = 0;
4053
4054 if (exec->comp->compact != NULL)
4055 ret = xmlRegCompactPushString(exec, exec->comp, str, data);
4056 else
4057 ret = xmlRegExecPushStringInternal(exec, str, data, 1);
4058
4059 if (str != buf)
4060 xmlFree(str);
4061 return(ret);
4062}
4063
4064/**
4065 * xmlRegExecGetValues:
4066 * @exec: a regexp execution context
4067 * @err: error extraction or normal one
4068 * @nbval: pointer to the number of accepted values IN/OUT
4069 * @nbneg: return number of negative transitions
4070 * @values: pointer to the array of acceptable values
4071 * @terminal: return value if this was a terminal state
4072 *
4073 * Extract information from the regexp execution, internal routine to
4074 * implement xmlRegExecNextValues() and xmlRegExecErrInfo()
4075 *
4076 * Returns: 0 in case of success or -1 in case of error.
4077 */
4078static int
4079xmlRegExecGetValues(xmlRegExecCtxtPtr exec, int err,
4080 int *nbval, int *nbneg,
4081 xmlChar **values, int *terminal) {
4082 int maxval;
4083 int nb = 0;
4084
4085 if ((exec == NULL) || (nbval == NULL) || (nbneg == NULL) ||
4086 (values == NULL) || (*nbval <= 0))
4087 return(-1);
4088
4089 maxval = *nbval;
4090 *nbval = 0;
4091 *nbneg = 0;
4092 if ((exec->comp != NULL) && (exec->comp->compact != NULL)) {
4093 xmlRegexpPtr comp;
4094 int target, i, state;
4095
4096 comp = exec->comp;
4097
4098 if (err) {
4099 if (exec->errStateNo == -1) return(-1);
4100 state = exec->errStateNo;
4101 } else {
4102 state = exec->index;
4103 }
4104 if (terminal != NULL) {
4105 if (comp->compact[state * (comp->nbstrings + 1)] ==
4106 XML_REGEXP_FINAL_STATE)
4107 *terminal = 1;
4108 else
4109 *terminal = 0;
4110 }
4111 for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) {
4112 target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
4113 if ((target > 0) && (target <= comp->nbstates) &&
4114 (comp->compact[(target - 1) * (comp->nbstrings + 1)] !=
4115 XML_REGEXP_SINK_STATE)) {
4116 values[nb++] = comp->stringMap[i];
4117 (*nbval)++;
4118 }
4119 }
4120 for (i = 0;(i < comp->nbstrings) && (nb < maxval);i++) {
4121 target = comp->compact[state * (comp->nbstrings + 1) + i + 1];
4122 if ((target > 0) && (target <= comp->nbstates) &&
4123 (comp->compact[(target - 1) * (comp->nbstrings + 1)] ==
4124 XML_REGEXP_SINK_STATE)) {
4125 values[nb++] = comp->stringMap[i];
4126 (*nbneg)++;
4127 }
4128 }
4129 } else {
4130 int transno;
4131 xmlRegTransPtr trans;
4132 xmlRegAtomPtr atom;
4133 xmlRegStatePtr state;
4134
4135 if (terminal != NULL) {
4136 if (exec->state->type == XML_REGEXP_FINAL_STATE)
4137 *terminal = 1;
4138 else
4139 *terminal = 0;
4140 }
4141
4142 if (err) {
4143 if (exec->errState == NULL) return(-1);
4144 state = exec->errState;
4145 } else {
4146 if (exec->state == NULL) return(-1);
4147 state = exec->state;
4148 }
4149 for (transno = 0;
4150 (transno < state->nbTrans) && (nb < maxval);
4151 transno++) {
4152 trans = &state->trans[transno];
4153 if (trans->to < 0)
4154 continue;
4155 atom = trans->atom;
4156 if ((atom == NULL) || (atom->valuep == NULL))
4157 continue;
4158 if (trans->count == REGEXP_ALL_LAX_COUNTER) {
4159 /* this should not be reached but ... */
4160 } else if (trans->count == REGEXP_ALL_COUNTER) {
4161 /* this should not be reached but ... */
4162 } else if (trans->counter >= 0) {
4163 xmlRegCounterPtr counter = NULL;
4164 int count;
4165
4166 if (err)
4167 count = exec->errCounts[trans->counter];
4168 else
4169 count = exec->counts[trans->counter];
4170 if (exec->comp != NULL)
4171 counter = &exec->comp->counters[trans->counter];
4172 if ((counter == NULL) || (count < counter->max)) {
4173 if (atom->neg)
4174 values[nb++] = (xmlChar *) atom->valuep2;
4175 else
4176 values[nb++] = (xmlChar *) atom->valuep;
4177 (*nbval)++;
4178 }
4179 } else {
4180 if ((exec->comp != NULL) && (exec->comp->states[trans->to] != NULL) &&
4181 (exec->comp->states[trans->to]->type !=
4182 XML_REGEXP_SINK_STATE)) {
4183 if (atom->neg)
4184 values[nb++] = (xmlChar *) atom->valuep2;
4185 else
4186 values[nb++] = (xmlChar *) atom->valuep;
4187 (*nbval)++;
4188 }
4189 }
4190 }
4191 for (transno = 0;
4192 (transno < state->nbTrans) && (nb < maxval);
4193 transno++) {
4194 trans = &state->trans[transno];
4195 if (trans->to < 0)
4196 continue;
4197 atom = trans->atom;
4198 if ((atom == NULL) || (atom->valuep == NULL))
4199 continue;
4200 if (trans->count == REGEXP_ALL_LAX_COUNTER) {
4201 continue;
4202 } else if (trans->count == REGEXP_ALL_COUNTER) {
4203 continue;
4204 } else if (trans->counter >= 0) {
4205 continue;
4206 } else {
4207 if ((exec->comp->states[trans->to] != NULL) &&
4208 (exec->comp->states[trans->to]->type ==
4209 XML_REGEXP_SINK_STATE)) {
4210 if (atom->neg)
4211 values[nb++] = (xmlChar *) atom->valuep2;
4212 else
4213 values[nb++] = (xmlChar *) atom->valuep;
4214 (*nbneg)++;
4215 }
4216 }
4217 }
4218 }
4219 return(0);
4220}
4221
4222/**
4223 * xmlRegExecNextValues:
4224 * @exec: a regexp execution context
4225 * @nbval: pointer to the number of accepted values IN/OUT
4226 * @nbneg: return number of negative transitions
4227 * @values: pointer to the array of acceptable values
4228 * @terminal: return value if this was a terminal state
4229 *
4230 * Extract information from the regexp execution,
4231 * the parameter @values must point to an array of @nbval string pointers
4232 * on return nbval will contain the number of possible strings in that
4233 * state and the @values array will be updated with them. The string values
4234 * returned will be freed with the @exec context and don't need to be
4235 * deallocated.
4236 *
4237 * Returns: 0 in case of success or -1 in case of error.
4238 */
4239int
4240xmlRegExecNextValues(xmlRegExecCtxtPtr exec, int *nbval, int *nbneg,
4241 xmlChar **values, int *terminal) {
4242 return(xmlRegExecGetValues(exec, 0, nbval, nbneg, values, terminal));
4243}
4244
4245/**
4246 * xmlRegExecErrInfo:
4247 * @exec: a regexp execution context generating an error
4248 * @string: return value for the error string
4249 * @nbval: pointer to the number of accepted values IN/OUT
4250 * @nbneg: return number of negative transitions
4251 * @values: pointer to the array of acceptable values
4252 * @terminal: return value if this was a terminal state
4253 *
4254 * Extract error information from the regexp execution, the parameter
4255 * @string will be updated with the value pushed and not accepted,
4256 * the parameter @values must point to an array of @nbval string pointers
4257 * on return nbval will contain the number of possible strings in that
4258 * state and the @values array will be updated with them. The string values
4259 * returned will be freed with the @exec context and don't need to be
4260 * deallocated.
4261 *
4262 * Returns: 0 in case of success or -1 in case of error.
4263 */
4264int
4265xmlRegExecErrInfo(xmlRegExecCtxtPtr exec, const xmlChar **string,
4266 int *nbval, int *nbneg, xmlChar **values, int *terminal) {
4267 if (exec == NULL)
4268 return(-1);
4269 if (string != NULL) {
4270 if (exec->status != XML_REGEXP_OK)
4271 *string = exec->errString;
4272 else
4273 *string = NULL;
4274 }
4275 return(xmlRegExecGetValues(exec, 1, nbval, nbneg, values, terminal));
4276}
4277
4278#if 0
4279static int
4280xmlRegExecPushChar(xmlRegExecCtxtPtr exec, int UCS) {
4281 xmlRegTransPtr trans;
4282 xmlRegAtomPtr atom;
4283 int ret;
4284 int codepoint, len;
4285
4286 if (exec == NULL)
4287 return(-1);
4288 if (exec->status != XML_REGEXP_OK)
4289 return(exec->status);
4290
4291 while ((exec->status == XML_REGEXP_OK) &&
4292 ((exec->inputString[exec->index] != 0) ||
4293 (exec->state->type != XML_REGEXP_FINAL_STATE))) {
4294
4295 /*
4296 * End of input on non-terminal state, rollback, however we may
4297 * still have epsilon like transition for counted transitions
4298 * on counters, in that case don't break too early.
4299 */
4300 if ((exec->inputString[exec->index] == 0) && (exec->counts == NULL))
4301 goto rollback;
4302
4303 exec->transcount = 0;
4304 for (;exec->transno < exec->state->nbTrans;exec->transno++) {
4305 trans = &exec->state->trans[exec->transno];
4306 if (trans->to < 0)
4307 continue;
4308 atom = trans->atom;
4309 ret = 0;
4310 if (trans->count >= 0) {
4311 int count;
4312 xmlRegCounterPtr counter;
4313
4314 /*
4315 * A counted transition.
4316 */
4317
4318 count = exec->counts[trans->count];
4319 counter = &exec->comp->counters[trans->count];
4320 ret = ((count >= counter->min) && (count <= counter->max));
4321 } else if (atom == NULL) {
4322 fprintf(stderr, "epsilon transition left at runtime\n");
4323 exec->status = XML_REGEXP_INTERNAL_ERROR;
4324 break;
4325 } else if (exec->inputString[exec->index] != 0) {
4326 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]), len);
4327 ret = xmlRegCheckCharacter(atom, codepoint);
4328 if ((ret == 1) && (atom->min > 0) && (atom->max > 0)) {
4329 xmlRegStatePtr to = exec->comp->states[trans->to];
4330
4331 /*
4332 * this is a multiple input sequence
4333 */
4334 if (exec->state->nbTrans > exec->transno + 1) {
4335 xmlFARegExecSave(exec);
4336 }
4337 exec->transcount = 1;
4338 do {
4339 /*
4340 * Try to progress as much as possible on the input
4341 */
4342 if (exec->transcount == atom->max) {
4343 break;
4344 }
4345 exec->index += len;
4346 /*
4347 * End of input: stop here
4348 */
4349 if (exec->inputString[exec->index] == 0) {
4350 exec->index -= len;
4351 break;
4352 }
4353 if (exec->transcount >= atom->min) {
4354 int transno = exec->transno;
4355 xmlRegStatePtr state = exec->state;
4356
4357 /*
4358 * The transition is acceptable save it
4359 */
4360 exec->transno = -1; /* trick */
4361 exec->state = to;
4362 xmlFARegExecSave(exec);
4363 exec->transno = transno;
4364 exec->state = state;
4365 }
4366 codepoint = CUR_SCHAR(&(exec->inputString[exec->index]),
4367 len);
4368 ret = xmlRegCheckCharacter(atom, codepoint);
4369 exec->transcount++;
4370 } while (ret == 1);
4371 if (exec->transcount < atom->min)
4372 ret = 0;
4373
4374 /*
4375 * If the last check failed but one transition was found
4376 * possible, rollback
4377 */
4378 if (ret < 0)
4379 ret = 0;
4380 if (ret == 0) {
4381 goto rollback;
4382 }
4383 }
4384 }
4385 if (ret == 1) {
4386 if (exec->state->nbTrans > exec->transno + 1) {
4387 xmlFARegExecSave(exec);
4388 }
4389 /*
4390 * restart count for expressions like this ((abc){2})*
4391 */
4392 if (trans->count >= 0) {
4393 exec->counts[trans->count] = 0;
4394 }
4395 if (trans->counter >= 0) {
4396 exec->counts[trans->counter]++;
4397 }
4398 exec->state = exec->comp->states[trans->to];
4399 exec->transno = 0;
4400 if (trans->atom != NULL) {
4401 exec->index += len;
4402 }
4403 goto progress;
4404 } else if (ret < 0) {
4405 exec->status = XML_REGEXP_INTERNAL_ERROR;
4406 break;
4407 }
4408 }
4409 if ((exec->transno != 0) || (exec->state->nbTrans == 0)) {
4410rollback:
4411 /*
4412 * Failed to find a way out
4413 */
4414 exec->determinist = 0;
4415 xmlFARegExecRollBack(exec);
4416 }
4417progress:
4418 continue;
4419 }
4420}
4421#endif
4422/************************************************************************
4423 * *
4424 * Parser for the Schemas Datatype Regular Expressions *
4425 * http://www.w3.org/TR/2001/REC-xmlschema-2-20010502/#regexs *
4426 * *
4427 ************************************************************************/
4428
4429/**
4430 * xmlFAIsChar:
4431 * @ctxt: a regexp parser context
4432 *
4433 * [10] Char ::= [^.\?*+()|#x5B#x5D]
4434 */
4435static int
4436xmlFAIsChar(xmlRegParserCtxtPtr ctxt) {
4437 int cur;
4438 int len;
4439
4440 len = 4;
4441 cur = xmlGetUTF8Char(ctxt->cur, &len);
4442 if (cur < 0) {
4443 ERROR("Invalid UTF-8");
4444 return(0);
4445 }
4446 if ((cur == '.') || (cur == '\\') || (cur == '?') ||
4447 (cur == '*') || (cur == '+') || (cur == '(') ||
4448 (cur == ')') || (cur == '|') || (cur == 0x5B) ||
4449 (cur == 0x5D) || (cur == 0))
4450 return(-1);
4451 return(cur);
4452}
4453
4454/**
4455 * xmlFAParseCharProp:
4456 * @ctxt: a regexp parser context
4457 *
4458 * [27] charProp ::= IsCategory | IsBlock
4459 * [28] IsCategory ::= Letters | Marks | Numbers | Punctuation |
4460 * Separators | Symbols | Others
4461 * [29] Letters ::= 'L' [ultmo]?
4462 * [30] Marks ::= 'M' [nce]?
4463 * [31] Numbers ::= 'N' [dlo]?
4464 * [32] Punctuation ::= 'P' [cdseifo]?
4465 * [33] Separators ::= 'Z' [slp]?
4466 * [34] Symbols ::= 'S' [mcko]?
4467 * [35] Others ::= 'C' [cfon]?
4468 * [36] IsBlock ::= 'Is' [a-zA-Z0-9#x2D]+
4469 */
4470static void
4471xmlFAParseCharProp(xmlRegParserCtxtPtr ctxt) {
4472 int cur;
4473 xmlRegAtomType type = (xmlRegAtomType) 0;
4474 xmlChar *blockName = NULL;
4475
4476 cur = CUR;
4477 if (cur == 'L') {
4478 NEXT;
4479 cur = CUR;
4480 if (cur == 'u') {
4481 NEXT;
4482 type = XML_REGEXP_LETTER_UPPERCASE;
4483 } else if (cur == 'l') {
4484 NEXT;
4485 type = XML_REGEXP_LETTER_LOWERCASE;
4486 } else if (cur == 't') {
4487 NEXT;
4488 type = XML_REGEXP_LETTER_TITLECASE;
4489 } else if (cur == 'm') {
4490 NEXT;
4491 type = XML_REGEXP_LETTER_MODIFIER;
4492 } else if (cur == 'o') {
4493 NEXT;
4494 type = XML_REGEXP_LETTER_OTHERS;
4495 } else {
4496 type = XML_REGEXP_LETTER;
4497 }
4498 } else if (cur == 'M') {
4499 NEXT;
4500 cur = CUR;
4501 if (cur == 'n') {
4502 NEXT;
4503 /* nonspacing */
4504 type = XML_REGEXP_MARK_NONSPACING;
4505 } else if (cur == 'c') {
4506 NEXT;
4507 /* spacing combining */
4508 type = XML_REGEXP_MARK_SPACECOMBINING;
4509 } else if (cur == 'e') {
4510 NEXT;
4511 /* enclosing */
4512 type = XML_REGEXP_MARK_ENCLOSING;
4513 } else {
4514 /* all marks */
4515 type = XML_REGEXP_MARK;
4516 }
4517 } else if (cur == 'N') {
4518 NEXT;
4519 cur = CUR;
4520 if (cur == 'd') {
4521 NEXT;
4522 /* digital */
4523 type = XML_REGEXP_NUMBER_DECIMAL;
4524 } else if (cur == 'l') {
4525 NEXT;
4526 /* letter */
4527 type = XML_REGEXP_NUMBER_LETTER;
4528 } else if (cur == 'o') {
4529 NEXT;
4530 /* other */
4531 type = XML_REGEXP_NUMBER_OTHERS;
4532 } else {
4533 /* all numbers */
4534 type = XML_REGEXP_NUMBER;
4535 }
4536 } else if (cur == 'P') {
4537 NEXT;
4538 cur = CUR;
4539 if (cur == 'c') {
4540 NEXT;
4541 /* connector */
4542 type = XML_REGEXP_PUNCT_CONNECTOR;
4543 } else if (cur == 'd') {
4544 NEXT;
4545 /* dash */
4546 type = XML_REGEXP_PUNCT_DASH;
4547 } else if (cur == 's') {
4548 NEXT;
4549 /* open */
4550 type = XML_REGEXP_PUNCT_OPEN;
4551 } else if (cur == 'e') {
4552 NEXT;
4553 /* close */
4554 type = XML_REGEXP_PUNCT_CLOSE;
4555 } else if (cur == 'i') {
4556 NEXT;
4557 /* initial quote */
4558 type = XML_REGEXP_PUNCT_INITQUOTE;
4559 } else if (cur == 'f') {
4560 NEXT;
4561 /* final quote */
4562 type = XML_REGEXP_PUNCT_FINQUOTE;
4563 } else if (cur == 'o') {
4564 NEXT;
4565 /* other */
4566 type = XML_REGEXP_PUNCT_OTHERS;
4567 } else {
4568 /* all punctuation */
4569 type = XML_REGEXP_PUNCT;
4570 }
4571 } else if (cur == 'Z') {
4572 NEXT;
4573 cur = CUR;
4574 if (cur == 's') {
4575 NEXT;
4576 /* space */
4577 type = XML_REGEXP_SEPAR_SPACE;
4578 } else if (cur == 'l') {
4579 NEXT;
4580 /* line */
4581 type = XML_REGEXP_SEPAR_LINE;
4582 } else if (cur == 'p') {
4583 NEXT;
4584 /* paragraph */
4585 type = XML_REGEXP_SEPAR_PARA;
4586 } else {
4587 /* all separators */
4588 type = XML_REGEXP_SEPAR;
4589 }
4590 } else if (cur == 'S') {
4591 NEXT;
4592 cur = CUR;
4593 if (cur == 'm') {
4594 NEXT;
4595 type = XML_REGEXP_SYMBOL_MATH;
4596 /* math */
4597 } else if (cur == 'c') {
4598 NEXT;
4599 type = XML_REGEXP_SYMBOL_CURRENCY;
4600 /* currency */
4601 } else if (cur == 'k') {
4602 NEXT;
4603 type = XML_REGEXP_SYMBOL_MODIFIER;
4604 /* modifiers */
4605 } else if (cur == 'o') {
4606 NEXT;
4607 type = XML_REGEXP_SYMBOL_OTHERS;
4608 /* other */
4609 } else {
4610 /* all symbols */
4611 type = XML_REGEXP_SYMBOL;
4612 }
4613 } else if (cur == 'C') {
4614 NEXT;
4615 cur = CUR;
4616 if (cur == 'c') {
4617 NEXT;
4618 /* control */
4619 type = XML_REGEXP_OTHER_CONTROL;
4620 } else if (cur == 'f') {
4621 NEXT;
4622 /* format */
4623 type = XML_REGEXP_OTHER_FORMAT;
4624 } else if (cur == 'o') {
4625 NEXT;
4626 /* private use */
4627 type = XML_REGEXP_OTHER_PRIVATE;
4628 } else if (cur == 'n') {
4629 NEXT;
4630 /* not assigned */
4631 type = XML_REGEXP_OTHER_NA;
4632 } else {
4633 /* all others */
4634 type = XML_REGEXP_OTHER;
4635 }
4636 } else if (cur == 'I') {
4637 const xmlChar *start;
4638 NEXT;
4639 cur = CUR;
4640 if (cur != 's') {
4641 ERROR("IsXXXX expected");
4642 return;
4643 }
4644 NEXT;
4645 start = ctxt->cur;
4646 cur = CUR;
4647 if (((cur >= 'a') && (cur <= 'z')) ||
4648 ((cur >= 'A') && (cur <= 'Z')) ||
4649 ((cur >= '0') && (cur <= '9')) ||
4650 (cur == 0x2D)) {
4651 NEXT;
4652 cur = CUR;
4653 while (((cur >= 'a') && (cur <= 'z')) ||
4654 ((cur >= 'A') && (cur <= 'Z')) ||
4655 ((cur >= '0') && (cur <= '9')) ||
4656 (cur == 0x2D)) {
4657 NEXT;
4658 cur = CUR;
4659 }
4660 }
4661 type = XML_REGEXP_BLOCK_NAME;
4662 blockName = xmlStrndup(start, ctxt->cur - start);
4663 if (blockName == NULL)
4664 xmlRegexpErrMemory(ctxt);
4665 } else {
4666 ERROR("Unknown char property");
4667 return;
4668 }
4669 if (ctxt->atom == NULL) {
4670 ctxt->atom = xmlRegNewAtom(ctxt, type);
4671 if (ctxt->atom == NULL) {
4672 xmlFree(blockName);
4673 return;
4674 }
4675 ctxt->atom->valuep = blockName;
4676 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4677 if (xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4678 type, 0, 0, blockName) == NULL) {
4679 xmlFree(blockName);
4680 }
4681 }
4682}
4683
4684static int parse_escaped_codeunit(xmlRegParserCtxtPtr ctxt)
4685{
4686 int val = 0, i, cur;
4687 for (i = 0; i < 4; i++) {
4688 NEXT;
4689 val *= 16;
4690 cur = CUR;
4691 if (cur >= '0' && cur <= '9') {
4692 val += cur - '0';
4693 } else if (cur >= 'A' && cur <= 'F') {
4694 val += cur - 'A' + 10;
4695 } else if (cur >= 'a' && cur <= 'f') {
4696 val += cur - 'a' + 10;
4697 } else {
4698 ERROR("Expecting hex digit");
4699 return -1;
4700 }
4701 }
4702 return val;
4703}
4704
4705static int parse_escaped_codepoint(xmlRegParserCtxtPtr ctxt)
4706{
4707 int val = parse_escaped_codeunit(ctxt);
4708 if (0xD800 <= val && val <= 0xDBFF) {
4709 NEXT;
4710 if (CUR == '\\') {
4711 NEXT;
4712 if (CUR == 'u') {
4713 int low = parse_escaped_codeunit(ctxt);
4714 if (0xDC00 <= low && low <= 0xDFFF) {
4715 return (val - 0xD800) * 0x400 + (low - 0xDC00) + 0x10000;
4716 }
4717 }
4718 }
4719 ERROR("Invalid low surrogate pair code unit");
4720 val = -1;
4721 }
4722 return val;
4723}
4724
4725/**
4726 * xmlFAParseCharClassEsc:
4727 * @ctxt: a regexp parser context
4728 *
4729 * [23] charClassEsc ::= ( SingleCharEsc | MultiCharEsc | catEsc | complEsc )
4730 * [24] SingleCharEsc ::= '\' [nrt\|.?*+(){}#x2D#x5B#x5D#x5E]
4731 * [25] catEsc ::= '\p{' charProp '}'
4732 * [26] complEsc ::= '\P{' charProp '}'
4733 * [37] MultiCharEsc ::= '.' | ('\' [sSiIcCdDwW])
4734 */
4735static void
4736xmlFAParseCharClassEsc(xmlRegParserCtxtPtr ctxt) {
4737 int cur;
4738
4739 if (CUR == '.') {
4740 if (ctxt->atom == NULL) {
4741 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_ANYCHAR);
4742 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4743 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4744 XML_REGEXP_ANYCHAR, 0, 0, NULL);
4745 }
4746 NEXT;
4747 return;
4748 }
4749 if (CUR != '\\') {
4750 ERROR("Escaped sequence: expecting \\");
4751 return;
4752 }
4753 NEXT;
4754 cur = CUR;
4755 if (cur == 'p') {
4756 NEXT;
4757 if (CUR != '{') {
4758 ERROR("Expecting '{'");
4759 return;
4760 }
4761 NEXT;
4762 xmlFAParseCharProp(ctxt);
4763 if (CUR != '}') {
4764 ERROR("Expecting '}'");
4765 return;
4766 }
4767 NEXT;
4768 } else if (cur == 'P') {
4769 NEXT;
4770 if (CUR != '{') {
4771 ERROR("Expecting '{'");
4772 return;
4773 }
4774 NEXT;
4775 xmlFAParseCharProp(ctxt);
4776 if (ctxt->atom != NULL)
4777 ctxt->atom->neg = 1;
4778 if (CUR != '}') {
4779 ERROR("Expecting '}'");
4780 return;
4781 }
4782 NEXT;
4783 } else if ((cur == 'n') || (cur == 'r') || (cur == 't') || (cur == '\\') ||
4784 (cur == '|') || (cur == '.') || (cur == '?') || (cur == '*') ||
4785 (cur == '+') || (cur == '(') || (cur == ')') || (cur == '{') ||
4786 (cur == '}') || (cur == 0x2D) || (cur == 0x5B) || (cur == 0x5D) ||
4787 (cur == 0x5E) ||
4788
4789 /* Non-standard escape sequences:
4790 * Java 1.8|.NET Core 3.1|MSXML 6 */
4791 (cur == '!') || /* + | + | + */
4792 (cur == '"') || /* + | + | + */
4793 (cur == '#') || /* + | + | + */
4794 (cur == '$') || /* + | + | + */
4795 (cur == '%') || /* + | + | + */
4796 (cur == ',') || /* + | + | + */
4797 (cur == '/') || /* + | + | + */
4798 (cur == ':') || /* + | + | + */
4799 (cur == ';') || /* + | + | + */
4800 (cur == '=') || /* + | + | + */
4801 (cur == '>') || /* | + | + */
4802 (cur == '@') || /* + | + | + */
4803 (cur == '`') || /* + | + | + */
4804 (cur == '~') || /* + | + | + */
4805 (cur == 'u')) { /* | + | + */
4806 if (ctxt->atom == NULL) {
4807 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
4808 if (ctxt->atom != NULL) {
4809 switch (cur) {
4810 case 'n':
4811 ctxt->atom->codepoint = '\n';
4812 break;
4813 case 'r':
4814 ctxt->atom->codepoint = '\r';
4815 break;
4816 case 't':
4817 ctxt->atom->codepoint = '\t';
4818 break;
4819 case 'u':
4820 cur = parse_escaped_codepoint(ctxt);
4821 if (cur < 0) {
4822 return;
4823 }
4824 ctxt->atom->codepoint = cur;
4825 break;
4826 default:
4827 ctxt->atom->codepoint = cur;
4828 }
4829 }
4830 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4831 switch (cur) {
4832 case 'n':
4833 cur = '\n';
4834 break;
4835 case 'r':
4836 cur = '\r';
4837 break;
4838 case 't':
4839 cur = '\t';
4840 break;
4841 }
4842 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4843 XML_REGEXP_CHARVAL, cur, cur, NULL);
4844 }
4845 NEXT;
4846 } else if ((cur == 's') || (cur == 'S') || (cur == 'i') || (cur == 'I') ||
4847 (cur == 'c') || (cur == 'C') || (cur == 'd') || (cur == 'D') ||
4848 (cur == 'w') || (cur == 'W')) {
4849 xmlRegAtomType type = XML_REGEXP_ANYSPACE;
4850
4851 switch (cur) {
4852 case 's':
4853 type = XML_REGEXP_ANYSPACE;
4854 break;
4855 case 'S':
4856 type = XML_REGEXP_NOTSPACE;
4857 break;
4858 case 'i':
4859 type = XML_REGEXP_INITNAME;
4860 break;
4861 case 'I':
4862 type = XML_REGEXP_NOTINITNAME;
4863 break;
4864 case 'c':
4865 type = XML_REGEXP_NAMECHAR;
4866 break;
4867 case 'C':
4868 type = XML_REGEXP_NOTNAMECHAR;
4869 break;
4870 case 'd':
4871 type = XML_REGEXP_DECIMAL;
4872 break;
4873 case 'D':
4874 type = XML_REGEXP_NOTDECIMAL;
4875 break;
4876 case 'w':
4877 type = XML_REGEXP_REALCHAR;
4878 break;
4879 case 'W':
4880 type = XML_REGEXP_NOTREALCHAR;
4881 break;
4882 }
4883 NEXT;
4884 if (ctxt->atom == NULL) {
4885 ctxt->atom = xmlRegNewAtom(ctxt, type);
4886 } else if (ctxt->atom->type == XML_REGEXP_RANGES) {
4887 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4888 type, 0, 0, NULL);
4889 }
4890 } else {
4891 ERROR("Wrong escape sequence, misuse of character '\\'");
4892 }
4893}
4894
4895/**
4896 * xmlFAParseCharRange:
4897 * @ctxt: a regexp parser context
4898 *
4899 * [17] charRange ::= seRange | XmlCharRef | XmlCharIncDash
4900 * [18] seRange ::= charOrEsc '-' charOrEsc
4901 * [20] charOrEsc ::= XmlChar | SingleCharEsc
4902 * [21] XmlChar ::= [^\#x2D#x5B#x5D]
4903 * [22] XmlCharIncDash ::= [^\#x5B#x5D]
4904 */
4905static void
4906xmlFAParseCharRange(xmlRegParserCtxtPtr ctxt) {
4907 int cur, len;
4908 int start = -1;
4909 int end = -1;
4910
4911 if (CUR == '\0') {
4912 ERROR("Expecting ']'");
4913 return;
4914 }
4915
4916 cur = CUR;
4917 if (cur == '\\') {
4918 NEXT;
4919 cur = CUR;
4920 switch (cur) {
4921 case 'n': start = 0xA; break;
4922 case 'r': start = 0xD; break;
4923 case 't': start = 0x9; break;
4924 case '\\': case '|': case '.': case '-': case '^': case '?':
4925 case '*': case '+': case '{': case '}': case '(': case ')':
4926 case '[': case ']':
4927 start = cur; break;
4928 default:
4929 ERROR("Invalid escape value");
4930 return;
4931 }
4932 end = start;
4933 len = 1;
4934 } else if ((cur != 0x5B) && (cur != 0x5D)) {
4935 len = 4;
4936 end = start = xmlGetUTF8Char(ctxt->cur, &len);
4937 if (start < 0) {
4938 ERROR("Invalid UTF-8");
4939 return;
4940 }
4941 } else {
4942 ERROR("Expecting a char range");
4943 return;
4944 }
4945 /*
4946 * Since we are "inside" a range, we can assume ctxt->cur is past
4947 * the start of ctxt->string, and PREV should be safe
4948 */
4949 if ((start == '-') && (NXT(1) != ']') && (PREV != '[') && (PREV != '^')) {
4950 NEXTL(len);
4951 return;
4952 }
4953 NEXTL(len);
4954 cur = CUR;
4955 if ((cur != '-') || (NXT(1) == '[') || (NXT(1) == ']')) {
4956 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4957 XML_REGEXP_CHARVAL, start, end, NULL);
4958 return;
4959 }
4960 NEXT;
4961 cur = CUR;
4962 if (cur == '\\') {
4963 NEXT;
4964 cur = CUR;
4965 switch (cur) {
4966 case 'n': end = 0xA; break;
4967 case 'r': end = 0xD; break;
4968 case 't': end = 0x9; break;
4969 case '\\': case '|': case '.': case '-': case '^': case '?':
4970 case '*': case '+': case '{': case '}': case '(': case ')':
4971 case '[': case ']':
4972 end = cur; break;
4973 default:
4974 ERROR("Invalid escape value");
4975 return;
4976 }
4977 len = 1;
4978 } else if ((cur != '\0') && (cur != 0x5B) && (cur != 0x5D)) {
4979 len = 4;
4980 end = xmlGetUTF8Char(ctxt->cur, &len);
4981 if (end < 0) {
4982 ERROR("Invalid UTF-8");
4983 return;
4984 }
4985 } else {
4986 ERROR("Expecting the end of a char range");
4987 return;
4988 }
4989
4990 /* TODO check that the values are acceptable character ranges for XML */
4991 if (end < start) {
4992 ERROR("End of range is before start of range");
4993 } else {
4994 NEXTL(len);
4995 xmlRegAtomAddRange(ctxt, ctxt->atom, ctxt->neg,
4996 XML_REGEXP_CHARVAL, start, end, NULL);
4997 }
4998 return;
4999}
5000
5001/**
5002 * xmlFAParsePosCharGroup:
5003 * @ctxt: a regexp parser context
5004 *
5005 * [14] posCharGroup ::= ( charRange | charClassEsc )+
5006 */
5007static void
5008xmlFAParsePosCharGroup(xmlRegParserCtxtPtr ctxt) {
5009 do {
5010 if (CUR == '\\') {
5011 xmlFAParseCharClassEsc(ctxt);
5012 } else {
5013 xmlFAParseCharRange(ctxt);
5014 }
5015 } while ((CUR != ']') && (CUR != '-') &&
5016 (CUR != 0) && (ctxt->error == 0));
5017}
5018
5019/**
5020 * xmlFAParseCharGroup:
5021 * @ctxt: a regexp parser context
5022 *
5023 * [13] charGroup ::= posCharGroup | negCharGroup | charClassSub
5024 * [15] negCharGroup ::= '^' posCharGroup
5025 * [16] charClassSub ::= ( posCharGroup | negCharGroup ) '-' charClassExpr
5026 * [12] charClassExpr ::= '[' charGroup ']'
5027 */
5028static void
5029xmlFAParseCharGroup(xmlRegParserCtxtPtr ctxt) {
5030 int neg = ctxt->neg;
5031
5032 if (CUR == '^') {
5033 NEXT;
5034 ctxt->neg = !ctxt->neg;
5035 xmlFAParsePosCharGroup(ctxt);
5036 ctxt->neg = neg;
5037 }
5038 while ((CUR != ']') && (ctxt->error == 0)) {
5039 if ((CUR == '-') && (NXT(1) == '[')) {
5040 NEXT; /* eat the '-' */
5041 NEXT; /* eat the '[' */
5042 ctxt->neg = 2;
5043 xmlFAParseCharGroup(ctxt);
5044 ctxt->neg = neg;
5045 if (CUR == ']') {
5046 NEXT;
5047 } else {
5048 ERROR("charClassExpr: ']' expected");
5049 }
5050 break;
5051 } else {
5052 xmlFAParsePosCharGroup(ctxt);
5053 }
5054 }
5055}
5056
5057/**
5058 * xmlFAParseCharClass:
5059 * @ctxt: a regexp parser context
5060 *
5061 * [11] charClass ::= charClassEsc | charClassExpr
5062 * [12] charClassExpr ::= '[' charGroup ']'
5063 */
5064static void
5065xmlFAParseCharClass(xmlRegParserCtxtPtr ctxt) {
5066 if (CUR == '[') {
5067 NEXT;
5068 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_RANGES);
5069 if (ctxt->atom == NULL)
5070 return;
5071 xmlFAParseCharGroup(ctxt);
5072 if (CUR == ']') {
5073 NEXT;
5074 } else {
5075 ERROR("xmlFAParseCharClass: ']' expected");
5076 }
5077 } else {
5078 xmlFAParseCharClassEsc(ctxt);
5079 }
5080}
5081
5082/**
5083 * xmlFAParseQuantExact:
5084 * @ctxt: a regexp parser context
5085 *
5086 * [8] QuantExact ::= [0-9]+
5087 *
5088 * Returns 0 if success or -1 in case of error
5089 */
5090static int
5091xmlFAParseQuantExact(xmlRegParserCtxtPtr ctxt) {
5092 int ret = 0;
5093 int ok = 0;
5094 int overflow = 0;
5095
5096 while ((CUR >= '0') && (CUR <= '9')) {
5097 if (ret > INT_MAX / 10) {
5098 overflow = 1;
5099 } else {
5100 int digit = CUR - '0';
5101
5102 ret *= 10;
5103 if (ret > INT_MAX - digit)
5104 overflow = 1;
5105 else
5106 ret += digit;
5107 }
5108 ok = 1;
5109 NEXT;
5110 }
5111 if ((ok != 1) || (overflow == 1)) {
5112 return(-1);
5113 }
5114 return(ret);
5115}
5116
5117/**
5118 * xmlFAParseQuantifier:
5119 * @ctxt: a regexp parser context
5120 *
5121 * [4] quantifier ::= [?*+] | ( '{' quantity '}' )
5122 * [5] quantity ::= quantRange | quantMin | QuantExact
5123 * [6] quantRange ::= QuantExact ',' QuantExact
5124 * [7] quantMin ::= QuantExact ','
5125 * [8] QuantExact ::= [0-9]+
5126 */
5127static int
5128xmlFAParseQuantifier(xmlRegParserCtxtPtr ctxt) {
5129 int cur;
5130
5131 cur = CUR;
5132 if ((cur == '?') || (cur == '*') || (cur == '+')) {
5133 if (ctxt->atom != NULL) {
5134 if (cur == '?')
5135 ctxt->atom->quant = XML_REGEXP_QUANT_OPT;
5136 else if (cur == '*')
5137 ctxt->atom->quant = XML_REGEXP_QUANT_MULT;
5138 else if (cur == '+')
5139 ctxt->atom->quant = XML_REGEXP_QUANT_PLUS;
5140 }
5141 NEXT;
5142 return(1);
5143 }
5144 if (cur == '{') {
5145 int min = 0, max = 0;
5146
5147 NEXT;
5148 cur = xmlFAParseQuantExact(ctxt);
5149 if (cur >= 0)
5150 min = cur;
5151 else {
5152 ERROR("Improper quantifier");
5153 }
5154 if (CUR == ',') {
5155 NEXT;
5156 if (CUR == '}')
5157 max = INT_MAX;
5158 else {
5159 cur = xmlFAParseQuantExact(ctxt);
5160 if (cur >= 0)
5161 max = cur;
5162 else {
5163 ERROR("Improper quantifier");
5164 }
5165 }
5166 }
5167 if (CUR == '}') {
5168 NEXT;
5169 } else {
5170 ERROR("Unterminated quantifier");
5171 }
5172 if (max == 0)
5173 max = min;
5174 if (ctxt->atom != NULL) {
5175 ctxt->atom->quant = XML_REGEXP_QUANT_RANGE;
5176 ctxt->atom->min = min;
5177 ctxt->atom->max = max;
5178 }
5179 return(1);
5180 }
5181 return(0);
5182}
5183
5184/**
5185 * xmlFAParseAtom:
5186 * @ctxt: a regexp parser context
5187 *
5188 * [9] atom ::= Char | charClass | ( '(' regExp ')' )
5189 */
5190static int
5191xmlFAParseAtom(xmlRegParserCtxtPtr ctxt) {
5192 int codepoint, len;
5193
5194 codepoint = xmlFAIsChar(ctxt);
5195 if (codepoint > 0) {
5196 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_CHARVAL);
5197 if (ctxt->atom == NULL)
5198 return(-1);
5199 len = 4;
5200 codepoint = xmlGetUTF8Char(ctxt->cur, &len);
5201 if (codepoint < 0) {
5202 ERROR("Invalid UTF-8");
5203 return(-1);
5204 }
5205 ctxt->atom->codepoint = codepoint;
5206 NEXTL(len);
5207 return(1);
5208 } else if (CUR == '|') {
5209 return(0);
5210 } else if (CUR == 0) {
5211 return(0);
5212 } else if (CUR == ')') {
5213 return(0);
5214 } else if (CUR == '(') {
5215 xmlRegStatePtr start, oldend, start0;
5216
5217 NEXT;
5218 if (ctxt->depth >= 50) {
5219 ERROR("xmlFAParseAtom: maximum nesting depth exceeded");
5220 return(-1);
5221 }
5222 /*
5223 * this extra Epsilon transition is needed if we count with 0 allowed
5224 * unfortunately this can't be known at that point
5225 */
5226 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL);
5227 start0 = ctxt->state;
5228 xmlFAGenerateEpsilonTransition(ctxt, ctxt->state, NULL);
5229 start = ctxt->state;
5230 oldend = ctxt->end;
5231 ctxt->end = NULL;
5232 ctxt->atom = NULL;
5233 ctxt->depth++;
5234 xmlFAParseRegExp(ctxt, 0);
5235 ctxt->depth--;
5236 if (CUR == ')') {
5237 NEXT;
5238 } else {
5239 ERROR("xmlFAParseAtom: expecting ')'");
5240 }
5241 ctxt->atom = xmlRegNewAtom(ctxt, XML_REGEXP_SUBREG);
5242 if (ctxt->atom == NULL)
5243 return(-1);
5244 ctxt->atom->start = start;
5245 ctxt->atom->start0 = start0;
5246 ctxt->atom->stop = ctxt->state;
5247 ctxt->end = oldend;
5248 return(1);
5249 } else if ((CUR == '[') || (CUR == '\\') || (CUR == '.')) {
5250 xmlFAParseCharClass(ctxt);
5251 return(1);
5252 }
5253 return(0);
5254}
5255
5256/**
5257 * xmlFAParsePiece:
5258 * @ctxt: a regexp parser context
5259 *
5260 * [3] piece ::= atom quantifier?
5261 */
5262static int
5263xmlFAParsePiece(xmlRegParserCtxtPtr ctxt) {
5264 int ret;
5265
5266 ctxt->atom = NULL;
5267 ret = xmlFAParseAtom(ctxt);
5268 if (ret == 0)
5269 return(0);
5270 if (ctxt->atom == NULL) {
5271 ERROR("internal: no atom generated");
5272 }
5273 xmlFAParseQuantifier(ctxt);
5274 return(1);
5275}
5276
5277/**
5278 * xmlFAParseBranch:
5279 * @ctxt: a regexp parser context
5280 * @to: optional target to the end of the branch
5281 *
5282 * @to is used to optimize by removing duplicate path in automata
5283 * in expressions like (a|b)(c|d)
5284 *
5285 * [2] branch ::= piece*
5286 */
5287static int
5288xmlFAParseBranch(xmlRegParserCtxtPtr ctxt, xmlRegStatePtr to) {
5289 xmlRegStatePtr previous;
5290 int ret;
5291
5292 previous = ctxt->state;
5293 ret = xmlFAParsePiece(ctxt);
5294 if (ret == 0) {
5295 /* Empty branch */
5296 xmlFAGenerateEpsilonTransition(ctxt, previous, to);
5297 } else {
5298 if (xmlFAGenerateTransitions(ctxt, previous,
5299 (CUR=='|' || CUR==')' || CUR==0) ? to : NULL,
5300 ctxt->atom) < 0) {
5301 xmlRegFreeAtom(ctxt->atom);
5302 ctxt->atom = NULL;
5303 return(-1);
5304 }
5305 previous = ctxt->state;
5306 ctxt->atom = NULL;
5307 }
5308 while ((ret != 0) && (ctxt->error == 0)) {
5309 ret = xmlFAParsePiece(ctxt);
5310 if (ret != 0) {
5311 if (xmlFAGenerateTransitions(ctxt, previous,
5312 (CUR=='|' || CUR==')' || CUR==0) ? to : NULL,
5313 ctxt->atom) < 0) {
5314 xmlRegFreeAtom(ctxt->atom);
5315 ctxt->atom = NULL;
5316 return(-1);
5317 }
5318 previous = ctxt->state;
5319 ctxt->atom = NULL;
5320 }
5321 }
5322 return(0);
5323}
5324
5325/**
5326 * xmlFAParseRegExp:
5327 * @ctxt: a regexp parser context
5328 * @top: is this the top-level expression ?
5329 *
5330 * [1] regExp ::= branch ( '|' branch )*
5331 */
5332static void
5333xmlFAParseRegExp(xmlRegParserCtxtPtr ctxt, int top) {
5334 xmlRegStatePtr start, end;
5335
5336 /* if not top start should have been generated by an epsilon trans */
5337 start = ctxt->state;
5338 ctxt->end = NULL;
5339 xmlFAParseBranch(ctxt, NULL);
5340 if (top) {
5341 ctxt->state->type = XML_REGEXP_FINAL_STATE;
5342 }
5343 if (CUR != '|') {
5344 ctxt->end = ctxt->state;
5345 return;
5346 }
5347 end = ctxt->state;
5348 while ((CUR == '|') && (ctxt->error == 0)) {
5349 NEXT;
5350 ctxt->state = start;
5351 ctxt->end = NULL;
5352 xmlFAParseBranch(ctxt, end);
5353 }
5354 if (!top) {
5355 ctxt->state = end;
5356 ctxt->end = end;
5357 }
5358}
5359
5360/************************************************************************
5361 * *
5362 * The basic API *
5363 * *
5364 ************************************************************************/
5365
5366/**
5367 * xmlRegexpPrint:
5368 * @output: the file for the output debug
5369 * @regexp: the compiled regexp
5370 *
5371 * Print the content of the compiled regular expression
5372 */
5373void
5374xmlRegexpPrint(FILE *output, xmlRegexpPtr regexp) {
5375 int i;
5376
5377 if (output == NULL)
5378 return;
5379 fprintf(output, " regexp: ");
5380 if (regexp == NULL) {
5381 fprintf(output, "NULL\n");
5382 return;
5383 }
5384 fprintf(output, "'%s' ", regexp->string);
5385 fprintf(output, "\n");
5386 fprintf(output, "%d atoms:\n", regexp->nbAtoms);
5387 for (i = 0;i < regexp->nbAtoms; i++) {
5388 fprintf(output, " %02d ", i);
5389 xmlRegPrintAtom(output, regexp->atoms[i]);
5390 }
5391 fprintf(output, "%d states:", regexp->nbStates);
5392 fprintf(output, "\n");
5393 for (i = 0;i < regexp->nbStates; i++) {
5394 xmlRegPrintState(output, regexp->states[i]);
5395 }
5396 fprintf(output, "%d counters:\n", regexp->nbCounters);
5397 for (i = 0;i < regexp->nbCounters; i++) {
5398 fprintf(output, " %d: min %d max %d\n", i, regexp->counters[i].min,
5399 regexp->counters[i].max);
5400 }
5401}
5402
5403/**
5404 * xmlRegexpCompile:
5405 * @regexp: a regular expression string
5406 *
5407 * Parses a regular expression conforming to XML Schemas Part 2 Datatype
5408 * Appendix F and builds an automata suitable for testing strings against
5409 * that regular expression
5410 *
5411 * Returns the compiled expression or NULL in case of error
5412 */
5413xmlRegexpPtr
5414xmlRegexpCompile(const xmlChar *regexp) {
5415 xmlRegexpPtr ret = NULL;
5416 xmlRegParserCtxtPtr ctxt;
5417
5418 if (regexp == NULL)
5419 return(NULL);
5420
5421 ctxt = xmlRegNewParserCtxt(regexp);
5422 if (ctxt == NULL)
5423 return(NULL);
5424
5425 /* initialize the parser */
5426 ctxt->state = xmlRegStatePush(ctxt);
5427 if (ctxt->state == NULL)
5428 goto error;
5429 ctxt->start = ctxt->state;
5430 ctxt->end = NULL;
5431
5432 /* parse the expression building an automata */
5433 xmlFAParseRegExp(ctxt, 1);
5434 if (CUR != 0) {
5435 ERROR("xmlFAParseRegExp: extra characters");
5436 }
5437 if (ctxt->error != 0)
5438 goto error;
5439 ctxt->end = ctxt->state;
5440 ctxt->start->type = XML_REGEXP_START_STATE;
5441 ctxt->end->type = XML_REGEXP_FINAL_STATE;
5442
5443 /* remove the Epsilon except for counted transitions */
5444 xmlFAEliminateEpsilonTransitions(ctxt);
5445
5446
5447 if (ctxt->error != 0)
5448 goto error;
5449 ret = xmlRegEpxFromParse(ctxt);
5450
5451error:
5452 xmlRegFreeParserCtxt(ctxt);
5453 return(ret);
5454}
5455
5456/**
5457 * xmlRegexpExec:
5458 * @comp: the compiled regular expression
5459 * @content: the value to check against the regular expression
5460 *
5461 * Check if the regular expression generates the value
5462 *
5463 * Returns 1 if it matches, 0 if not and a negative value in case of error
5464 */
5465int
5466xmlRegexpExec(xmlRegexpPtr comp, const xmlChar *content) {
5467 if ((comp == NULL) || (content == NULL))
5468 return(-1);
5469 return(xmlFARegExec(comp, content));
5470}
5471
5472/**
5473 * xmlRegexpIsDeterminist:
5474 * @comp: the compiled regular expression
5475 *
5476 * Check if the regular expression is determinist
5477 *
5478 * Returns 1 if it yes, 0 if not and a negative value in case of error
5479 */
5480int
5481xmlRegexpIsDeterminist(xmlRegexpPtr comp) {
5482 xmlAutomataPtr am;
5483 int ret;
5484
5485 if (comp == NULL)
5486 return(-1);
5487 if (comp->determinist != -1)
5488 return(comp->determinist);
5489
5490 am = xmlNewAutomata();
5491 if (am == NULL)
5492 return(-1);
5493 if (am->states != NULL) {
5494 int i;
5495
5496 for (i = 0;i < am->nbStates;i++)
5497 xmlRegFreeState(am->states[i]);
5498 xmlFree(am->states);
5499 }
5500 am->nbAtoms = comp->nbAtoms;
5501 am->atoms = comp->atoms;
5502 am->nbStates = comp->nbStates;
5503 am->states = comp->states;
5504 am->determinist = -1;
5505 am->flags = comp->flags;
5506 ret = xmlFAComputesDeterminism(am);
5507 am->atoms = NULL;
5508 am->states = NULL;
5509 xmlFreeAutomata(am);
5510 comp->determinist = ret;
5511 return(ret);
5512}
5513
5514/**
5515 * xmlRegFreeRegexp:
5516 * @regexp: the regexp
5517 *
5518 * Free a regexp
5519 */
5520void
5521xmlRegFreeRegexp(xmlRegexpPtr regexp) {
5522 int i;
5523 if (regexp == NULL)
5524 return;
5525
5526 if (regexp->string != NULL)
5527 xmlFree(regexp->string);
5528 if (regexp->states != NULL) {
5529 for (i = 0;i < regexp->nbStates;i++)
5530 xmlRegFreeState(regexp->states[i]);
5531 xmlFree(regexp->states);
5532 }
5533 if (regexp->atoms != NULL) {
5534 for (i = 0;i < regexp->nbAtoms;i++)
5535 xmlRegFreeAtom(regexp->atoms[i]);
5536 xmlFree(regexp->atoms);
5537 }
5538 if (regexp->counters != NULL)
5539 xmlFree(regexp->counters);
5540 if (regexp->compact != NULL)
5541 xmlFree(regexp->compact);
5542 if (regexp->transdata != NULL)
5543 xmlFree(regexp->transdata);
5544 if (regexp->stringMap != NULL) {
5545 for (i = 0; i < regexp->nbstrings;i++)
5546 xmlFree(regexp->stringMap[i]);
5547 xmlFree(regexp->stringMap);
5548 }
5549
5550 xmlFree(regexp);
5551}
5552
5553#ifdef LIBXML_AUTOMATA_ENABLED
5554/************************************************************************
5555 * *
5556 * The Automata interface *
5557 * *
5558 ************************************************************************/
5559
5560/**
5561 * xmlNewAutomata:
5562 *
5563 * Create a new automata
5564 *
5565 * Returns the new object or NULL in case of failure
5566 */
5567xmlAutomataPtr
5568xmlNewAutomata(void) {
5569 xmlAutomataPtr ctxt;
5570
5571 ctxt = xmlRegNewParserCtxt(NULL);
5572 if (ctxt == NULL)
5573 return(NULL);
5574
5575 /* initialize the parser */
5576 ctxt->state = xmlRegStatePush(ctxt);
5577 if (ctxt->state == NULL) {
5578 xmlFreeAutomata(ctxt);
5579 return(NULL);
5580 }
5581 ctxt->start = ctxt->state;
5582 ctxt->end = NULL;
5583
5584 ctxt->start->type = XML_REGEXP_START_STATE;
5585 ctxt->flags = 0;
5586
5587 return(ctxt);
5588}
5589
5590/**
5591 * xmlFreeAutomata:
5592 * @am: an automata
5593 *
5594 * Free an automata
5595 */
5596void
5597xmlFreeAutomata(xmlAutomataPtr am) {
5598 if (am == NULL)
5599 return;
5600 xmlRegFreeParserCtxt(am);
5601}
5602
5603/**
5604 * xmlAutomataSetFlags:
5605 * @am: an automata
5606 * @flags: a set of internal flags
5607 *
5608 * Set some flags on the automata
5609 */
5610void
5611xmlAutomataSetFlags(xmlAutomataPtr am, int flags) {
5612 if (am == NULL)
5613 return;
5614 am->flags |= flags;
5615}
5616
5617/**
5618 * xmlAutomataGetInitState:
5619 * @am: an automata
5620 *
5621 * Initial state lookup
5622 *
5623 * Returns the initial state of the automata
5624 */
5625xmlAutomataStatePtr
5626xmlAutomataGetInitState(xmlAutomataPtr am) {
5627 if (am == NULL)
5628 return(NULL);
5629 return(am->start);
5630}
5631
5632/**
5633 * xmlAutomataSetFinalState:
5634 * @am: an automata
5635 * @state: a state in this automata
5636 *
5637 * Makes that state a final state
5638 *
5639 * Returns 0 or -1 in case of error
5640 */
5641int
5642xmlAutomataSetFinalState(xmlAutomataPtr am, xmlAutomataStatePtr state) {
5643 if ((am == NULL) || (state == NULL))
5644 return(-1);
5645 state->type = XML_REGEXP_FINAL_STATE;
5646 return(0);
5647}
5648
5649/**
5650 * xmlAutomataNewTransition:
5651 * @am: an automata
5652 * @from: the starting point of the transition
5653 * @to: the target point of the transition or NULL
5654 * @token: the input string associated to that transition
5655 * @data: data passed to the callback function if the transition is activated
5656 *
5657 * If @to is NULL, this creates first a new target state in the automata
5658 * and then adds a transition from the @from state to the target state
5659 * activated by the value of @token
5660 *
5661 * Returns the target state or NULL in case of error
5662 */
5663xmlAutomataStatePtr
5664xmlAutomataNewTransition(xmlAutomataPtr am, xmlAutomataStatePtr from,
5665 xmlAutomataStatePtr to, const xmlChar *token,
5666 void *data) {
5667 xmlRegAtomPtr atom;
5668
5669 if ((am == NULL) || (from == NULL) || (token == NULL))
5670 return(NULL);
5671 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5672 if (atom == NULL)
5673 return(NULL);
5674 atom->data = data;
5675 atom->valuep = xmlStrdup(token);
5676 if (atom->valuep == NULL) {
5677 xmlRegFreeAtom(atom);
5678 xmlRegexpErrMemory(am);
5679 return(NULL);
5680 }
5681
5682 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
5683 xmlRegFreeAtom(atom);
5684 return(NULL);
5685 }
5686 if (to == NULL)
5687 return(am->state);
5688 return(to);
5689}
5690
5691/**
5692 * xmlAutomataNewTransition2:
5693 * @am: an automata
5694 * @from: the starting point of the transition
5695 * @to: the target point of the transition or NULL
5696 * @token: the first input string associated to that transition
5697 * @token2: the second input string associated to that transition
5698 * @data: data passed to the callback function if the transition is activated
5699 *
5700 * If @to is NULL, this creates first a new target state in the automata
5701 * and then adds a transition from the @from state to the target state
5702 * activated by the value of @token
5703 *
5704 * Returns the target state or NULL in case of error
5705 */
5706xmlAutomataStatePtr
5707xmlAutomataNewTransition2(xmlAutomataPtr am, xmlAutomataStatePtr from,
5708 xmlAutomataStatePtr to, const xmlChar *token,
5709 const xmlChar *token2, void *data) {
5710 xmlRegAtomPtr atom;
5711
5712 if ((am == NULL) || (from == NULL) || (token == NULL))
5713 return(NULL);
5714 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5715 if (atom == NULL)
5716 return(NULL);
5717 atom->data = data;
5718 if ((token2 == NULL) || (*token2 == 0)) {
5719 atom->valuep = xmlStrdup(token);
5720 } else {
5721 int lenn, lenp;
5722 xmlChar *str;
5723
5724 lenn = strlen((char *) token2);
5725 lenp = strlen((char *) token);
5726
5727 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
5728 if (str == NULL) {
5729 xmlRegFreeAtom(atom);
5730 return(NULL);
5731 }
5732 memcpy(&str[0], token, lenp);
5733 str[lenp] = '|';
5734 memcpy(&str[lenp + 1], token2, lenn);
5735 str[lenn + lenp + 1] = 0;
5736
5737 atom->valuep = str;
5738 }
5739
5740 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
5741 xmlRegFreeAtom(atom);
5742 return(NULL);
5743 }
5744 if (to == NULL)
5745 return(am->state);
5746 return(to);
5747}
5748
5749/**
5750 * xmlAutomataNewNegTrans:
5751 * @am: an automata
5752 * @from: the starting point of the transition
5753 * @to: the target point of the transition or NULL
5754 * @token: the first input string associated to that transition
5755 * @token2: the second input string associated to that transition
5756 * @data: data passed to the callback function if the transition is activated
5757 *
5758 * If @to is NULL, this creates first a new target state in the automata
5759 * and then adds a transition from the @from state to the target state
5760 * activated by any value except (@token,@token2)
5761 * Note that if @token2 is not NULL, then (X, NULL) won't match to follow
5762 # the semantic of XSD ##other
5763 *
5764 * Returns the target state or NULL in case of error
5765 */
5766xmlAutomataStatePtr
5767xmlAutomataNewNegTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
5768 xmlAutomataStatePtr to, const xmlChar *token,
5769 const xmlChar *token2, void *data) {
5770 xmlRegAtomPtr atom;
5771 xmlChar err_msg[200];
5772
5773 if ((am == NULL) || (from == NULL) || (token == NULL))
5774 return(NULL);
5775 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5776 if (atom == NULL)
5777 return(NULL);
5778 atom->data = data;
5779 atom->neg = 1;
5780 if ((token2 == NULL) || (*token2 == 0)) {
5781 atom->valuep = xmlStrdup(token);
5782 } else {
5783 int lenn, lenp;
5784 xmlChar *str;
5785
5786 lenn = strlen((char *) token2);
5787 lenp = strlen((char *) token);
5788
5789 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
5790 if (str == NULL) {
5791 xmlRegFreeAtom(atom);
5792 return(NULL);
5793 }
5794 memcpy(&str[0], token, lenp);
5795 str[lenp] = '|';
5796 memcpy(&str[lenp + 1], token2, lenn);
5797 str[lenn + lenp + 1] = 0;
5798
5799 atom->valuep = str;
5800 }
5801 snprintf((char *) err_msg, 199, "not %s", (const char *) atom->valuep);
5802 err_msg[199] = 0;
5803 atom->valuep2 = xmlStrdup(err_msg);
5804
5805 if (xmlFAGenerateTransitions(am, from, to, atom) < 0) {
5806 xmlRegFreeAtom(atom);
5807 return(NULL);
5808 }
5809 am->negs++;
5810 if (to == NULL)
5811 return(am->state);
5812 return(to);
5813}
5814
5815/**
5816 * xmlAutomataNewCountTrans2:
5817 * @am: an automata
5818 * @from: the starting point of the transition
5819 * @to: the target point of the transition or NULL
5820 * @token: the input string associated to that transition
5821 * @token2: the second input string associated to that transition
5822 * @min: the minimum successive occurrences of token
5823 * @max: the maximum successive occurrences of token
5824 * @data: data associated to the transition
5825 *
5826 * If @to is NULL, this creates first a new target state in the automata
5827 * and then adds a transition from the @from state to the target state
5828 * activated by a succession of input of value @token and @token2 and
5829 * whose number is between @min and @max
5830 *
5831 * Returns the target state or NULL in case of error
5832 */
5833xmlAutomataStatePtr
5834xmlAutomataNewCountTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from,
5835 xmlAutomataStatePtr to, const xmlChar *token,
5836 const xmlChar *token2,
5837 int min, int max, void *data) {
5838 xmlRegAtomPtr atom;
5839 int counter;
5840
5841 if ((am == NULL) || (from == NULL) || (token == NULL))
5842 return(NULL);
5843 if (min < 0)
5844 return(NULL);
5845 if ((max < min) || (max < 1))
5846 return(NULL);
5847 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5848 if (atom == NULL)
5849 return(NULL);
5850 if ((token2 == NULL) || (*token2 == 0)) {
5851 atom->valuep = xmlStrdup(token);
5852 if (atom->valuep == NULL)
5853 goto error;
5854 } else {
5855 int lenn, lenp;
5856 xmlChar *str;
5857
5858 lenn = strlen((char *) token2);
5859 lenp = strlen((char *) token);
5860
5861 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
5862 if (str == NULL)
5863 goto error;
5864 memcpy(&str[0], token, lenp);
5865 str[lenp] = '|';
5866 memcpy(&str[lenp + 1], token2, lenn);
5867 str[lenn + lenp + 1] = 0;
5868
5869 atom->valuep = str;
5870 }
5871 atom->data = data;
5872 if (min == 0)
5873 atom->min = 1;
5874 else
5875 atom->min = min;
5876 atom->max = max;
5877
5878 /*
5879 * associate a counter to the transition.
5880 */
5881 counter = xmlRegGetCounter(am);
5882 if (counter < 0)
5883 goto error;
5884 am->counters[counter].min = min;
5885 am->counters[counter].max = max;
5886
5887 /* xmlFAGenerateTransitions(am, from, to, atom); */
5888 if (to == NULL) {
5889 to = xmlRegStatePush(am);
5890 if (to == NULL)
5891 goto error;
5892 }
5893 xmlRegStateAddTrans(am, from, atom, to, counter, -1);
5894 if (xmlRegAtomPush(am, atom) < 0)
5895 goto error;
5896 am->state = to;
5897
5898 if (to == NULL)
5899 to = am->state;
5900 if (to == NULL)
5901 return(NULL);
5902 if (min == 0)
5903 xmlFAGenerateEpsilonTransition(am, from, to);
5904 return(to);
5905
5906error:
5907 xmlRegFreeAtom(atom);
5908 return(NULL);
5909}
5910
5911/**
5912 * xmlAutomataNewCountTrans:
5913 * @am: an automata
5914 * @from: the starting point of the transition
5915 * @to: the target point of the transition or NULL
5916 * @token: the input string associated to that transition
5917 * @min: the minimum successive occurrences of token
5918 * @max: the maximum successive occurrences of token
5919 * @data: data associated to the transition
5920 *
5921 * If @to is NULL, this creates first a new target state in the automata
5922 * and then adds a transition from the @from state to the target state
5923 * activated by a succession of input of value @token and whose number
5924 * is between @min and @max
5925 *
5926 * Returns the target state or NULL in case of error
5927 */
5928xmlAutomataStatePtr
5929xmlAutomataNewCountTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
5930 xmlAutomataStatePtr to, const xmlChar *token,
5931 int min, int max, void *data) {
5932 xmlRegAtomPtr atom;
5933 int counter;
5934
5935 if ((am == NULL) || (from == NULL) || (token == NULL))
5936 return(NULL);
5937 if (min < 0)
5938 return(NULL);
5939 if ((max < min) || (max < 1))
5940 return(NULL);
5941 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
5942 if (atom == NULL)
5943 return(NULL);
5944 atom->valuep = xmlStrdup(token);
5945 if (atom->valuep == NULL)
5946 goto error;
5947 atom->data = data;
5948 if (min == 0)
5949 atom->min = 1;
5950 else
5951 atom->min = min;
5952 atom->max = max;
5953
5954 /*
5955 * associate a counter to the transition.
5956 */
5957 counter = xmlRegGetCounter(am);
5958 if (counter < 0)
5959 goto error;
5960 am->counters[counter].min = min;
5961 am->counters[counter].max = max;
5962
5963 /* xmlFAGenerateTransitions(am, from, to, atom); */
5964 if (to == NULL) {
5965 to = xmlRegStatePush(am);
5966 if (to == NULL)
5967 goto error;
5968 }
5969 xmlRegStateAddTrans(am, from, atom, to, counter, -1);
5970 if (xmlRegAtomPush(am, atom) < 0)
5971 goto error;
5972 am->state = to;
5973
5974 if (to == NULL)
5975 to = am->state;
5976 if (to == NULL)
5977 return(NULL);
5978 if (min == 0)
5979 xmlFAGenerateEpsilonTransition(am, from, to);
5980 return(to);
5981
5982error:
5983 xmlRegFreeAtom(atom);
5984 return(NULL);
5985}
5986
5987/**
5988 * xmlAutomataNewOnceTrans2:
5989 * @am: an automata
5990 * @from: the starting point of the transition
5991 * @to: the target point of the transition or NULL
5992 * @token: the input string associated to that transition
5993 * @token2: the second input string associated to that transition
5994 * @min: the minimum successive occurrences of token
5995 * @max: the maximum successive occurrences of token
5996 * @data: data associated to the transition
5997 *
5998 * If @to is NULL, this creates first a new target state in the automata
5999 * and then adds a transition from the @from state to the target state
6000 * activated by a succession of input of value @token and @token2 and whose
6001 * number is between @min and @max, moreover that transition can only be
6002 * crossed once.
6003 *
6004 * Returns the target state or NULL in case of error
6005 */
6006xmlAutomataStatePtr
6007xmlAutomataNewOnceTrans2(xmlAutomataPtr am, xmlAutomataStatePtr from,
6008 xmlAutomataStatePtr to, const xmlChar *token,
6009 const xmlChar *token2,
6010 int min, int max, void *data) {
6011 xmlRegAtomPtr atom;
6012 int counter;
6013
6014 if ((am == NULL) || (from == NULL) || (token == NULL))
6015 return(NULL);
6016 if (min < 1)
6017 return(NULL);
6018 if (max < min)
6019 return(NULL);
6020 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
6021 if (atom == NULL)
6022 return(NULL);
6023 if ((token2 == NULL) || (*token2 == 0)) {
6024 atom->valuep = xmlStrdup(token);
6025 if (atom->valuep == NULL)
6026 goto error;
6027 } else {
6028 int lenn, lenp;
6029 xmlChar *str;
6030
6031 lenn = strlen((char *) token2);
6032 lenp = strlen((char *) token);
6033
6034 str = (xmlChar *) xmlMallocAtomic(lenn + lenp + 2);
6035 if (str == NULL)
6036 goto error;
6037 memcpy(&str[0], token, lenp);
6038 str[lenp] = '|';
6039 memcpy(&str[lenp + 1], token2, lenn);
6040 str[lenn + lenp + 1] = 0;
6041
6042 atom->valuep = str;
6043 }
6044 atom->data = data;
6045 atom->quant = XML_REGEXP_QUANT_ONCEONLY;
6046 atom->min = min;
6047 atom->max = max;
6048 /*
6049 * associate a counter to the transition.
6050 */
6051 counter = xmlRegGetCounter(am);
6052 if (counter < 0)
6053 goto error;
6054 am->counters[counter].min = 1;
6055 am->counters[counter].max = 1;
6056
6057 /* xmlFAGenerateTransitions(am, from, to, atom); */
6058 if (to == NULL) {
6059 to = xmlRegStatePush(am);
6060 if (to == NULL)
6061 goto error;
6062 }
6063 xmlRegStateAddTrans(am, from, atom, to, counter, -1);
6064 if (xmlRegAtomPush(am, atom) < 0)
6065 goto error;
6066 am->state = to;
6067 return(to);
6068
6069error:
6070 xmlRegFreeAtom(atom);
6071 return(NULL);
6072}
6073
6074
6075
6076/**
6077 * xmlAutomataNewOnceTrans:
6078 * @am: an automata
6079 * @from: the starting point of the transition
6080 * @to: the target point of the transition or NULL
6081 * @token: the input string associated to that transition
6082 * @min: the minimum successive occurrences of token
6083 * @max: the maximum successive occurrences of token
6084 * @data: data associated to the transition
6085 *
6086 * If @to is NULL, this creates first a new target state in the automata
6087 * and then adds a transition from the @from state to the target state
6088 * activated by a succession of input of value @token and whose number
6089 * is between @min and @max, moreover that transition can only be crossed
6090 * once.
6091 *
6092 * Returns the target state or NULL in case of error
6093 */
6094xmlAutomataStatePtr
6095xmlAutomataNewOnceTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6096 xmlAutomataStatePtr to, const xmlChar *token,
6097 int min, int max, void *data) {
6098 xmlRegAtomPtr atom;
6099 int counter;
6100
6101 if ((am == NULL) || (from == NULL) || (token == NULL))
6102 return(NULL);
6103 if (min < 1)
6104 return(NULL);
6105 if (max < min)
6106 return(NULL);
6107 atom = xmlRegNewAtom(am, XML_REGEXP_STRING);
6108 if (atom == NULL)
6109 return(NULL);
6110 atom->valuep = xmlStrdup(token);
6111 atom->data = data;
6112 atom->quant = XML_REGEXP_QUANT_ONCEONLY;
6113 atom->min = min;
6114 atom->max = max;
6115 /*
6116 * associate a counter to the transition.
6117 */
6118 counter = xmlRegGetCounter(am);
6119 if (counter < 0)
6120 goto error;
6121 am->counters[counter].min = 1;
6122 am->counters[counter].max = 1;
6123
6124 /* xmlFAGenerateTransitions(am, from, to, atom); */
6125 if (to == NULL) {
6126 to = xmlRegStatePush(am);
6127 if (to == NULL)
6128 goto error;
6129 }
6130 xmlRegStateAddTrans(am, from, atom, to, counter, -1);
6131 if (xmlRegAtomPush(am, atom) < 0)
6132 goto error;
6133 am->state = to;
6134 return(to);
6135
6136error:
6137 xmlRegFreeAtom(atom);
6138 return(NULL);
6139}
6140
6141/**
6142 * xmlAutomataNewState:
6143 * @am: an automata
6144 *
6145 * Create a new disconnected state in the automata
6146 *
6147 * Returns the new state or NULL in case of error
6148 */
6149xmlAutomataStatePtr
6150xmlAutomataNewState(xmlAutomataPtr am) {
6151 if (am == NULL)
6152 return(NULL);
6153 return(xmlRegStatePush(am));
6154}
6155
6156/**
6157 * xmlAutomataNewEpsilon:
6158 * @am: an automata
6159 * @from: the starting point of the transition
6160 * @to: the target point of the transition or NULL
6161 *
6162 * If @to is NULL, this creates first a new target state in the automata
6163 * and then adds an epsilon transition from the @from state to the
6164 * target state
6165 *
6166 * Returns the target state or NULL in case of error
6167 */
6168xmlAutomataStatePtr
6169xmlAutomataNewEpsilon(xmlAutomataPtr am, xmlAutomataStatePtr from,
6170 xmlAutomataStatePtr to) {
6171 if ((am == NULL) || (from == NULL))
6172 return(NULL);
6173 xmlFAGenerateEpsilonTransition(am, from, to);
6174 if (to == NULL)
6175 return(am->state);
6176 return(to);
6177}
6178
6179/**
6180 * xmlAutomataNewAllTrans:
6181 * @am: an automata
6182 * @from: the starting point of the transition
6183 * @to: the target point of the transition or NULL
6184 * @lax: allow to transition if not all all transitions have been activated
6185 *
6186 * If @to is NULL, this creates first a new target state in the automata
6187 * and then adds a an ALL transition from the @from state to the
6188 * target state. That transition is an epsilon transition allowed only when
6189 * all transitions from the @from node have been activated.
6190 *
6191 * Returns the target state or NULL in case of error
6192 */
6193xmlAutomataStatePtr
6194xmlAutomataNewAllTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6195 xmlAutomataStatePtr to, int lax) {
6196 if ((am == NULL) || (from == NULL))
6197 return(NULL);
6198 xmlFAGenerateAllTransition(am, from, to, lax);
6199 if (to == NULL)
6200 return(am->state);
6201 return(to);
6202}
6203
6204/**
6205 * xmlAutomataNewCounter:
6206 * @am: an automata
6207 * @min: the minimal value on the counter
6208 * @max: the maximal value on the counter
6209 *
6210 * Create a new counter
6211 *
6212 * Returns the counter number or -1 in case of error
6213 */
6214int
6215xmlAutomataNewCounter(xmlAutomataPtr am, int min, int max) {
6216 int ret;
6217
6218 if (am == NULL)
6219 return(-1);
6220
6221 ret = xmlRegGetCounter(am);
6222 if (ret < 0)
6223 return(-1);
6224 am->counters[ret].min = min;
6225 am->counters[ret].max = max;
6226 return(ret);
6227}
6228
6229/**
6230 * xmlAutomataNewCountedTrans:
6231 * @am: an automata
6232 * @from: the starting point of the transition
6233 * @to: the target point of the transition or NULL
6234 * @counter: the counter associated to that transition
6235 *
6236 * If @to is NULL, this creates first a new target state in the automata
6237 * and then adds an epsilon transition from the @from state to the target state
6238 * which will increment the counter provided
6239 *
6240 * Returns the target state or NULL in case of error
6241 */
6242xmlAutomataStatePtr
6243xmlAutomataNewCountedTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6244 xmlAutomataStatePtr to, int counter) {
6245 if ((am == NULL) || (from == NULL) || (counter < 0))
6246 return(NULL);
6247 xmlFAGenerateCountedEpsilonTransition(am, from, to, counter);
6248 if (to == NULL)
6249 return(am->state);
6250 return(to);
6251}
6252
6253/**
6254 * xmlAutomataNewCounterTrans:
6255 * @am: an automata
6256 * @from: the starting point of the transition
6257 * @to: the target point of the transition or NULL
6258 * @counter: the counter associated to that transition
6259 *
6260 * If @to is NULL, this creates first a new target state in the automata
6261 * and then adds an epsilon transition from the @from state to the target state
6262 * which will be allowed only if the counter is within the right range.
6263 *
6264 * Returns the target state or NULL in case of error
6265 */
6266xmlAutomataStatePtr
6267xmlAutomataNewCounterTrans(xmlAutomataPtr am, xmlAutomataStatePtr from,
6268 xmlAutomataStatePtr to, int counter) {
6269 if ((am == NULL) || (from == NULL) || (counter < 0))
6270 return(NULL);
6271 xmlFAGenerateCountedTransition(am, from, to, counter);
6272 if (to == NULL)
6273 return(am->state);
6274 return(to);
6275}
6276
6277/**
6278 * xmlAutomataCompile:
6279 * @am: an automata
6280 *
6281 * Compile the automata into a Reg Exp ready for being executed.
6282 * The automata should be free after this point.
6283 *
6284 * Returns the compiled regexp or NULL in case of error
6285 */
6286xmlRegexpPtr
6287xmlAutomataCompile(xmlAutomataPtr am) {
6288 xmlRegexpPtr ret;
6289
6290 if ((am == NULL) || (am->error != 0)) return(NULL);
6291 xmlFAEliminateEpsilonTransitions(am);
6292 if (am->error != 0)
6293 return(NULL);
6294 /* xmlFAComputesDeterminism(am); */
6295 ret = xmlRegEpxFromParse(am);
6296
6297 return(ret);
6298}
6299
6300/**
6301 * xmlAutomataIsDeterminist:
6302 * @am: an automata
6303 *
6304 * Checks if an automata is determinist.
6305 *
6306 * Returns 1 if true, 0 if not, and -1 in case of error
6307 */
6308int
6309xmlAutomataIsDeterminist(xmlAutomataPtr am) {
6310 int ret;
6311
6312 if (am == NULL)
6313 return(-1);
6314
6315 ret = xmlFAComputesDeterminism(am);
6316 return(ret);
6317}
6318#endif /* LIBXML_AUTOMATA_ENABLED */
6319
6320#ifdef LIBXML_EXPR_ENABLED
6321/************************************************************************
6322 * *
6323 * Formal Expression handling code *
6324 * *
6325 ************************************************************************/
6326/************************************************************************
6327 * *
6328 * Expression handling context *
6329 * *
6330 ************************************************************************/
6331
6332struct _xmlExpCtxt {
6333 xmlDictPtr dict;
6334 xmlExpNodePtr *table;
6335 int size;
6336 int nbElems;
6337 int nb_nodes;
6338 int maxNodes;
6339 const char *expr;
6340 const char *cur;
6341 int nb_cons;
6342 int tabSize;
6343};
6344
6345/**
6346 * xmlExpNewCtxt:
6347 * @maxNodes: the maximum number of nodes
6348 * @dict: optional dictionary to use internally
6349 *
6350 * Creates a new context for manipulating expressions
6351 *
6352 * Returns the context or NULL in case of error
6353 */
6354xmlExpCtxtPtr
6355xmlExpNewCtxt(int maxNodes, xmlDictPtr dict) {
6356 xmlExpCtxtPtr ret;
6357 int size = 256;
6358
6359 if (maxNodes <= 4096)
6360 maxNodes = 4096;
6361
6362 ret = (xmlExpCtxtPtr) xmlMalloc(sizeof(xmlExpCtxt));
6363 if (ret == NULL)
6364 return(NULL);
6365 memset(ret, 0, sizeof(xmlExpCtxt));
6366 ret->size = size;
6367 ret->nbElems = 0;
6368 ret->maxNodes = maxNodes;
6369 ret->table = xmlMalloc(size * sizeof(xmlExpNodePtr));
6370 if (ret->table == NULL) {
6371 xmlFree(ret);
6372 return(NULL);
6373 }
6374 memset(ret->table, 0, size * sizeof(xmlExpNodePtr));
6375 if (dict == NULL) {
6376 ret->dict = xmlDictCreate();
6377 if (ret->dict == NULL) {
6378 xmlFree(ret->table);
6379 xmlFree(ret);
6380 return(NULL);
6381 }
6382 } else {
6383 ret->dict = dict;
6384 xmlDictReference(ret->dict);
6385 }
6386 return(ret);
6387}
6388
6389/**
6390 * xmlExpFreeCtxt:
6391 * @ctxt: an expression context
6392 *
6393 * Free an expression context
6394 */
6395void
6396xmlExpFreeCtxt(xmlExpCtxtPtr ctxt) {
6397 if (ctxt == NULL)
6398 return;
6399 xmlDictFree(ctxt->dict);
6400 if (ctxt->table != NULL)
6401 xmlFree(ctxt->table);
6402 xmlFree(ctxt);
6403}
6404
6405/************************************************************************
6406 * *
6407 * Structure associated to an expression node *
6408 * *
6409 ************************************************************************/
6410#define MAX_NODES 10000
6411
6412/*
6413 * TODO:
6414 * - Wildcards
6415 * - public API for creation
6416 *
6417 * Started
6418 * - regression testing
6419 *
6420 * Done
6421 * - split into module and test tool
6422 * - memleaks
6423 */
6424
6425typedef enum {
6426 XML_EXP_NILABLE = (1 << 0)
6427} xmlExpNodeInfo;
6428
6429#define IS_NILLABLE(node) ((node)->info & XML_EXP_NILABLE)
6430
6431struct _xmlExpNode {
6432 unsigned char type;/* xmlExpNodeType */
6433 unsigned char info;/* OR of xmlExpNodeInfo */
6434 unsigned short key; /* the hash key */
6435 unsigned int ref; /* The number of references */
6436 int c_max; /* the maximum length it can consume */
6437 xmlExpNodePtr exp_left;
6438 xmlExpNodePtr next;/* the next node in the hash table or free list */
6439 union {
6440 struct {
6441 int f_min;
6442 int f_max;
6443 } count;
6444 struct {
6445 xmlExpNodePtr f_right;
6446 } children;
6447 const xmlChar *f_str;
6448 } field;
6449};
6450
6451#define exp_min field.count.f_min
6452#define exp_max field.count.f_max
6453/* #define exp_left field.children.f_left */
6454#define exp_right field.children.f_right
6455#define exp_str field.f_str
6456
6457static xmlExpNodePtr xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type);
6458static xmlExpNode forbiddenExpNode = {
6459 XML_EXP_FORBID, 0, 0, 0, 0, NULL, NULL, {{ 0, 0}}
6460};
6461xmlExpNodePtr forbiddenExp = &forbiddenExpNode;
6462static xmlExpNode emptyExpNode = {
6463 XML_EXP_EMPTY, 1, 0, 0, 0, NULL, NULL, {{ 0, 0}}
6464};
6465xmlExpNodePtr emptyExp = &emptyExpNode;
6466
6467/************************************************************************
6468 * *
6469 * The custom hash table for unicity and canonicalization *
6470 * of sub-expressions pointers *
6471 * *
6472 ************************************************************************/
6473/*
6474 * xmlExpHashNameComputeKey:
6475 * Calculate the hash key for a token
6476 */
6477static unsigned short
6478xmlExpHashNameComputeKey(const xmlChar *name) {
6479 unsigned short value = 0L;
6480 char ch;
6481
6482 if (name != NULL) {
6483 value += 30 * (*name);
6484 while ((ch = *name++) != 0) {
6485 value = value ^ ((value << 5) + (value >> 3) + (unsigned long)ch);
6486 }
6487 }
6488 return (value);
6489}
6490
6491/*
6492 * xmlExpHashComputeKey:
6493 * Calculate the hash key for a compound expression
6494 */
6495static unsigned short
6496xmlExpHashComputeKey(xmlExpNodeType type, xmlExpNodePtr left,
6497 xmlExpNodePtr right) {
6498 unsigned long value;
6499 unsigned short ret;
6500
6501 switch (type) {
6502 case XML_EXP_SEQ:
6503 value = left->key;
6504 value += right->key;
6505 value *= 3;
6506 ret = (unsigned short) value;
6507 break;
6508 case XML_EXP_OR:
6509 value = left->key;
6510 value += right->key;
6511 value *= 7;
6512 ret = (unsigned short) value;
6513 break;
6514 case XML_EXP_COUNT:
6515 value = left->key;
6516 value += right->key;
6517 ret = (unsigned short) value;
6518 break;
6519 default:
6520 ret = 0;
6521 }
6522 return(ret);
6523}
6524
6525
6526static xmlExpNodePtr
6527xmlExpNewNode(xmlExpCtxtPtr ctxt, xmlExpNodeType type) {
6528 xmlExpNodePtr ret;
6529
6530 if (ctxt->nb_nodes >= MAX_NODES)
6531 return(NULL);
6532 ret = (xmlExpNodePtr) xmlMalloc(sizeof(xmlExpNode));
6533 if (ret == NULL)
6534 return(NULL);
6535 memset(ret, 0, sizeof(xmlExpNode));
6536 ret->type = type;
6537 ret->next = NULL;
6538 ctxt->nb_nodes++;
6539 ctxt->nb_cons++;
6540 return(ret);
6541}
6542
6543/**
6544 * xmlExpHashGetEntry:
6545 * @table: the hash table
6546 *
6547 * Get the unique entry from the hash table. The entry is created if
6548 * needed. @left and @right are consumed, i.e. their ref count will
6549 * be decremented by the operation.
6550 *
6551 * Returns the pointer or NULL in case of error
6552 */
6553static xmlExpNodePtr
6554xmlExpHashGetEntry(xmlExpCtxtPtr ctxt, xmlExpNodeType type,
6555 xmlExpNodePtr left, xmlExpNodePtr right,
6556 const xmlChar *name, int min, int max) {
6557 unsigned short kbase, key;
6558 xmlExpNodePtr entry;
6559 xmlExpNodePtr insert;
6560
6561 if (ctxt == NULL)
6562 return(NULL);
6563
6564 /*
6565 * Check for duplicate and insertion location.
6566 */
6567 if (type == XML_EXP_ATOM) {
6568 kbase = xmlExpHashNameComputeKey(name);
6569 } else if (type == XML_EXP_COUNT) {
6570 /* COUNT reduction rule 1 */
6571 /* a{1} -> a */
6572 if (min == max) {
6573 if (min == 1) {
6574 return(left);
6575 }
6576 if (min == 0) {
6577 xmlExpFree(ctxt, left);
6578 return(emptyExp);
6579 }
6580 }
6581 if (min < 0) {
6582 xmlExpFree(ctxt, left);
6583 return(forbiddenExp);
6584 }
6585 if (max == -1)
6586 kbase = min + 79;
6587 else
6588 kbase = max - min;
6589 kbase += left->key;
6590 } else if (type == XML_EXP_OR) {
6591 /* Forbid reduction rules */
6592 if (left->type == XML_EXP_FORBID) {
6593 xmlExpFree(ctxt, left);
6594 return(right);
6595 }
6596 if (right->type == XML_EXP_FORBID) {
6597 xmlExpFree(ctxt, right);
6598 return(left);
6599 }
6600
6601 /* OR reduction rule 1 */
6602 /* a | a reduced to a */
6603 if (left == right) {
6604 xmlExpFree(ctxt, right);
6605 return(left);
6606 }
6607 /* OR canonicalization rule 1 */
6608 /* linearize (a | b) | c into a | (b | c) */
6609 if ((left->type == XML_EXP_OR) && (right->type != XML_EXP_OR)) {
6610 xmlExpNodePtr tmp = left;
6611 left = right;
6612 right = tmp;
6613 }
6614 /* OR reduction rule 2 */
6615 /* a | (a | b) and b | (a | b) are reduced to a | b */
6616 if (right->type == XML_EXP_OR) {
6617 if ((left == right->exp_left) ||
6618 (left == right->exp_right)) {
6619 xmlExpFree(ctxt, left);
6620 return(right);
6621 }
6622 }
6623 /* OR canonicalization rule 2 */
6624 /* linearize (a | b) | c into a | (b | c) */
6625 if (left->type == XML_EXP_OR) {
6626 xmlExpNodePtr tmp;
6627
6628 /* OR canonicalization rule 2 */
6629 if ((left->exp_right->type != XML_EXP_OR) &&
6630 (left->exp_right->key < left->exp_left->key)) {
6631 tmp = left->exp_right;
6632 left->exp_right = left->exp_left;
6633 left->exp_left = tmp;
6634 }
6635 left->exp_right->ref++;
6636 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_right, right,
6637 NULL, 0, 0);
6638 left->exp_left->ref++;
6639 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left->exp_left, tmp,
6640 NULL, 0, 0);
6641
6642 xmlExpFree(ctxt, left);
6643 return(tmp);
6644 }
6645 if (right->type == XML_EXP_OR) {
6646 /* Ordering in the tree */
6647 /* C | (A | B) -> A | (B | C) */
6648 if (left->key > right->exp_right->key) {
6649 xmlExpNodePtr tmp;
6650 right->exp_right->ref++;
6651 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_right,
6652 left, NULL, 0, 0);
6653 right->exp_left->ref++;
6654 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left,
6655 tmp, NULL, 0, 0);
6656 xmlExpFree(ctxt, right);
6657 return(tmp);
6658 }
6659 /* Ordering in the tree */
6660 /* B | (A | C) -> A | (B | C) */
6661 if (left->key > right->exp_left->key) {
6662 xmlExpNodePtr tmp;
6663 right->exp_right->ref++;
6664 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, left,
6665 right->exp_right, NULL, 0, 0);
6666 right->exp_left->ref++;
6667 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_OR, right->exp_left,
6668 tmp, NULL, 0, 0);
6669 xmlExpFree(ctxt, right);
6670 return(tmp);
6671 }
6672 }
6673 /* we know both types are != XML_EXP_OR here */
6674 else if (left->key > right->key) {
6675 xmlExpNodePtr tmp = left;
6676 left = right;
6677 right = tmp;
6678 }
6679 kbase = xmlExpHashComputeKey(type, left, right);
6680 } else if (type == XML_EXP_SEQ) {
6681 /* Forbid reduction rules */
6682 if (left->type == XML_EXP_FORBID) {
6683 xmlExpFree(ctxt, right);
6684 return(left);
6685 }
6686 if (right->type == XML_EXP_FORBID) {
6687 xmlExpFree(ctxt, left);
6688 return(right);
6689 }
6690 /* Empty reduction rules */
6691 if (right->type == XML_EXP_EMPTY) {
6692 return(left);
6693 }
6694 if (left->type == XML_EXP_EMPTY) {
6695 return(right);
6696 }
6697 kbase = xmlExpHashComputeKey(type, left, right);
6698 } else
6699 return(NULL);
6700
6701 key = kbase % ctxt->size;
6702 if (ctxt->table[key] != NULL) {
6703 for (insert = ctxt->table[key]; insert != NULL;
6704 insert = insert->next) {
6705 if ((insert->key == kbase) &&
6706 (insert->type == type)) {
6707 if (type == XML_EXP_ATOM) {
6708 if (name == insert->exp_str) {
6709 insert->ref++;
6710 return(insert);
6711 }
6712 } else if (type == XML_EXP_COUNT) {
6713 if ((insert->exp_min == min) && (insert->exp_max == max) &&
6714 (insert->exp_left == left)) {
6715 insert->ref++;
6716 left->ref--;
6717 return(insert);
6718 }
6719 } else if ((insert->exp_left == left) &&
6720 (insert->exp_right == right)) {
6721 insert->ref++;
6722 left->ref--;
6723 right->ref--;
6724 return(insert);
6725 }
6726 }
6727 }
6728 }
6729
6730 entry = xmlExpNewNode(ctxt, type);
6731 if (entry == NULL)
6732 return(NULL);
6733 entry->key = kbase;
6734 if (type == XML_EXP_ATOM) {
6735 entry->exp_str = name;
6736 entry->c_max = 1;
6737 } else if (type == XML_EXP_COUNT) {
6738 entry->exp_min = min;
6739 entry->exp_max = max;
6740 entry->exp_left = left;
6741 if ((min == 0) || (IS_NILLABLE(left)))
6742 entry->info |= XML_EXP_NILABLE;
6743 if (max < 0)
6744 entry->c_max = -1;
6745 else
6746 entry->c_max = max * entry->exp_left->c_max;
6747 } else {
6748 entry->exp_left = left;
6749 entry->exp_right = right;
6750 if (type == XML_EXP_OR) {
6751 if ((IS_NILLABLE(left)) || (IS_NILLABLE(right)))
6752 entry->info |= XML_EXP_NILABLE;
6753 if ((entry->exp_left->c_max == -1) ||
6754 (entry->exp_right->c_max == -1))
6755 entry->c_max = -1;
6756 else if (entry->exp_left->c_max > entry->exp_right->c_max)
6757 entry->c_max = entry->exp_left->c_max;
6758 else
6759 entry->c_max = entry->exp_right->c_max;
6760 } else {
6761 if ((IS_NILLABLE(left)) && (IS_NILLABLE(right)))
6762 entry->info |= XML_EXP_NILABLE;
6763 if ((entry->exp_left->c_max == -1) ||
6764 (entry->exp_right->c_max == -1))
6765 entry->c_max = -1;
6766 else
6767 entry->c_max = entry->exp_left->c_max + entry->exp_right->c_max;
6768 }
6769 }
6770 entry->ref = 1;
6771 if (ctxt->table[key] != NULL)
6772 entry->next = ctxt->table[key];
6773
6774 ctxt->table[key] = entry;
6775 ctxt->nbElems++;
6776
6777 return(entry);
6778}
6779
6780/**
6781 * xmlExpFree:
6782 * @ctxt: the expression context
6783 * @exp: the expression
6784 *
6785 * Dereference the expression
6786 */
6787void
6788xmlExpFree(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp) {
6789 if ((exp == NULL) || (exp == forbiddenExp) || (exp == emptyExp))
6790 return;
6791 exp->ref--;
6792 if (exp->ref == 0) {
6793 unsigned short key;
6794
6795 /* Unlink it first from the hash table */
6796 key = exp->key % ctxt->size;
6797 if (ctxt->table[key] == exp) {
6798 ctxt->table[key] = exp->next;
6799 } else {
6800 xmlExpNodePtr tmp;
6801
6802 tmp = ctxt->table[key];
6803 while (tmp != NULL) {
6804 if (tmp->next == exp) {
6805 tmp->next = exp->next;
6806 break;
6807 }
6808 tmp = tmp->next;
6809 }
6810 }
6811
6812 if ((exp->type == XML_EXP_SEQ) || (exp->type == XML_EXP_OR)) {
6813 xmlExpFree(ctxt, exp->exp_left);
6814 xmlExpFree(ctxt, exp->exp_right);
6815 } else if (exp->type == XML_EXP_COUNT) {
6816 xmlExpFree(ctxt, exp->exp_left);
6817 }
6818 xmlFree(exp);
6819 ctxt->nb_nodes--;
6820 }
6821}
6822
6823/**
6824 * xmlExpRef:
6825 * @exp: the expression
6826 *
6827 * Increase the reference count of the expression
6828 */
6829void
6830xmlExpRef(xmlExpNodePtr exp) {
6831 if (exp != NULL)
6832 exp->ref++;
6833}
6834
6835/**
6836 * xmlExpNewAtom:
6837 * @ctxt: the expression context
6838 * @name: the atom name
6839 * @len: the atom name length in byte (or -1);
6840 *
6841 * Get the atom associated to this name from that context
6842 *
6843 * Returns the node or NULL in case of error
6844 */
6845xmlExpNodePtr
6846xmlExpNewAtom(xmlExpCtxtPtr ctxt, const xmlChar *name, int len) {
6847 if ((ctxt == NULL) || (name == NULL))
6848 return(NULL);
6849 name = xmlDictLookup(ctxt->dict, name, len);
6850 if (name == NULL)
6851 return(NULL);
6852 return(xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, name, 0, 0));
6853}
6854
6855/**
6856 * xmlExpNewOr:
6857 * @ctxt: the expression context
6858 * @left: left expression
6859 * @right: right expression
6860 *
6861 * Get the atom associated to the choice @left | @right
6862 * Note that @left and @right are consumed in the operation, to keep
6863 * an handle on them use xmlExpRef() and use xmlExpFree() to release them,
6864 * this is true even in case of failure (unless ctxt == NULL).
6865 *
6866 * Returns the node or NULL in case of error
6867 */
6868xmlExpNodePtr
6869xmlExpNewOr(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) {
6870 if (ctxt == NULL)
6871 return(NULL);
6872 if ((left == NULL) || (right == NULL)) {
6873 xmlExpFree(ctxt, left);
6874 xmlExpFree(ctxt, right);
6875 return(NULL);
6876 }
6877 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, left, right, NULL, 0, 0));
6878}
6879
6880/**
6881 * xmlExpNewSeq:
6882 * @ctxt: the expression context
6883 * @left: left expression
6884 * @right: right expression
6885 *
6886 * Get the atom associated to the sequence @left , @right
6887 * Note that @left and @right are consumed in the operation, to keep
6888 * an handle on them use xmlExpRef() and use xmlExpFree() to release them,
6889 * this is true even in case of failure (unless ctxt == NULL).
6890 *
6891 * Returns the node or NULL in case of error
6892 */
6893xmlExpNodePtr
6894xmlExpNewSeq(xmlExpCtxtPtr ctxt, xmlExpNodePtr left, xmlExpNodePtr right) {
6895 if (ctxt == NULL)
6896 return(NULL);
6897 if ((left == NULL) || (right == NULL)) {
6898 xmlExpFree(ctxt, left);
6899 xmlExpFree(ctxt, right);
6900 return(NULL);
6901 }
6902 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, left, right, NULL, 0, 0));
6903}
6904
6905/**
6906 * xmlExpNewRange:
6907 * @ctxt: the expression context
6908 * @subset: the expression to be repeated
6909 * @min: the lower bound for the repetition
6910 * @max: the upper bound for the repetition, -1 means infinite
6911 *
6912 * Get the atom associated to the range (@subset){@min, @max}
6913 * Note that @subset is consumed in the operation, to keep
6914 * an handle on it use xmlExpRef() and use xmlExpFree() to release it,
6915 * this is true even in case of failure (unless ctxt == NULL).
6916 *
6917 * Returns the node or NULL in case of error
6918 */
6919xmlExpNodePtr
6920xmlExpNewRange(xmlExpCtxtPtr ctxt, xmlExpNodePtr subset, int min, int max) {
6921 if (ctxt == NULL)
6922 return(NULL);
6923 if ((subset == NULL) || (min < 0) || (max < -1) ||
6924 ((max >= 0) && (min > max))) {
6925 xmlExpFree(ctxt, subset);
6926 return(NULL);
6927 }
6928 return(xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, subset,
6929 NULL, NULL, min, max));
6930}
6931
6932/************************************************************************
6933 * *
6934 * Public API for operations on expressions *
6935 * *
6936 ************************************************************************/
6937
6938static int
6939xmlExpGetLanguageInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
6940 const xmlChar**list, int len, int nb) {
6941 int tmp, tmp2;
6942tail:
6943 switch (exp->type) {
6944 case XML_EXP_EMPTY:
6945 return(0);
6946 case XML_EXP_ATOM:
6947 for (tmp = 0;tmp < nb;tmp++)
6948 if (list[tmp] == exp->exp_str)
6949 return(0);
6950 if (nb >= len)
6951 return(-2);
6952 list[nb] = exp->exp_str;
6953 return(1);
6954 case XML_EXP_COUNT:
6955 exp = exp->exp_left;
6956 goto tail;
6957 case XML_EXP_SEQ:
6958 case XML_EXP_OR:
6959 tmp = xmlExpGetLanguageInt(ctxt, exp->exp_left, list, len, nb);
6960 if (tmp < 0)
6961 return(tmp);
6962 tmp2 = xmlExpGetLanguageInt(ctxt, exp->exp_right, list, len,
6963 nb + tmp);
6964 if (tmp2 < 0)
6965 return(tmp2);
6966 return(tmp + tmp2);
6967 }
6968 return(-1);
6969}
6970
6971/**
6972 * xmlExpGetLanguage:
6973 * @ctxt: the expression context
6974 * @exp: the expression
6975 * @langList: where to store the tokens
6976 * @len: the allocated length of @list
6977 *
6978 * Find all the strings used in @exp and store them in @list
6979 *
6980 * Returns the number of unique strings found, -1 in case of errors and
6981 * -2 if there is more than @len strings
6982 */
6983int
6984xmlExpGetLanguage(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
6985 const xmlChar**langList, int len) {
6986 if ((ctxt == NULL) || (exp == NULL) || (langList == NULL) || (len <= 0))
6987 return(-1);
6988 return(xmlExpGetLanguageInt(ctxt, exp, langList, len, 0));
6989}
6990
6991static int
6992xmlExpGetStartInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
6993 const xmlChar**list, int len, int nb) {
6994 int tmp, tmp2;
6995tail:
6996 switch (exp->type) {
6997 case XML_EXP_FORBID:
6998 return(0);
6999 case XML_EXP_EMPTY:
7000 return(0);
7001 case XML_EXP_ATOM:
7002 for (tmp = 0;tmp < nb;tmp++)
7003 if (list[tmp] == exp->exp_str)
7004 return(0);
7005 if (nb >= len)
7006 return(-2);
7007 list[nb] = exp->exp_str;
7008 return(1);
7009 case XML_EXP_COUNT:
7010 exp = exp->exp_left;
7011 goto tail;
7012 case XML_EXP_SEQ:
7013 tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb);
7014 if (tmp < 0)
7015 return(tmp);
7016 if (IS_NILLABLE(exp->exp_left)) {
7017 tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len,
7018 nb + tmp);
7019 if (tmp2 < 0)
7020 return(tmp2);
7021 tmp += tmp2;
7022 }
7023 return(tmp);
7024 case XML_EXP_OR:
7025 tmp = xmlExpGetStartInt(ctxt, exp->exp_left, list, len, nb);
7026 if (tmp < 0)
7027 return(tmp);
7028 tmp2 = xmlExpGetStartInt(ctxt, exp->exp_right, list, len,
7029 nb + tmp);
7030 if (tmp2 < 0)
7031 return(tmp2);
7032 return(tmp + tmp2);
7033 }
7034 return(-1);
7035}
7036
7037/**
7038 * xmlExpGetStart:
7039 * @ctxt: the expression context
7040 * @exp: the expression
7041 * @tokList: where to store the tokens
7042 * @len: the allocated length of @list
7043 *
7044 * Find all the strings that appears at the start of the languages
7045 * accepted by @exp and store them in @list. E.g. for (a, b) | c
7046 * it will return the list [a, c]
7047 *
7048 * Returns the number of unique strings found, -1 in case of errors and
7049 * -2 if there is more than @len strings
7050 */
7051int
7052xmlExpGetStart(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7053 const xmlChar**tokList, int len) {
7054 if ((ctxt == NULL) || (exp == NULL) || (tokList == NULL) || (len <= 0))
7055 return(-1);
7056 return(xmlExpGetStartInt(ctxt, exp, tokList, len, 0));
7057}
7058
7059/**
7060 * xmlExpIsNillable:
7061 * @exp: the expression
7062 *
7063 * Finds if the expression is nillable, i.e. if it accepts the empty sequence
7064 *
7065 * Returns 1 if nillable, 0 if not and -1 in case of error
7066 */
7067int
7068xmlExpIsNillable(xmlExpNodePtr exp) {
7069 if (exp == NULL)
7070 return(-1);
7071 return(IS_NILLABLE(exp) != 0);
7072}
7073
7074static xmlExpNodePtr
7075xmlExpStringDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, const xmlChar *str)
7076{
7077 xmlExpNodePtr ret;
7078
7079 switch (exp->type) {
7080 case XML_EXP_EMPTY:
7081 return(forbiddenExp);
7082 case XML_EXP_FORBID:
7083 return(forbiddenExp);
7084 case XML_EXP_ATOM:
7085 if (exp->exp_str == str) {
7086 ret = emptyExp;
7087 } else {
7088 /* TODO wildcards here */
7089 ret = forbiddenExp;
7090 }
7091 return(ret);
7092 case XML_EXP_OR: {
7093 xmlExpNodePtr tmp;
7094
7095 tmp = xmlExpStringDeriveInt(ctxt, exp->exp_left, str);
7096 if (tmp == NULL) {
7097 return(NULL);
7098 }
7099 ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str);
7100 if (ret == NULL) {
7101 xmlExpFree(ctxt, tmp);
7102 return(NULL);
7103 }
7104 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret,
7105 NULL, 0, 0);
7106 return(ret);
7107 }
7108 case XML_EXP_SEQ:
7109 ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str);
7110 if (ret == NULL) {
7111 return(NULL);
7112 } else if (ret == forbiddenExp) {
7113 if (IS_NILLABLE(exp->exp_left)) {
7114 ret = xmlExpStringDeriveInt(ctxt, exp->exp_right, str);
7115 }
7116 } else {
7117 exp->exp_right->ref++;
7118 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, exp->exp_right,
7119 NULL, 0, 0);
7120 }
7121 return(ret);
7122 case XML_EXP_COUNT: {
7123 int min, max;
7124 xmlExpNodePtr tmp;
7125
7126 if (exp->exp_max == 0)
7127 return(forbiddenExp);
7128 ret = xmlExpStringDeriveInt(ctxt, exp->exp_left, str);
7129 if (ret == NULL)
7130 return(NULL);
7131 if (ret == forbiddenExp) {
7132 return(ret);
7133 }
7134 if (exp->exp_max == 1)
7135 return(ret);
7136 if (exp->exp_max < 0) /* unbounded */
7137 max = -1;
7138 else
7139 max = exp->exp_max - 1;
7140 if (exp->exp_min > 0)
7141 min = exp->exp_min - 1;
7142 else
7143 min = 0;
7144 exp->exp_left->ref++;
7145 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left, NULL,
7146 NULL, min, max);
7147 if (ret == emptyExp) {
7148 return(tmp);
7149 }
7150 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, tmp,
7151 NULL, 0, 0));
7152 }
7153 }
7154 return(NULL);
7155}
7156
7157/**
7158 * xmlExpStringDerive:
7159 * @ctxt: the expression context
7160 * @exp: the expression
7161 * @str: the string
7162 * @len: the string len in bytes if available
7163 *
7164 * Do one step of Brzozowski derivation of the expression @exp with
7165 * respect to the input string
7166 *
7167 * Returns the resulting expression or NULL in case of internal error
7168 */
7169xmlExpNodePtr
7170xmlExpStringDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7171 const xmlChar *str, int len) {
7172 const xmlChar *input;
7173
7174 if ((exp == NULL) || (ctxt == NULL) || (str == NULL)) {
7175 return(NULL);
7176 }
7177 /*
7178 * check the string is in the dictionary, if yes use an interned
7179 * copy, otherwise we know it's not an acceptable input
7180 */
7181 input = xmlDictExists(ctxt->dict, str, len);
7182 if (input == NULL) {
7183 return(forbiddenExp);
7184 }
7185 return(xmlExpStringDeriveInt(ctxt, exp, input));
7186}
7187
7188static int
7189xmlExpCheckCard(xmlExpNodePtr exp, xmlExpNodePtr sub) {
7190 int ret = 1;
7191
7192 if (sub->c_max == -1) {
7193 if (exp->c_max != -1)
7194 ret = 0;
7195 } else if ((exp->c_max >= 0) && (exp->c_max < sub->c_max)) {
7196 ret = 0;
7197 }
7198#if 0
7199 if ((IS_NILLABLE(sub)) && (!IS_NILLABLE(exp)))
7200 ret = 0;
7201#endif
7202 return(ret);
7203}
7204
7205static xmlExpNodePtr xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp,
7206 xmlExpNodePtr sub);
7207/**
7208 * xmlExpDivide:
7209 * @ctxt: the expressions context
7210 * @exp: the englobing expression
7211 * @sub: the subexpression
7212 * @mult: the multiple expression
7213 * @remain: the remain from the derivation of the multiple
7214 *
7215 * Check if exp is a multiple of sub, i.e. if there is a finite number n
7216 * so that sub{n} subsume exp
7217 *
7218 * Returns the multiple value if successful, 0 if it is not a multiple
7219 * and -1 in case of internal error.
7220 */
7221
7222static int
7223xmlExpDivide(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub,
7224 xmlExpNodePtr *mult, xmlExpNodePtr *remain) {
7225 int i;
7226 xmlExpNodePtr tmp, tmp2;
7227
7228 if (mult != NULL) *mult = NULL;
7229 if (remain != NULL) *remain = NULL;
7230 if (exp->c_max == -1) return(0);
7231 if (IS_NILLABLE(exp) && (!IS_NILLABLE(sub))) return(0);
7232
7233 for (i = 1;i <= exp->c_max;i++) {
7234 sub->ref++;
7235 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT,
7236 sub, NULL, NULL, i, i);
7237 if (tmp == NULL) {
7238 return(-1);
7239 }
7240 if (!xmlExpCheckCard(tmp, exp)) {
7241 xmlExpFree(ctxt, tmp);
7242 continue;
7243 }
7244 tmp2 = xmlExpExpDeriveInt(ctxt, tmp, exp);
7245 if (tmp2 == NULL) {
7246 xmlExpFree(ctxt, tmp);
7247 return(-1);
7248 }
7249 if ((tmp2 != forbiddenExp) && (IS_NILLABLE(tmp2))) {
7250 if (remain != NULL)
7251 *remain = tmp2;
7252 else
7253 xmlExpFree(ctxt, tmp2);
7254 if (mult != NULL)
7255 *mult = tmp;
7256 else
7257 xmlExpFree(ctxt, tmp);
7258 return(i);
7259 }
7260 xmlExpFree(ctxt, tmp);
7261 xmlExpFree(ctxt, tmp2);
7262 }
7263 return(0);
7264}
7265
7266/**
7267 * xmlExpExpDeriveInt:
7268 * @ctxt: the expressions context
7269 * @exp: the englobing expression
7270 * @sub: the subexpression
7271 *
7272 * Try to do a step of Brzozowski derivation but at a higher level
7273 * the input being a subexpression.
7274 *
7275 * Returns the resulting expression or NULL in case of internal error
7276 */
7277static xmlExpNodePtr
7278xmlExpExpDeriveInt(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
7279 xmlExpNodePtr ret, tmp, tmp2, tmp3;
7280 const xmlChar **tab;
7281 int len, i;
7282
7283 /*
7284 * In case of equality and if the expression can only consume a finite
7285 * amount, then the derivation is empty
7286 */
7287 if ((exp == sub) && (exp->c_max >= 0)) {
7288 return(emptyExp);
7289 }
7290 /*
7291 * decompose sub sequence first
7292 */
7293 if (sub->type == XML_EXP_EMPTY) {
7294 exp->ref++;
7295 return(exp);
7296 }
7297 if (sub->type == XML_EXP_SEQ) {
7298 tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left);
7299 if (tmp == NULL)
7300 return(NULL);
7301 if (tmp == forbiddenExp)
7302 return(tmp);
7303 ret = xmlExpExpDeriveInt(ctxt, tmp, sub->exp_right);
7304 xmlExpFree(ctxt, tmp);
7305 return(ret);
7306 }
7307 if (sub->type == XML_EXP_OR) {
7308 tmp = xmlExpExpDeriveInt(ctxt, exp, sub->exp_left);
7309 if (tmp == forbiddenExp)
7310 return(tmp);
7311 if (tmp == NULL)
7312 return(NULL);
7313 ret = xmlExpExpDeriveInt(ctxt, exp, sub->exp_right);
7314 if ((ret == NULL) || (ret == forbiddenExp)) {
7315 xmlExpFree(ctxt, tmp);
7316 return(ret);
7317 }
7318 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, tmp, ret, NULL, 0, 0));
7319 }
7320 if (!xmlExpCheckCard(exp, sub)) {
7321 return(forbiddenExp);
7322 }
7323 switch (exp->type) {
7324 case XML_EXP_EMPTY:
7325 if (sub == emptyExp)
7326 return(emptyExp);
7327 return(forbiddenExp);
7328 case XML_EXP_FORBID:
7329 return(forbiddenExp);
7330 case XML_EXP_ATOM:
7331 if (sub->type == XML_EXP_ATOM) {
7332 /* TODO: handle wildcards */
7333 if (exp->exp_str == sub->exp_str) {
7334 return(emptyExp);
7335 }
7336 return(forbiddenExp);
7337 }
7338 if ((sub->type == XML_EXP_COUNT) &&
7339 (sub->exp_max == 1) &&
7340 (sub->exp_left->type == XML_EXP_ATOM)) {
7341 /* TODO: handle wildcards */
7342 if (exp->exp_str == sub->exp_left->exp_str) {
7343 return(emptyExp);
7344 }
7345 return(forbiddenExp);
7346 }
7347 return(forbiddenExp);
7348 case XML_EXP_SEQ:
7349 /* try to get the sequence consumed only if possible */
7350 if (xmlExpCheckCard(exp->exp_left, sub)) {
7351 /* See if the sequence can be consumed directly */
7352 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub);
7353 if ((ret != forbiddenExp) && (ret != NULL)) {
7354 /*
7355 * TODO: assumption here that we are determinist
7356 * i.e. we won't get to a nillable exp left
7357 * subset which could be matched by the right
7358 * part too.
7359 * e.g.: (a | b)+,(a | c) and 'a+,a'
7360 */
7361 exp->exp_right->ref++;
7362 return(xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret,
7363 exp->exp_right, NULL, 0, 0));
7364 }
7365 }
7366 /* Try instead to decompose */
7367 if (sub->type == XML_EXP_COUNT) {
7368 int min, max;
7369
7370 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left);
7371 if (ret == NULL)
7372 return(NULL);
7373 if (ret != forbiddenExp) {
7374 if (sub->exp_max < 0)
7375 max = -1;
7376 else
7377 max = sub->exp_max -1;
7378 if (sub->exp_min > 0)
7379 min = sub->exp_min -1;
7380 else
7381 min = 0;
7382 exp->exp_right->ref++;
7383 tmp = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret,
7384 exp->exp_right, NULL, 0, 0);
7385 if (tmp == NULL)
7386 return(NULL);
7387
7388 sub->exp_left->ref++;
7389 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT,
7390 sub->exp_left, NULL, NULL, min, max);
7391 if (tmp2 == NULL) {
7392 xmlExpFree(ctxt, tmp);
7393 return(NULL);
7394 }
7395 ret = xmlExpExpDeriveInt(ctxt, tmp, tmp2);
7396 xmlExpFree(ctxt, tmp);
7397 xmlExpFree(ctxt, tmp2);
7398 return(ret);
7399 }
7400 }
7401 /* we made no progress on structured operations */
7402 break;
7403 case XML_EXP_OR:
7404 ret = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub);
7405 if (ret == NULL)
7406 return(NULL);
7407 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_right, sub);
7408 if (tmp == NULL) {
7409 xmlExpFree(ctxt, ret);
7410 return(NULL);
7411 }
7412 return(xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp, NULL, 0, 0));
7413 case XML_EXP_COUNT: {
7414 int min, max;
7415
7416 if (sub->type == XML_EXP_COUNT) {
7417 /*
7418 * Try to see if the loop is completely subsumed
7419 */
7420 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub->exp_left);
7421 if (tmp == NULL)
7422 return(NULL);
7423 if (tmp == forbiddenExp) {
7424 int mult;
7425
7426 mult = xmlExpDivide(ctxt, sub->exp_left, exp->exp_left,
7427 NULL, &tmp);
7428 if (mult <= 0) {
7429 return(forbiddenExp);
7430 }
7431 if (sub->exp_max == -1) {
7432 max = -1;
7433 if (exp->exp_max == -1) {
7434 if (exp->exp_min <= sub->exp_min * mult)
7435 min = 0;
7436 else
7437 min = exp->exp_min - sub->exp_min * mult;
7438 } else {
7439 xmlExpFree(ctxt, tmp);
7440 return(forbiddenExp);
7441 }
7442 } else {
7443 if (exp->exp_max == -1) {
7444 if (exp->exp_min > sub->exp_min * mult) {
7445 max = -1;
7446 min = exp->exp_min - sub->exp_min * mult;
7447 } else {
7448 max = -1;
7449 min = 0;
7450 }
7451 } else {
7452 if (exp->exp_max < sub->exp_max * mult) {
7453 xmlExpFree(ctxt, tmp);
7454 return(forbiddenExp);
7455 }
7456 if (sub->exp_max * mult > exp->exp_min)
7457 min = 0;
7458 else
7459 min = exp->exp_min - sub->exp_max * mult;
7460 max = exp->exp_max - sub->exp_max * mult;
7461 }
7462 }
7463 } else if (!IS_NILLABLE(tmp)) {
7464 /*
7465 * TODO: loop here to try to grow if working on finite
7466 * blocks.
7467 */
7468 xmlExpFree(ctxt, tmp);
7469 return(forbiddenExp);
7470 } else if (sub->exp_max == -1) {
7471 if (exp->exp_max == -1) {
7472 if (exp->exp_min <= sub->exp_min) {
7473 max = -1;
7474 min = 0;
7475 } else {
7476 max = -1;
7477 min = exp->exp_min - sub->exp_min;
7478 }
7479 } else if (exp->exp_min > sub->exp_min) {
7480 xmlExpFree(ctxt, tmp);
7481 return(forbiddenExp);
7482 } else {
7483 max = -1;
7484 min = 0;
7485 }
7486 } else {
7487 if (exp->exp_max == -1) {
7488 if (exp->exp_min > sub->exp_min) {
7489 max = -1;
7490 min = exp->exp_min - sub->exp_min;
7491 } else {
7492 max = -1;
7493 min = 0;
7494 }
7495 } else {
7496 if (exp->exp_max < sub->exp_max) {
7497 xmlExpFree(ctxt, tmp);
7498 return(forbiddenExp);
7499 }
7500 if (sub->exp_max > exp->exp_min)
7501 min = 0;
7502 else
7503 min = exp->exp_min - sub->exp_max;
7504 max = exp->exp_max - sub->exp_max;
7505 }
7506 }
7507 exp->exp_left->ref++;
7508 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left,
7509 NULL, NULL, min, max);
7510 if (tmp2 == NULL) {
7511 return(NULL);
7512 }
7513 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2,
7514 NULL, 0, 0);
7515 return(ret);
7516 }
7517 tmp = xmlExpExpDeriveInt(ctxt, exp->exp_left, sub);
7518 if (tmp == NULL)
7519 return(NULL);
7520 if (tmp == forbiddenExp) {
7521 return(forbiddenExp);
7522 }
7523 if (exp->exp_min > 0)
7524 min = exp->exp_min - 1;
7525 else
7526 min = 0;
7527 if (exp->exp_max < 0)
7528 max = -1;
7529 else
7530 max = exp->exp_max - 1;
7531
7532 exp->exp_left->ref++;
7533 tmp2 = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, exp->exp_left,
7534 NULL, NULL, min, max);
7535 if (tmp2 == NULL)
7536 return(NULL);
7537 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, tmp, tmp2,
7538 NULL, 0, 0);
7539 return(ret);
7540 }
7541 }
7542
7543 if (IS_NILLABLE(sub)) {
7544 if (!(IS_NILLABLE(exp)))
7545 return(forbiddenExp);
7546 else
7547 ret = emptyExp;
7548 } else
7549 ret = NULL;
7550 /*
7551 * here the structured derivation made no progress so
7552 * we use the default token based derivation to force one more step
7553 */
7554 if (ctxt->tabSize == 0)
7555 ctxt->tabSize = 40;
7556
7557 tab = (const xmlChar **) xmlMalloc(ctxt->tabSize *
7558 sizeof(const xmlChar *));
7559 if (tab == NULL) {
7560 return(NULL);
7561 }
7562
7563 /*
7564 * collect all the strings accepted by the subexpression on input
7565 */
7566 len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0);
7567 while (len < 0) {
7568 const xmlChar **temp;
7569 temp = (const xmlChar **) xmlRealloc((xmlChar **) tab, ctxt->tabSize * 2 *
7570 sizeof(const xmlChar *));
7571 if (temp == NULL) {
7572 xmlFree((xmlChar **) tab);
7573 return(NULL);
7574 }
7575 tab = temp;
7576 ctxt->tabSize *= 2;
7577 len = xmlExpGetStartInt(ctxt, sub, tab, ctxt->tabSize, 0);
7578 }
7579 for (i = 0;i < len;i++) {
7580 tmp = xmlExpStringDeriveInt(ctxt, exp, tab[i]);
7581 if ((tmp == NULL) || (tmp == forbiddenExp)) {
7582 xmlExpFree(ctxt, ret);
7583 xmlFree((xmlChar **) tab);
7584 return(tmp);
7585 }
7586 tmp2 = xmlExpStringDeriveInt(ctxt, sub, tab[i]);
7587 if ((tmp2 == NULL) || (tmp2 == forbiddenExp)) {
7588 xmlExpFree(ctxt, tmp);
7589 xmlExpFree(ctxt, ret);
7590 xmlFree((xmlChar **) tab);
7591 return(tmp);
7592 }
7593 tmp3 = xmlExpExpDeriveInt(ctxt, tmp, tmp2);
7594 xmlExpFree(ctxt, tmp);
7595 xmlExpFree(ctxt, tmp2);
7596
7597 if ((tmp3 == NULL) || (tmp3 == forbiddenExp)) {
7598 xmlExpFree(ctxt, ret);
7599 xmlFree((xmlChar **) tab);
7600 return(tmp3);
7601 }
7602
7603 if (ret == NULL)
7604 ret = tmp3;
7605 else {
7606 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, tmp3, NULL, 0, 0);
7607 if (ret == NULL) {
7608 xmlFree((xmlChar **) tab);
7609 return(NULL);
7610 }
7611 }
7612 }
7613 xmlFree((xmlChar **) tab);
7614 return(ret);
7615}
7616
7617/**
7618 * xmlExpExpDerive:
7619 * @ctxt: the expressions context
7620 * @exp: the englobing expression
7621 * @sub: the subexpression
7622 *
7623 * Evaluates the expression resulting from @exp consuming a sub expression @sub
7624 * Based on algebraic derivation and sometimes direct Brzozowski derivation
7625 * it usually takes less than linear time and can handle expressions generating
7626 * infinite languages.
7627 *
7628 * Returns the resulting expression or NULL in case of internal error, the
7629 * result must be freed
7630 */
7631xmlExpNodePtr
7632xmlExpExpDerive(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
7633 if ((exp == NULL) || (ctxt == NULL) || (sub == NULL))
7634 return(NULL);
7635
7636 /*
7637 * O(1) speedups
7638 */
7639 if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) {
7640 return(forbiddenExp);
7641 }
7642 if (xmlExpCheckCard(exp, sub) == 0) {
7643 return(forbiddenExp);
7644 }
7645 return(xmlExpExpDeriveInt(ctxt, exp, sub));
7646}
7647
7648/**
7649 * xmlExpSubsume:
7650 * @ctxt: the expressions context
7651 * @exp: the englobing expression
7652 * @sub: the subexpression
7653 *
7654 * Check whether @exp accepts all the languages accepted by @sub
7655 * the input being a subexpression.
7656 *
7657 * Returns 1 if true 0 if false and -1 in case of failure.
7658 */
7659int
7660xmlExpSubsume(xmlExpCtxtPtr ctxt, xmlExpNodePtr exp, xmlExpNodePtr sub) {
7661 xmlExpNodePtr tmp;
7662
7663 if ((exp == NULL) || (ctxt == NULL) || (sub == NULL))
7664 return(-1);
7665
7666 /*
7667 * TODO: speedup by checking the language of sub is a subset of the
7668 * language of exp
7669 */
7670 /*
7671 * O(1) speedups
7672 */
7673 if (IS_NILLABLE(sub) && (!IS_NILLABLE(exp))) {
7674 return(0);
7675 }
7676 if (xmlExpCheckCard(exp, sub) == 0) {
7677 return(0);
7678 }
7679 tmp = xmlExpExpDeriveInt(ctxt, exp, sub);
7680 if (tmp == NULL)
7681 return(-1);
7682 if (tmp == forbiddenExp)
7683 return(0);
7684 if (tmp == emptyExp)
7685 return(1);
7686 if ((tmp != NULL) && (IS_NILLABLE(tmp))) {
7687 xmlExpFree(ctxt, tmp);
7688 return(1);
7689 }
7690 xmlExpFree(ctxt, tmp);
7691 return(0);
7692}
7693
7694/************************************************************************
7695 * *
7696 * Parsing expression *
7697 * *
7698 ************************************************************************/
7699
7700static xmlExpNodePtr xmlExpParseExpr(xmlExpCtxtPtr ctxt);
7701
7702#undef CUR
7703#define CUR (*ctxt->cur)
7704#undef NEXT
7705#define NEXT ctxt->cur++;
7706#undef IS_BLANK
7707#define IS_BLANK(c) ((c == ' ') || (c == '\n') || (c == '\r') || (c == '\t'))
7708#define SKIP_BLANKS while (IS_BLANK(*ctxt->cur)) ctxt->cur++;
7709
7710static int
7711xmlExpParseNumber(xmlExpCtxtPtr ctxt) {
7712 int ret = 0;
7713
7714 SKIP_BLANKS
7715 if (CUR == '*') {
7716 NEXT
7717 return(-1);
7718 }
7719 if ((CUR < '0') || (CUR > '9'))
7720 return(-1);
7721 while ((CUR >= '0') && (CUR <= '9')) {
7722 ret = ret * 10 + (CUR - '0');
7723 NEXT
7724 }
7725 return(ret);
7726}
7727
7728static xmlExpNodePtr
7729xmlExpParseOr(xmlExpCtxtPtr ctxt) {
7730 const char *base;
7731 xmlExpNodePtr ret;
7732 const xmlChar *val;
7733
7734 SKIP_BLANKS
7735 base = ctxt->cur;
7736 if (*ctxt->cur == '(') {
7737 NEXT
7738 ret = xmlExpParseExpr(ctxt);
7739 SKIP_BLANKS
7740 if (*ctxt->cur != ')') {
7741 fprintf(stderr, "unbalanced '(' : %s\n", base);
7742 xmlExpFree(ctxt, ret);
7743 return(NULL);
7744 }
7745 NEXT;
7746 SKIP_BLANKS
7747 goto parse_quantifier;
7748 }
7749 while ((CUR != 0) && (!(IS_BLANK(CUR))) && (CUR != '(') &&
7750 (CUR != ')') && (CUR != '|') && (CUR != ',') && (CUR != '{') &&
7751 (CUR != '*') && (CUR != '+') && (CUR != '?') && (CUR != '}'))
7752 NEXT;
7753 val = xmlDictLookup(ctxt->dict, BAD_CAST base, ctxt->cur - base);
7754 if (val == NULL)
7755 return(NULL);
7756 ret = xmlExpHashGetEntry(ctxt, XML_EXP_ATOM, NULL, NULL, val, 0, 0);
7757 if (ret == NULL)
7758 return(NULL);
7759 SKIP_BLANKS
7760parse_quantifier:
7761 if (CUR == '{') {
7762 int min, max;
7763
7764 NEXT
7765 min = xmlExpParseNumber(ctxt);
7766 if (min < 0) {
7767 xmlExpFree(ctxt, ret);
7768 return(NULL);
7769 }
7770 SKIP_BLANKS
7771 if (CUR == ',') {
7772 NEXT
7773 max = xmlExpParseNumber(ctxt);
7774 SKIP_BLANKS
7775 } else
7776 max = min;
7777 if (CUR != '}') {
7778 xmlExpFree(ctxt, ret);
7779 return(NULL);
7780 }
7781 NEXT
7782 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
7783 min, max);
7784 SKIP_BLANKS
7785 } else if (CUR == '?') {
7786 NEXT
7787 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
7788 0, 1);
7789 SKIP_BLANKS
7790 } else if (CUR == '+') {
7791 NEXT
7792 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
7793 1, -1);
7794 SKIP_BLANKS
7795 } else if (CUR == '*') {
7796 NEXT
7797 ret = xmlExpHashGetEntry(ctxt, XML_EXP_COUNT, ret, NULL, NULL,
7798 0, -1);
7799 SKIP_BLANKS
7800 }
7801 return(ret);
7802}
7803
7804
7805static xmlExpNodePtr
7806xmlExpParseSeq(xmlExpCtxtPtr ctxt) {
7807 xmlExpNodePtr ret, right;
7808
7809 ret = xmlExpParseOr(ctxt);
7810 SKIP_BLANKS
7811 while (CUR == '|') {
7812 NEXT
7813 right = xmlExpParseOr(ctxt);
7814 if (right == NULL) {
7815 xmlExpFree(ctxt, ret);
7816 return(NULL);
7817 }
7818 ret = xmlExpHashGetEntry(ctxt, XML_EXP_OR, ret, right, NULL, 0, 0);
7819 if (ret == NULL)
7820 return(NULL);
7821 }
7822 return(ret);
7823}
7824
7825static xmlExpNodePtr
7826xmlExpParseExpr(xmlExpCtxtPtr ctxt) {
7827 xmlExpNodePtr ret, right;
7828
7829 ret = xmlExpParseSeq(ctxt);
7830 SKIP_BLANKS
7831 while (CUR == ',') {
7832 NEXT
7833 right = xmlExpParseSeq(ctxt);
7834 if (right == NULL) {
7835 xmlExpFree(ctxt, ret);
7836 return(NULL);
7837 }
7838 ret = xmlExpHashGetEntry(ctxt, XML_EXP_SEQ, ret, right, NULL, 0, 0);
7839 if (ret == NULL)
7840 return(NULL);
7841 }
7842 return(ret);
7843}
7844
7845/**
7846 * xmlExpParse:
7847 * @ctxt: the expressions context
7848 * @expr: the 0 terminated string
7849 *
7850 * Minimal parser for regexps, it understand the following constructs
7851 * - string terminals
7852 * - choice operator |
7853 * - sequence operator ,
7854 * - subexpressions (...)
7855 * - usual cardinality operators + * and ?
7856 * - finite sequences { min, max }
7857 * - infinite sequences { min, * }
7858 * There is minimal checkings made especially no checking on strings values
7859 *
7860 * Returns a new expression or NULL in case of failure
7861 */
7862xmlExpNodePtr
7863xmlExpParse(xmlExpCtxtPtr ctxt, const char *expr) {
7864 xmlExpNodePtr ret;
7865
7866 ctxt->expr = expr;
7867 ctxt->cur = expr;
7868
7869 ret = xmlExpParseExpr(ctxt);
7870 SKIP_BLANKS
7871 if (*ctxt->cur != 0) {
7872 xmlExpFree(ctxt, ret);
7873 return(NULL);
7874 }
7875 return(ret);
7876}
7877
7878static void
7879xmlExpDumpInt(xmlBufferPtr buf, xmlExpNodePtr expr, int glob) {
7880 xmlExpNodePtr c;
7881
7882 if (expr == NULL) return;
7883 if (glob) xmlBufferWriteChar(buf, "(");
7884 switch (expr->type) {
7885 case XML_EXP_EMPTY:
7886 xmlBufferWriteChar(buf, "empty");
7887 break;
7888 case XML_EXP_FORBID:
7889 xmlBufferWriteChar(buf, "forbidden");
7890 break;
7891 case XML_EXP_ATOM:
7892 xmlBufferWriteCHAR(buf, expr->exp_str);
7893 break;
7894 case XML_EXP_SEQ:
7895 c = expr->exp_left;
7896 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
7897 xmlExpDumpInt(buf, c, 1);
7898 else
7899 xmlExpDumpInt(buf, c, 0);
7900 xmlBufferWriteChar(buf, " , ");
7901 c = expr->exp_right;
7902 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
7903 xmlExpDumpInt(buf, c, 1);
7904 else
7905 xmlExpDumpInt(buf, c, 0);
7906 break;
7907 case XML_EXP_OR:
7908 c = expr->exp_left;
7909 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
7910 xmlExpDumpInt(buf, c, 1);
7911 else
7912 xmlExpDumpInt(buf, c, 0);
7913 xmlBufferWriteChar(buf, " | ");
7914 c = expr->exp_right;
7915 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
7916 xmlExpDumpInt(buf, c, 1);
7917 else
7918 xmlExpDumpInt(buf, c, 0);
7919 break;
7920 case XML_EXP_COUNT: {
7921 char rep[40];
7922
7923 c = expr->exp_left;
7924 if ((c->type == XML_EXP_SEQ) || (c->type == XML_EXP_OR))
7925 xmlExpDumpInt(buf, c, 1);
7926 else
7927 xmlExpDumpInt(buf, c, 0);
7928 if ((expr->exp_min == 0) && (expr->exp_max == 1)) {
7929 rep[0] = '?';
7930 rep[1] = 0;
7931 } else if ((expr->exp_min == 0) && (expr->exp_max == -1)) {
7932 rep[0] = '*';
7933 rep[1] = 0;
7934 } else if ((expr->exp_min == 1) && (expr->exp_max == -1)) {
7935 rep[0] = '+';
7936 rep[1] = 0;
7937 } else if (expr->exp_max == expr->exp_min) {
7938 snprintf(rep, 39, "{%d}", expr->exp_min);
7939 } else if (expr->exp_max < 0) {
7940 snprintf(rep, 39, "{%d,inf}", expr->exp_min);
7941 } else {
7942 snprintf(rep, 39, "{%d,%d}", expr->exp_min, expr->exp_max);
7943 }
7944 rep[39] = 0;
7945 xmlBufferWriteChar(buf, rep);
7946 break;
7947 }
7948 default:
7949 fprintf(stderr, "Error in tree\n");
7950 }
7951 if (glob)
7952 xmlBufferWriteChar(buf, ")");
7953}
7954/**
7955 * xmlExpDump:
7956 * @buf: a buffer to receive the output
7957 * @expr: the compiled expression
7958 *
7959 * Serialize the expression as compiled to the buffer
7960 */
7961void
7962xmlExpDump(xmlBufferPtr buf, xmlExpNodePtr expr) {
7963 if ((buf == NULL) || (expr == NULL))
7964 return;
7965 xmlExpDumpInt(buf, expr, 0);
7966}
7967
7968/**
7969 * xmlExpMaxToken:
7970 * @expr: a compiled expression
7971 *
7972 * Indicate the maximum number of input a expression can accept
7973 *
7974 * Returns the maximum length or -1 in case of error
7975 */
7976int
7977xmlExpMaxToken(xmlExpNodePtr expr) {
7978 if (expr == NULL)
7979 return(-1);
7980 return(expr->c_max);
7981}
7982
7983/**
7984 * xmlExpCtxtNbNodes:
7985 * @ctxt: an expression context
7986 *
7987 * Debugging facility provides the number of allocated nodes at a that point
7988 *
7989 * Returns the number of nodes in use or -1 in case of error
7990 */
7991int
7992xmlExpCtxtNbNodes(xmlExpCtxtPtr ctxt) {
7993 if (ctxt == NULL)
7994 return(-1);
7995 return(ctxt->nb_nodes);
7996}
7997
7998/**
7999 * xmlExpCtxtNbCons:
8000 * @ctxt: an expression context
8001 *
8002 * Debugging facility provides the number of allocated nodes over lifetime
8003 *
8004 * Returns the number of nodes ever allocated or -1 in case of error
8005 */
8006int
8007xmlExpCtxtNbCons(xmlExpCtxtPtr ctxt) {
8008 if (ctxt == NULL)
8009 return(-1);
8010 return(ctxt->nb_cons);
8011}
8012
8013#endif /* LIBXML_EXPR_ENABLED */
8014
8015#endif /* LIBXML_REGEXP_ENABLED */
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette