HTMLparser.c@ 62281

Last change on this file since 62281 was 58072, checked in by vboxsync, 9 years ago
libxml 2.9.2 unmodified
Property svn:eol-style set to `native`
File size: 203.0 KB

Line
1	/*
2	* HTMLparser.c : an HTML 4.0 non-verifying parser
3	*
4	* See Copyright for the status of this software.
5	*
6	* daniel@veillard.com
7	*/
8
9	#define IN_LIBXML
10	#include "libxml.h"
11	#ifdef LIBXML_HTML_ENABLED
12
13	#include <string.h>
14	#ifdef HAVE_CTYPE_H
15	#include <ctype.h>
16	#endif
17	#ifdef HAVE_STDLIB_H
18	#include <stdlib.h>
19	#endif
20	#ifdef HAVE_SYS_STAT_H
21	#include <sys/stat.h>
22	#endif
23	#ifdef HAVE_FCNTL_H
24	#include <fcntl.h>
25	#endif
26	#ifdef HAVE_UNISTD_H
27	#include <unistd.h>
28	#endif
29	#ifdef HAVE_ZLIB_H
30	#include <zlib.h>
31	#endif
32
33	#include <libxml/xmlmemory.h>
34	#include <libxml/tree.h>
35	#include <libxml/parser.h>
36	#include <libxml/parserInternals.h>
37	#include <libxml/xmlerror.h>
38	#include <libxml/HTMLparser.h>
39	#include <libxml/HTMLtree.h>
40	#include <libxml/entities.h>
41	#include <libxml/encoding.h>
42	#include <libxml/valid.h>
43	#include <libxml/xmlIO.h>
44	#include <libxml/globals.h>
45	#include <libxml/uri.h>
46
47	#include "buf.h"
48	#include "enc.h"
49
50	#define HTML_MAX_NAMELEN 1000 /* NOTE(review): presumably a cap on parsed name length; no use is visible in this part of the file */
51	#define HTML_PARSER_BIG_BUFFER_SIZE 1000 /* NOTE(review): presumably a working-buffer size; confirm against its users */
52	#define HTML_PARSER_BUFFER_SIZE 100 /* NOTE(review): presumably a working-buffer size; confirm against its users */
53
54	/* #define DEBUG */
55	/* #define DEBUG_PUSH */
56
57	static int htmlOmittedDefaultValue = 1; /* file-local flag, initially 1; its readers are outside this part of the file */
58
59	xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
60	xmlChar end, xmlChar end2, xmlChar end3); /* forward declaration; non-static, defined elsewhere */
61	static void htmlParseComment(htmlParserCtxtPtr ctxt); /* forward declaration of a file-local function */
62
63	/************************************************************************
64	* *
65	* Some factorized error routines *
66	* *
67	************************************************************************/
68
69	/**
70	* htmlErrMemory:
71	* @ctxt: an HTML parser context
72	* @extra: extra informations
73	*
74	* Handle a redefinition of attribute error
75	*/
76	static void
77	htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
78	{
79	if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
80	(ctxt->instate == XML_PARSER_EOF))
81	return;
82	if (ctxt != NULL) {
83	ctxt->errNo = XML_ERR_NO_MEMORY;
84	ctxt->instate = XML_PARSER_EOF;
85	ctxt->disableSAX = 1;
86	}
87	if (extra)
88	__xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
89	XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
90	NULL, NULL, 0, 0,
91	"Memory allocation failed : %s\n", extra);
92	else
93	__xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
94	XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
95	NULL, NULL, 0, 0, "Memory allocation failed\n");
96	}
97
98	/**
99	* htmlParseErr:
100	* @ctxt: an HTML parser context
101	* @error: the error number
102	* @msg: the error message
103	* @str1: string infor
104	* @str2: string infor
105	*
106	* Handle a fatal parser error, i.e. violating Well-Formedness constraints
107	*/
108	static void
109	htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
110	const char msg, const xmlChar str1, const xmlChar *str2)
111	{
112	if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
113	(ctxt->instate == XML_PARSER_EOF))
114	return;
115	if (ctxt != NULL)
116	ctxt->errNo = error;
117	__xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
118	XML_ERR_ERROR, NULL, 0,
119	(const char ) str1, (const char ) str2,
120	NULL, 0, 0,
121	msg, str1, str2);
122	if (ctxt != NULL)
123	ctxt->wellFormed = 0;
124	}
125
126	/**
127	* htmlParseErrInt:
128	* @ctxt: an HTML parser context
129	* @error: the error number
130	* @msg: the error message
131	* @val: integer info
132	*
133	* Handle a fatal parser error, i.e. violating Well-Formedness constraints
134	*/
135	static void
136	htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
137	const char *msg, int val)
138	{
139	if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
140	(ctxt->instate == XML_PARSER_EOF))
141	return;
142	if (ctxt != NULL)
143	ctxt->errNo = error;
144	__xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
145	XML_ERR_ERROR, NULL, 0, NULL, NULL,
146	NULL, val, 0, msg, val);
147	if (ctxt != NULL)
148	ctxt->wellFormed = 0;
149	}
150
151	/************************************************************************
152	* *
153	* Parser stacks related functions and macros *
154	* *
155	************************************************************************/
156
157	/**
158	* htmlnamePush:
159	* @ctxt: an HTML parser context
160	* @value: the element name
161	*
162	* Pushes a new element name on top of the name stack
163	*
164	* Returns 0 in case of error, the index in the stack otherwise
165	*/
166	static int
167	htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
168	{
169	if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
170	ctxt->html = 3;
171	if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
172	ctxt->html = 10;
173	if (ctxt->nameNr >= ctxt->nameMax) {
174	ctxt->nameMax *= 2;
175	ctxt->nameTab = (const xmlChar * *)
176	xmlRealloc((xmlChar * *)ctxt->nameTab,
177	ctxt->nameMax *
178	sizeof(ctxt->nameTab[0]));
179	if (ctxt->nameTab == NULL) {
180	htmlErrMemory(ctxt, NULL);
181	return (0);
182	}
183	}
184	ctxt->nameTab[ctxt->nameNr] = value;
185	ctxt->name = value;
186	return (ctxt->nameNr++);
187	}
188	/**
189	* htmlnamePop:
190	* @ctxt: an HTML parser context
191	*
192	* Pops the top element name from the name stack
193	*
194	* Returns the name just removed
195	*/
196	static const xmlChar *
197	htmlnamePop(htmlParserCtxtPtr ctxt)
198	{
199	const xmlChar *ret;
200
201	if (ctxt->nameNr <= 0)
202	return (NULL);
203	ctxt->nameNr--;
204	if (ctxt->nameNr < 0)
205	return (NULL);
206	if (ctxt->nameNr > 0)
207	ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
208	else
209	ctxt->name = NULL;
210	ret = ctxt->nameTab[ctxt->nameNr];
211	ctxt->nameTab[ctxt->nameNr] = NULL;
212	return (ret);
213	}
214
215	/**
216	* htmlNodeInfoPush:
217	* @ctxt: an HTML parser context
218	* @value: the node info
219	*
220	* Pushes a new element name on top of the node info stack
221	*
222	* Returns 0 in case of error, the index in the stack otherwise
223	*/
224	static int
225	htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
226	{
227	if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
228	if (ctxt->nodeInfoMax == 0)
229	ctxt->nodeInfoMax = 5;
230	ctxt->nodeInfoMax *= 2;
231	ctxt->nodeInfoTab = (htmlParserNodeInfo *)
232	xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
233	ctxt->nodeInfoMax *
234	sizeof(ctxt->nodeInfoTab[0]));
235	if (ctxt->nodeInfoTab == NULL) {
236	htmlErrMemory(ctxt, NULL);
237	return (0);
238	}
239	}
240	ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
241	ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
242	return (ctxt->nodeInfoNr++);
243	}
244
245	/**
246	* htmlNodeInfoPop:
247	* @ctxt: an HTML parser context
248	*
249	* Pops the top element name from the node info stack
250	*
251	* Returns 0 in case of error, the pointer to NodeInfo otherwise
252	*/
253	static htmlParserNodeInfo *
254	htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
255	{
256	if (ctxt->nodeInfoNr <= 0)
257	return (NULL);
258	ctxt->nodeInfoNr--;
259	if (ctxt->nodeInfoNr < 0)
260	return (NULL);
261	if (ctxt->nodeInfoNr > 0)
262	ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
263	else
264	ctxt->nodeInfo = NULL;
265	return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
266	}
267
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported.
 *
 * All of them expect a variable named `ctxt` (the parser context) to be
 * in scope at the point of use.
 *
 * Byte-level macros, meant for comparing against or skipping ASCII text:
 *
 *   CUR_PTR  the current input pointer.
 *   CUR      the byte at the current input pointer, as an int.
 *   CURRENT  same expansion as CUR.
 *   RAW      -1 if ctxt->token is set, otherwise the current byte.
 *   UPPER    the current byte passed through toupper().
 *   NXT(n)   the byte n positions after the current one.
 *   UPP(n)   NXT(n) passed through toupper().
 *   SKIP(n)  advance the input pointer, the column and ctxt->nbChars by n;
 *            the line counter is not touched, so it must not be used
 *            across a newline.
 *
 * Character-level macros:
 *
 *   NEXT        advance through xmlNextChar().
 *   NEXTL(l)    advance by one character that is l bytes long, updating
 *               line/column and clearing ctxt->token.
 *   CUR_CHAR(l) current character through htmlCurrentChar(); its byte
 *               length is stored in l.
 *   CUR_SCHAR(s, l) current character of string s through
 *               xmlStringCurrentChar(); its byte length is stored in l.
 *   COPY_BUF(l,b,i,v) append character v, which is l bytes long, to
 *               buffer b at index i and advance i.
 *
 * Buffer management:
 *
 *   SHRINK   call xmlParserInputShrink() when more than 2 * INPUT_CHUNK
 *            bytes are behind the cursor and fewer than 2 * INPUT_CHUNK
 *            remain ahead of it.
 *   GROW     in non-progressive mode, call xmlParserInputGrow() when
 *            fewer than INPUT_CHUNK bytes remain.
 *   SKIP_BLANKS  skip blanks through htmlSkipBlankChars().
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val)

#define NXT(val) ctxt->input->cur[(val)]

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur

#define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
                   (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
        xmlParserInputShrink(ctxt->input)

#define GROW if ((ctxt->progressive == 0) && \
                 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \
        xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))


#define NEXTL(l) do { \
    if (*(ctxt->input->cur) == '\n') { \
        ctxt->input->line++; ctxt->input->col = 1; \
    } else ctxt->input->col++; \
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++; \
  } while (0)

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)

#define COPY_BUF(l,b,i,v) \
    if (l == 1) b[i++] = (xmlChar) v; \
    else i += xmlCopyChar(l,&b[i],v)
347
348	/**
349	* htmlFindEncoding:
350	* @the HTML parser context
351	*
352	* Ty to find and encoding in the current data available in the input
353	* buffer this is needed to try to switch to the proper encoding when
354	* one face a character error.
355	* That's an heuristic, since it's operating outside of parsing it could
356	* try to use a meta which had been commented out, that's the reason it
357	* should only be used in case of error, not as a default.
358	*
359	* Returns an encoding string or NULL if not found, the string need to
360	* be freed
361	*/
362	static xmlChar *
363	htmlFindEncoding(xmlParserCtxtPtr ctxt) {
364	const xmlChar start, cur, *end;
365
366	if ((ctxt == NULL) \|\| (ctxt->input == NULL) \|\|
367	(ctxt->input->encoding != NULL) \|\| (ctxt->input->buf == NULL) \|\|
368	(ctxt->input->buf->encoder != NULL))
369	return(NULL);
370	if ((ctxt->input->cur == NULL) \|\| (ctxt->input->end == NULL))
371	return(NULL);
372
373	start = ctxt->input->cur;
374	end = ctxt->input->end;
375	/* we also expect the input buffer to be zero terminated */
376	if (*end != 0)
377	return(NULL);
378
379	cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
380	if (cur == NULL)
381	return(NULL);
382	cur = xmlStrcasestr(cur, BAD_CAST "CONTENT");
383	if (cur == NULL)
384	return(NULL);
385	cur = xmlStrcasestr(cur, BAD_CAST "CHARSET=");
386	if (cur == NULL)
387	return(NULL);
388	cur += 8;
389	start = cur;
390	while (((cur >= 'A') && (cur <= 'Z')) \|\|
391	((cur >= 'a') && (cur <= 'z')) \|\|
392	((cur >= '0') && (cur <= '9')) \|\|
393	(cur == '-') \|\| (cur == '_') \|\| (cur == ':') \|\| (cur == '/'))
394	cur++;
395	if (cur == start)
396	return(NULL);
397	return(xmlStrndup(start, cur - start));
398	}
399
400	/**
401	* htmlCurrentChar:
402	* @ctxt: the HTML parser context
403	* @len: pointer to the length of the char read
404	*
405	* The current char value, if using UTF-8 this may actually span multiple
406	* bytes in the input buffer. Implement the end of line normalization:
407	* 2.11 End-of-Line Handling
408	* If the encoding is unspecified, in the case we find an ISO-Latin-1
409	* char, then the encoding converter is plugged in automatically.
410	*
411	* Returns the current char value and its length
412	*/
413
414	static int
415	htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
416	if (ctxt->instate == XML_PARSER_EOF)
417	return(0);
418
419	if (ctxt->token != 0) {
420	*len = 0;
421	return(ctxt->token);
422	}
423	if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
424	/*
425	* We are supposed to handle UTF8, check it's valid
426	* From rfc2044: encoding of the Unicode values on UTF-8:
427	*
428	* UCS-4 range (hex.) UTF-8 octet sequence (binary)
429	* 0000 0000-0000 007F 0xxxxxxx
430	* 0000 0080-0000 07FF 110xxxxx 10xxxxxx
431	* 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
432	*
433	* Check for the 0x110000 limit too
434	*/
435	const unsigned char *cur = ctxt->input->cur;
436	unsigned char c;
437	unsigned int val;
438
439	c = *cur;
440	if (c & 0x80) {
441	if (cur[1] == 0) {
442	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
443	cur = ctxt->input->cur;
444	}
445	if ((cur[1] & 0xc0) != 0x80)
446	goto encoding_error;
447	if ((c & 0xe0) == 0xe0) {
448
449	if (cur[2] == 0) {
450	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
451	cur = ctxt->input->cur;
452	}
453	if ((cur[2] & 0xc0) != 0x80)
454	goto encoding_error;
455	if ((c & 0xf0) == 0xf0) {
456	if (cur[3] == 0) {
457	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
458	cur = ctxt->input->cur;
459	}
460	if (((c & 0xf8) != 0xf0) \|\|
461	((cur[3] & 0xc0) != 0x80))
462	goto encoding_error;
463	/* 4-byte code */
464	*len = 4;
465	val = (cur[0] & 0x7) << 18;
466	val \|= (cur[1] & 0x3f) << 12;
467	val \|= (cur[2] & 0x3f) << 6;
468	val \|= cur[3] & 0x3f;
469	} else {
470	/* 3-byte code */
471	*len = 3;
472	val = (cur[0] & 0xf) << 12;
473	val \|= (cur[1] & 0x3f) << 6;
474	val \|= cur[2] & 0x3f;
475	}
476	} else {
477	/* 2-byte code */
478	*len = 2;
479	val = (cur[0] & 0x1f) << 6;
480	val \|= cur[1] & 0x3f;
481	}
482	if (!IS_CHAR(val)) {
483	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
484	"Char 0x%X out of allowed range\n", val);
485	}
486	return(val);
487	} else {
488	if ((*ctxt->input->cur == 0) &&
489	(ctxt->input->cur < ctxt->input->end)) {
490	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
491	"Char 0x%X out of allowed range\n", 0);
492	*len = 1;
493	return(' ');
494	}
495	/* 1-byte code */
496	*len = 1;
497	return((int) *ctxt->input->cur);
498	}
499	}
500	/*
501	* Assume it's a fixed length encoding (1) with
502	* a compatible encoding for the ASCII set, since
503	* XML constructs only use < 128 chars
504	*/
505	*len = 1;
506	if ((int) *ctxt->input->cur < 0x80)
507	return((int) *ctxt->input->cur);
508
509	/*
510	* Humm this is bad, do an automatic flow conversion
511	*/
512	{
513	xmlChar * guess;
514	xmlCharEncodingHandlerPtr handler;
515
516	guess = htmlFindEncoding(ctxt);
517	if (guess == NULL) {
518	xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
519	} else {
520	if (ctxt->input->encoding != NULL)
521	xmlFree((xmlChar *) ctxt->input->encoding);
522	ctxt->input->encoding = guess;
523	handler = xmlFindCharEncodingHandler((const char *) guess);
524	if (handler != NULL) {
525	xmlSwitchToEncoding(ctxt, handler);
526	} else {
527	htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
528	"Unsupported encoding %s", guess, NULL);
529	}
530	}
531	ctxt->charset = XML_CHAR_ENCODING_UTF8;
532	}
533
534	return(xmlCurrentChar(ctxt, len));
535
536	encoding_error:
537	/*
538	* If we detect an UTF8 error that probably mean that the
539	* input encoding didn't get properly advertized in the
540	* declaration header. Report the error and switch the encoding
541	* to ISO-Latin-1 (if you don't like this policy, just declare the
542	* encoding !)
543	*/
544	{
545	char buffer[150];
546
547	if (ctxt->input->end - ctxt->input->cur >= 4) {
548	snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
549	ctxt->input->cur[0], ctxt->input->cur[1],
550	ctxt->input->cur[2], ctxt->input->cur[3]);
551	} else {
552	snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
553	}
554	htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
555	"Input is not proper UTF-8, indicate encoding !\n",
556	BAD_CAST buffer, NULL);
557	}
558
559	ctxt->charset = XML_CHAR_ENCODING_8859_1;
560	*len = 1;
561	return((int) *ctxt->input->cur);
562	}
563
564	/**
565	* htmlSkipBlankChars:
566	* @ctxt: the HTML parser context
567	*
568	* skip all blanks character found at that point in the input streams.
569	*
570	* Returns the number of space chars skipped
571	*/
572
573	static int
574	htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
575	int res = 0;
576
577	while (IS_BLANK_CH(*(ctxt->input->cur))) {
578	if ((*ctxt->input->cur == 0) &&
579	(xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
580	xmlPopInput(ctxt);
581	} else {
582	if (*(ctxt->input->cur) == '\n') {
583	ctxt->input->line++; ctxt->input->col = 1;
584	} else ctxt->input->col++;
585	ctxt->input->cur++;
586	ctxt->nbChars++;
587	if (*ctxt->input->cur == 0)
588	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
589	}
590	res++;
591	}
592	return(res);
593	}
594
595
596
597	/************************************************************************
598	* *
599	* The list of HTML elements and their properties *
600	* *
601	************************************************************************/
602
603	/*
604	* Start Tag: 1 means the start tag can be ommited
605	* End Tag: 1 means the end tag can be ommited
606	* 2 means it's forbidden (empty elements)
607	* 3 means the tag is stylistic and should be closed easily
608	* Depr: this element is deprecated
609	* DTD: 1 means that this element is valid only in the Loose DTD
610	* 2 means that this element is valid only in the Frameset DTD
611	*
612	* Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
613	, subElements , impliedsubelt , Attributes, userdata
614	*/
615
/*
 * Definitions and a couple of vars for HTML Elements.
 *
 * Each element class is a comma-separated list of element names, paired
 * with an NB_* macro giving the number of names in it. The NB_* sums are
 * parenthesized so that they can be used safely inside larger
 * expressions (e.g. 2 * NB_FLOW).
 */

#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 16
#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define PCDATA
#define NB_PCDATA 0
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define MODIFIER
#define NB_MODIFIER 0
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL


/* NULL-terminated lists of the flow and inline element names */
static const char* const html_flow[] = { FLOW, NULL } ;
static const char* const html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* const html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata
649
650
/*
 * ... and for HTML Attributes.
 *
 * Each attribute class is a comma-separated list of attribute names,
 * paired with an NB_* macro giving the number of names in it.
 */

#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1

/* NULL-terminated attribute name lists built from the classes above */
static const char* const html_attrs[] = { ATTRS, NULL } ;
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* const core_attrs[] = { COREATTRS, NULL } ;
static const char* const i18n_attrs[] = { I18N, NULL } ;
670
671
672	/* Other declarations that should go inline ... */
673	static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
674	"href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
675	"tabindex", "onfocus", "onblur", NULL } ;
676	static const char* const target_attr[] = { "target", NULL } ;
677	static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
678	static const char* const alt_attr[] = { "alt", NULL } ;
679	static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
680	static const char* const href_attrs[] = { "href", NULL } ;
681	static const char* const clear_attrs[] = { "clear", NULL } ;
682	static const char* const inline_p[] = { INLINE, "p", NULL } ;
683
684	static const char* const flow_param[] = { FLOW, "param", NULL } ;
685	static const char* const applet_attrs[] = { COREATTRS , "codebase",
686	"archive", "alt", "name", "height", "width", "align",
687	"hspace", "vspace", NULL } ;
688	static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
689	"tabindex", "accesskey", "onfocus", "onblur", NULL } ;
690	static const char* const basefont_attrs[] =
691	{ "id", "size", "color", "face", NULL } ;
692	static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
693	static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
694	static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
695	static const char* const body_depr[] = { "background", "bgcolor", "text",
696	"link", "vlink", "alink", NULL } ;
697	static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
698	"disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
699
700
701	static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
702	static const char* const col_elt[] = { "col", NULL } ;
703	static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
704	static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
705	static const char* const dl_contents[] = { "dt", "dd", NULL } ;
706	static const char* const compact_attr[] = { "compact", NULL } ;
707	static const char* const label_attr[] = { "label", NULL } ;
708	static const char* const fieldset_contents[] = { FLOW, "legend" } ;
709	static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
710	static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
711	static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
712	static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
713	static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
714	static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
715	static const char* const head_attrs[] = { I18N, "profile", NULL } ;
716	static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
717	static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
718	static const char* const version_attr[] = { "version", NULL } ;
719	static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
720	static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
721	static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
722	static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
723	static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
724	static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
725	static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
726	static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
727	static const char* const align_attr[] = { "align", NULL } ;
728	static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
729	static const char* const map_contents[] = { BLOCK, "area", NULL } ;
730	static const char* const name_attr[] = { "name", NULL } ;
731	static const char* const action_attr[] = { "action", NULL } ;
732	static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
733	static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
734	static const char* const content_attr[] = { "content", NULL } ;
735	static const char* const type_attr[] = { "type", NULL } ;
736	static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
737	static const char* const object_contents[] = { FLOW, "param", NULL } ;
738	static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
739	static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
740	static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
741	static const char* const option_elt[] = { "option", NULL } ;
742	static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
743	static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
744	static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
745	static const char* const width_attr[] = { "width", NULL } ;
746	static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
747	static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
748	static const char* const language_attr[] = { "language", NULL } ;
749	static const char* const select_content[] = { "optgroup", "option", NULL } ;
750	static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
751	static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
752	static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
753	static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
754	static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
755	static const char* const tr_elt[] = { "tr", NULL } ;
756	static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
757	static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
758	static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
759	static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
760	static const char* const tr_contents[] = { "th", "td", NULL } ;
761	static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
762	static const char* const li_elt[] = { "li", NULL } ;
763	static const char* const ul_depr[] = { "type", "compact", NULL} ;
764	static const char* const dir_attr[] = { "dir", NULL} ;
765
766	#define DECL (const char**)
767
768	static const htmlElemDesc
769	html40ElementTable[] = {
770	{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
771	DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
772	},
773	{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
774	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
775	},
776	{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
777	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
778	},
779	{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
780	DECL inline_p , NULL , DECL html_attrs, NULL, NULL
781	},
782	{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
783	DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
784	},
785	{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
786	EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
787	},
788	{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
789	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
790	},
791	{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
792	EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
793	},
794	{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
795	EMPTY , NULL , NULL, DECL basefont_attrs, NULL
796	},
797	{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
798	DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
799	},
800	{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
801	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
802	},
803	{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
804	DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
805	},
806	{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
807	DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
808	},
809	{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
810	EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
811	},
812	{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
813	DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
814	},
815	{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
816	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
817	},
818	{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
819	DECL html_flow , NULL , NULL, DECL html_attrs, NULL
820	},
821	{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
822	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
823	},
824	{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
825	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
826	},
827	{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
828	EMPTY , NULL , DECL col_attrs , NULL, NULL
829	},
830	{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
831	DECL col_elt , "col" , DECL col_attrs , NULL, NULL
832	},
833	{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
834	DECL html_flow , NULL , DECL html_attrs, NULL, NULL
835	},
836	{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
837	DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
838	},
839	{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
840	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
841	},
842	{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
843	DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
844	},
845	{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
846	DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
847	},
848	{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
849	DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
850	},
851	{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
852	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
853	},
854	{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
855	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
856	},
857	{ "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
858	EMPTY, NULL, DECL embed_attrs, NULL, NULL
859	},
860	{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
861	DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
862	},
863	{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
864	DECL html_inline, NULL, NULL, DECL font_attrs, NULL
865	},
866	{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
867	DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
868	},
869	{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
870	EMPTY, NULL, NULL, DECL frame_attrs, NULL
871	},
872	{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
873	DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
874	},
875	{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
876	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
877	},
878	{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
879	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
880	},
881	{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
882	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
883	},
884	{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
885	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
886	},
887	{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
888	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
889	},
890	{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
891	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
892	},
893	{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
894	DECL head_contents, NULL, DECL head_attrs, NULL, NULL
895	},
896	{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
897	EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
898	},
899	{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
900	DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
901	},
902	{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
903	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
904	},
905	{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
906	DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
907	},
908	{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
909	EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
910	},
911	{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
912	EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
913	},
914	{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
915	DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
916	},
917	{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
918	EMPTY, NULL, NULL, DECL prompt_attrs, NULL
919	},
920	{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
921	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
922	},
923	{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
924	DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
925	},
926	{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
927	DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
928	},
929	{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
930	DECL html_flow, NULL, DECL html_attrs, NULL, NULL
931	},
932	{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
933	EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
934	},
935	{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
936	DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
937	},
938	{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
939	DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
940	},
941	{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
942	EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
943	},
944	{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
945	DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
946	},
947	{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
948	DECL html_flow, "div", DECL html_attrs, NULL, NULL
949	},
950	{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
951	DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
952	},
953	{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
954	DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
955	},
956	{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
957	DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
958	},
959	{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
960	DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
961	},
962	{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
963	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
964	},
965	{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
966	EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
967	},
968	{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
969	DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
970	},
971	{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
972	DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
973	},
974	{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
975	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
976	},
977	{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
978	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
979	},
980	{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
981	DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
982	},
983	{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
984	DECL select_content, NULL, DECL select_attrs, NULL, NULL
985	},
986	{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
987	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
988	},
989	{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
990	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
991	},
992	{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
993	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
994	},
995	{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
996	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
997	},
998	{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
999	DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
1000	},
1001	{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
1002	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1003	},
1004	{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
1005	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1006	},
1007	{ "table", 0, 0, 0, 0, 0, 0, 0, "",
1008	DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
1009	},
1010	{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
1011	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1012	},
1013	{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
1014	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1015	},
1016	{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
1017	DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
1018	},
1019	{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
1020	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1021	},
1022	{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
1023	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1024	},
1025	{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
1026	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1027	},
1028	{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
1029	DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
1030	},
1031	{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
1032	DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
1033	},
1034	{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
1035	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1036	},
1037	{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
1038	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1039	},
1040	{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
1041	DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
1042	},
1043	{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
1044	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1045	}
1046	};
1047
1048	/*
1049	* start tags that imply the end of current element
1050	*/
1051	static const char * const htmlStartClose[] = {
1052	"form", "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
1053	"dl", "ul", "ol", "menu", "dir", "address", "pre",
1054	"listing", "xmp", "head", NULL,
1055	"head", "p", NULL,
1056	"title", "p", NULL,
1057	"body", "head", "style", "link", "title", "p", NULL,
1058	"frameset", "head", "style", "link", "title", "p", NULL,
1059	"li", "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
1060	"pre", "listing", "xmp", "head", "li", NULL,
1061	"hr", "p", "head", NULL,
1062	"h1", "p", "head", NULL,
1063	"h2", "p", "head", NULL,
1064	"h3", "p", "head", NULL,
1065	"h4", "p", "head", NULL,
1066	"h5", "p", "head", NULL,
1067	"h6", "p", "head", NULL,
1068	"dir", "p", "head", NULL,
1069	"address", "p", "head", "ul", NULL,
1070	"pre", "p", "head", "ul", NULL,
1071	"listing", "p", "head", NULL,
1072	"xmp", "p", "head", NULL,
1073	"blockquote", "p", "head", NULL,
1074	"dl", "p", "dt", "menu", "dir", "address", "pre", "listing",
1075	"xmp", "head", NULL,
1076	"dt", "p", "menu", "dir", "address", "pre", "listing", "xmp",
1077	"head", "dd", NULL,
1078	"dd", "p", "menu", "dir", "address", "pre", "listing", "xmp",
1079	"head", "dt", NULL,
1080	"ul", "p", "head", "ol", "menu", "dir", "address", "pre",
1081	"listing", "xmp", NULL,
1082	"ol", "p", "head", "ul", NULL,
1083	"menu", "p", "head", "ul", NULL,
1084	"p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL,
1085	"div", "p", "head", NULL,
1086	"noscript", "p", NULL,
1087	"center", "font", "b", "i", "p", "head", NULL,
1088	"a", "a", "head", NULL,
1089	"caption", "p", NULL,
1090	"colgroup", "caption", "colgroup", "col", "p", NULL,
1091	"col", "caption", "col", "p", NULL,
1092	"table", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
1093	"listing", "xmp", "a", NULL,
1094	"th", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
1095	"td", "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
1096	"tr", "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
1097	"thead", "caption", "col", "colgroup", NULL,
1098	"tfoot", "th", "td", "tr", "caption", "col", "colgroup", "thead",
1099	"tbody", "p", NULL,
1100	"tbody", "th", "td", "tr", "caption", "col", "colgroup", "thead",
1101	"tfoot", "tbody", "p", NULL,
1102	"optgroup", "option", NULL,
1103	"option", "option", NULL,
1104	"fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
1105	"pre", "listing", "xmp", "a", NULL,
1106	/* most tags in in FONTSTYLE, PHRASE and SPECIAL should close <head> */
1107	"tt", "head", NULL,
1108	"i", "head", NULL,
1109	"b", "head", NULL,
1110	"u", "head", NULL,
1111	"s", "head", NULL,
1112	"strike", "head", NULL,
1113	"big", "head", NULL,
1114	"small", "head", NULL,
1115
1116	"em", "head", NULL,
1117	"strong", "head", NULL,
1118	"dfn", "head", NULL,
1119	"code", "head", NULL,
1120	"samp", "head", NULL,
1121	"kbd", "head", NULL,
1122	"var", "head", NULL,
1123	"cite", "head", NULL,
1124	"abbr", "head", NULL,
1125	"acronym", "head", NULL,
1126
1127	/* "a" */
1128	"img", "head", NULL,
1129	/* "applet" */
1130	/* "embed" */
1131	/* "object" */
1132	"font", "head", NULL,
1133	/* "basefont" */
1134	"br", "head", NULL,
1135	/* "script" */
1136	"map", "head", NULL,
1137	"q", "head", NULL,
1138	"sub", "head", NULL,
1139	"sup", "head", NULL,
1140	"span", "head", NULL,
1141	"bdo", "head", NULL,
1142	"iframe", "head", NULL,
1143	NULL
1144	};
1145
/*
 * NULL-terminated list of the HTML elements which are supposed not to
 * hold CDATA content directly; a p element is implied when character
 * data shows up inside one of them.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = {
    "html", "head",
    NULL        /* end of list */
};
1158
/*
 * The HTML attributes whose content is of type %Script;.
 * The array has no terminator: its users size it with sizeof.
 * NOTE: when adding names, check htmlIsScriptAttribute() since
 *       it assumes every name starts with 'on'
 */
static const char *const htmlScriptAttributes[] = {
    /* mouse events */
    "onclick", "ondblclick", "onmousedown", "onmouseup",
    "onmouseover", "onmousemove", "onmouseout",
    /* keyboard events */
    "onkeypress", "onkeydown", "onkeyup",
    /* document events */
    "onload", "onunload",
    /* focus and form events */
    "onfocus", "onblur", "onsubmit", "onreset", "onchange", "onselect"
};
1184
/*
 * End-tag priorities, used by the HTML parser to cope with broken
 * pages.  Giving elements different priorities lets the parser decide
 * what to do with a stray end tag: an end tag may only close elements
 * whose priority is lower than or equal to its own.
 */

typedef struct {
    const char *name;       /* element name, NULL in the last entry */
    int priority;           /* end tag priority of that element */
} elementPriority;

/*
 * The last entry has a NULL name; its priority is the one reported for
 * every element which is not listed explicitly.
 */
static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }    /* Default priority */
};
1212
/*
 * Lookup index over htmlStartClose: each used slot points at the first
 * string of one group; it is filled by htmlInitAutoClose().
 */
static const char **htmlStartCloseIndex[100];
/* set to 1 once htmlInitAutoClose() has built the index */
static int htmlStartCloseIndexinitialized = 0;
1215
1216	/************************************************************************
1217	* *
1218	* functions to handle HTML specific data *
1219	* *
1220	************************************************************************/
1221
1222	/**
1223	* htmlInitAutoClose:
1224	*
1225	* Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1226	* This is not reentrant. Call xmlInitParser() once before processing in
1227	* case of use in multithreaded programs.
1228	*/
1229	void
1230	htmlInitAutoClose(void) {
1231	int indx, i = 0;
1232
1233	if (htmlStartCloseIndexinitialized) return;
1234
1235	for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1236	indx = 0;
1237	while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
1238	htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
1239	while (htmlStartClose[i] != NULL) i++;
1240	i++;
1241	}
1242	htmlStartCloseIndexinitialized = 1;
1243	}
1244
1245	/**
1246	* htmlTagLookup:
1247	* @tag: The tag name in lowercase
1248	*
1249	* Lookup the HTML tag in the ElementTable
1250	*
1251	* Returns the related htmlElemDescPtr or NULL if not found.
1252	*/
1253	const htmlElemDesc *
1254	htmlTagLookup(const xmlChar *tag) {
1255	unsigned int i;
1256
1257	for (i = 0; i < (sizeof(html40ElementTable) /
1258	sizeof(html40ElementTable[0]));i++) {
1259	if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
1260	return((htmlElemDescPtr) &html40ElementTable[i]);
1261	}
1262	return(NULL);
1263	}
1264
1265	/**
1266	* htmlGetEndPriority:
1267	* @name: The name of the element to look up the priority for.
1268	*
1269	* Return value: The "endtag" priority.
1270	**/
1271	static int
1272	htmlGetEndPriority (const xmlChar *name) {
1273	int i = 0;
1274
1275	while ((htmlEndPriority[i].name != NULL) &&
1276	(!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1277	i++;
1278
1279	return(htmlEndPriority[i].priority);
1280	}
1281
1282
1283	/**
1284	* htmlCheckAutoClose:
1285	* @newtag: The new tag name
1286	* @oldtag: The old tag name
1287	*
1288	* Checks whether the new tag is one of the registered valid tags for
1289	* closing old.
1290	* Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1291	*
1292	* Returns 0 if no, 1 if yes.
1293	*/
1294	static int
1295	htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1296	{
1297	int i, indx;
1298	const char **closed = NULL;
1299
1300	if (htmlStartCloseIndexinitialized == 0)
1301	htmlInitAutoClose();
1302
1303	/* inefficient, but not a big deal */
1304	for (indx = 0; indx < 100; indx++) {
1305	closed = htmlStartCloseIndex[indx];
1306	if (closed == NULL)
1307	return (0);
1308	if (xmlStrEqual(BAD_CAST * closed, newtag))
1309	break;
1310	}
1311
1312	i = closed - htmlStartClose;
1313	i++;
1314	while (htmlStartClose[i] != NULL) {
1315	if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
1316	return (1);
1317	}
1318	i++;
1319	}
1320	return (0);
1321	}
1322
1323	/**
1324	* htmlAutoCloseOnClose:
1325	* @ctxt: an HTML parser context
1326	* @newtag: The new tag name
1327	* @force: force the tag closure
1328	*
1329	* The HTML DTD allows an ending tag to implicitly close other tags.
1330	*/
1331	static void
1332	htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1333	{
1334	const htmlElemDesc *info;
1335	int i, priority;
1336
1337	priority = htmlGetEndPriority(newtag);
1338
1339	for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1340
1341	if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1342	break;
1343	/*
1344	* A missplaced endtag can only close elements with lower
1345	* or equal priority, so if we find an element with higher
1346	* priority before we find an element with
1347	* matching name, we just ignore this endtag
1348	*/
1349	if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1350	return;
1351	}
1352	if (i < 0)
1353	return;
1354
1355	while (!xmlStrEqual(newtag, ctxt->name)) {
1356	info = htmlTagLookup(ctxt->name);
1357	if ((info != NULL) && (info->endTag == 3)) {
1358	htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1359	"Opening and ending tag mismatch: %s and %s\n",
1360	newtag, ctxt->name);
1361	}
1362	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1363	ctxt->sax->endElement(ctxt->userData, ctxt->name);
1364	htmlnamePop(ctxt);
1365	}
1366	}
1367
1368	/**
1369	* htmlAutoCloseOnEnd:
1370	* @ctxt: an HTML parser context
1371	*
1372	* Close all remaining tags at the end of the stream
1373	*/
1374	static void
1375	htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1376	{
1377	int i;
1378
1379	if (ctxt->nameNr == 0)
1380	return;
1381	for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1382	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1383	ctxt->sax->endElement(ctxt->userData, ctxt->name);
1384	htmlnamePop(ctxt);
1385	}
1386	}
1387
1388	/**
1389	* htmlAutoClose:
1390	* @ctxt: an HTML parser context
1391	* @newtag: The new tag name or NULL
1392	*
1393	* The HTML DTD allows a tag to implicitly close other tags.
1394	* The list is kept in htmlStartClose array. This function is
1395	* called when a new tag has been detected and generates the
1396	* appropriates closes if possible/needed.
1397	* If newtag is NULL this mean we are at the end of the resource
1398	* and we should check
1399	*/
1400	static void
1401	htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1402	{
1403	while ((newtag != NULL) && (ctxt->name != NULL) &&
1404	(htmlCheckAutoClose(newtag, ctxt->name))) {
1405	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1406	ctxt->sax->endElement(ctxt->userData, ctxt->name);
1407	htmlnamePop(ctxt);
1408	}
1409	if (newtag == NULL) {
1410	htmlAutoCloseOnEnd(ctxt);
1411	return;
1412	}
1413	while ((newtag == NULL) && (ctxt->name != NULL) &&
1414	((xmlStrEqual(ctxt->name, BAD_CAST "head")) \|\|
1415	(xmlStrEqual(ctxt->name, BAD_CAST "body")) \|\|
1416	(xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
1417	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1418	ctxt->sax->endElement(ctxt->userData, ctxt->name);
1419	htmlnamePop(ctxt);
1420	}
1421	}
1422
1423	/**
1424	* htmlAutoCloseTag:
1425	* @doc: the HTML document
1426	* @name: The tag name
1427	* @elem: the HTML element
1428	*
1429	* The HTML DTD allows a tag to implicitly close other tags.
1430	* The list is kept in htmlStartClose array. This function checks
1431	* if the element or one of it's children would autoclose the
1432	* given tag.
1433	*
1434	* Returns 1 if autoclose, 0 otherwise
1435	*/
1436	int
1437	htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1438	htmlNodePtr child;
1439
1440	if (elem == NULL) return(1);
1441	if (xmlStrEqual(name, elem->name)) return(0);
1442	if (htmlCheckAutoClose(elem->name, name)) return(1);
1443	child = elem->children;
1444	while (child != NULL) {
1445	if (htmlAutoCloseTag(doc, name, child)) return(1);
1446	child = child->next;
1447	}
1448	return(0);
1449	}
1450
1451	/**
1452	* htmlIsAutoClosed:
1453	* @doc: the HTML document
1454	* @elem: the HTML element
1455	*
1456	* The HTML DTD allows a tag to implicitly close other tags.
1457	* The list is kept in htmlStartClose array. This function checks
1458	* if a tag is autoclosed by one of it's child
1459	*
1460	* Returns 1 if autoclosed, 0 otherwise
1461	*/
1462	int
1463	htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1464	htmlNodePtr child;
1465
1466	if (elem == NULL) return(1);
1467	child = elem->children;
1468	while (child != NULL) {
1469	if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1470	child = child->next;
1471	}
1472	return(0);
1473	}
1474
1475	/**
1476	* htmlCheckImplied:
1477	* @ctxt: an HTML parser context
1478	* @newtag: The new tag name
1479	*
1480	* The HTML DTD allows a tag to exists only implicitly
1481	* called when a new tag has been detected and generates the
1482	* appropriates implicit tags if missing
1483	*/
1484	static void
1485	htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1486	int i;
1487
1488	if (ctxt->options & HTML_PARSE_NOIMPLIED)
1489	return;
1490	if (!htmlOmittedDefaultValue)
1491	return;
1492	if (xmlStrEqual(newtag, BAD_CAST"html"))
1493	return;
1494	if (ctxt->nameNr <= 0) {
1495	htmlnamePush(ctxt, BAD_CAST"html");
1496	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1497	ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1498	}
1499	if ((xmlStrEqual(newtag, BAD_CAST"body")) \|\| (xmlStrEqual(newtag, BAD_CAST"head")))
1500	return;
1501	if ((ctxt->nameNr <= 1) &&
1502	((xmlStrEqual(newtag, BAD_CAST"script")) \|\|
1503	(xmlStrEqual(newtag, BAD_CAST"style")) \|\|
1504	(xmlStrEqual(newtag, BAD_CAST"meta")) \|\|
1505	(xmlStrEqual(newtag, BAD_CAST"link")) \|\|
1506	(xmlStrEqual(newtag, BAD_CAST"title")) \|\|
1507	(xmlStrEqual(newtag, BAD_CAST"base")))) {
1508	if (ctxt->html >= 3) {
1509	/* we already saw or generated an <head> before */
1510	return;
1511	}
1512	/*
1513	* dropped OBJECT ... i you put it first BODY will be
1514	* assumed !
1515	*/
1516	htmlnamePush(ctxt, BAD_CAST"head");
1517	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1518	ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1519	} else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1520	(!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1521	(!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1522	if (ctxt->html >= 10) {
1523	/* we already saw or generated a <body> before */
1524	return;
1525	}
1526	for (i = 0;i < ctxt->nameNr;i++) {
1527	if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1528	return;
1529	}
1530	if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1531	return;
1532	}
1533	}
1534
1535	htmlnamePush(ctxt, BAD_CAST"body");
1536	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1537	ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1538	}
1539	}
1540
1541	/**
1542	* htmlCheckParagraph
1543	* @ctxt: an HTML parser context
1544	*
1545	* Check whether a p element need to be implied before inserting
1546	* characters in the current element.
1547	*
1548	* Returns 1 if a paragraph has been inserted, 0 if not and -1
1549	* in case of error.
1550	*/
1551
1552	static int
1553	htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1554	const xmlChar *tag;
1555	int i;
1556
1557	if (ctxt == NULL)
1558	return(-1);
1559	tag = ctxt->name;
1560	if (tag == NULL) {
1561	htmlAutoClose(ctxt, BAD_CAST"p");
1562	htmlCheckImplied(ctxt, BAD_CAST"p");
1563	htmlnamePush(ctxt, BAD_CAST"p");
1564	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1565	ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1566	return(1);
1567	}
1568	if (!htmlOmittedDefaultValue)
1569	return(0);
1570	for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1571	if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1572	htmlAutoClose(ctxt, BAD_CAST"p");
1573	htmlCheckImplied(ctxt, BAD_CAST"p");
1574	htmlnamePush(ctxt, BAD_CAST"p");
1575	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1576	ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1577	return(1);
1578	}
1579	}
1580	return(0);
1581	}
1582
1583	/**
1584	* htmlIsScriptAttribute:
1585	* @name: an attribute name
1586	*
1587	* Check if an attribute is of content type Script
1588	*
1589	* Returns 1 is the attribute is a script 0 otherwise
1590	*/
1591	int
1592	htmlIsScriptAttribute(const xmlChar *name) {
1593	unsigned int i;
1594
1595	if (name == NULL)
1596	return(0);
1597	/*
1598	* all script attributes start with 'on'
1599	*/
1600	if ((name[0] != 'o') \|\| (name[1] != 'n'))
1601	return(0);
1602	for (i = 0;
1603	i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1604	i++) {
1605	if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1606	return(1);
1607	}
1608	return(0);
1609	}
1610
1611	/************************************************************************
1612	* *
1613	* The list of HTML predefined entities *
1614	* *
1615	************************************************************************/
1616
1617
1618	static const htmlEntityDesc html40EntitiesTable[] = {
1619	/*
1620	* the 4 absolute ones, plus apostrophe.
1621	*/
1622	{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1623	{ 38, "amp", "ampersand, U+0026 ISOnum" },
1624	{ 39, "apos", "single quote" },
1625	{ 60, "lt", "less-than sign, U+003C ISOnum" },
1626	{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1627
1628	/*
1629	* A bunch still in the 128-255 range
1630	* Replacing them depend really on the charset used.
1631	*/
1632	{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1633	{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1634	{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1635	{ 163, "pound","pound sign, U+00A3 ISOnum" },
1636	{ 164, "curren","currency sign, U+00A4 ISOnum" },
1637	{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1638	{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1639	{ 167, "sect", "section sign, U+00A7 ISOnum" },
1640	{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1641	{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1642	{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1643	{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1644	{ 172, "not", "not sign, U+00AC ISOnum" },
1645	{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1646	{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1647	{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1648	{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1649	{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1650	{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1651	{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1652	{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1653	{ 181, "micro","micro sign, U+00B5 ISOnum" },
1654	{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1655	{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1656	{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1657	{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1658	{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1659	{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1660	{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1661	{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1662	{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1663	{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1664	{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1665	{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1666	{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1667	{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1668	{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1669	{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1670	{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1671	{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1672	{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1673	{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1674	{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1675	{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1676	{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1677	{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1678	{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1679	{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1680	{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1681	{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1682	{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1683	{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1684	{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1685	{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1686	{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1687	{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1688	{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1689	{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1690	{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1691	{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1692	{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1693	{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1694	{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1695	{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1696	{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1697	{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1698	{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1699	{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1700	{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1701	{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1702	{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1703	{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1704	{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1705	{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1706	{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1707	{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1708	{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1709	{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1710	{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1711	{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1712	{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1713	{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1714	{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1715	{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1716	{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1717	{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1718	{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1719	{ 247, "divide","division sign, U+00F7 ISOnum" },
1720	{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1721	{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1722	{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1723	{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1724	{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1725	{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1726	{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1727	{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1728
1729	{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1730	{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1731	{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1732	{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1733	{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1734
1735	/*
1736	* Anything below should really be kept as entities references
1737	*/
1738	{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1739
1740	{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1741	{ 732, "tilde","small tilde, U+02DC ISOdia" },
1742
1743	{ 913, "Alpha","greek capital letter alpha, U+0391" },
1744	{ 914, "Beta", "greek capital letter beta, U+0392" },
1745	{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1746	{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1747	{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1748	{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1749	{ 919, "Eta", "greek capital letter eta, U+0397" },
1750	{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1751	{ 921, "Iota", "greek capital letter iota, U+0399" },
1752	{ 922, "Kappa","greek capital letter kappa, U+039A" },
1753	{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
1754	{ 924, "Mu", "greek capital letter mu, U+039C" },
1755	{ 925, "Nu", "greek capital letter nu, U+039D" },
1756	{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1757	{ 927, "Omicron","greek capital letter omicron, U+039F" },
1758	{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1759	{ 929, "Rho", "greek capital letter rho, U+03A1" },
1760	{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1761	{ 932, "Tau", "greek capital letter tau, U+03A4" },
1762	{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1763	{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1764	{ 935, "Chi", "greek capital letter chi, U+03A7" },
1765	{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1766	{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1767
1768	{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1769	{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1770	{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1771	{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1772	{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1773	{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1774	{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1775	{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1776	{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1777	{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1778	{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1779	{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1780	{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1781	{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1782	{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1783	{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1784	{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1785	{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1786	{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1787	{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1788	{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1789	{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1790	{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1791	{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1792	{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1793	{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1794	{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1795	{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1796
1797	{ 8194, "ensp", "en space, U+2002 ISOpub" },
1798	{ 8195, "emsp", "em space, U+2003 ISOpub" },
1799	{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1800	{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1801	{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1802	{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1803	{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1804	{ 8211, "ndash","en dash, U+2013 ISOpub" },
1805	{ 8212, "mdash","em dash, U+2014 ISOpub" },
1806	{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1807	{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1808	{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1809	{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1810	{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1811	{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1812	{ 8224, "dagger","dagger, U+2020 ISOpub" },
1813	{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1814
1815	{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1816	{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1817
1818	{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1819
1820	{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1821	{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1822
1823	{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1824	{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1825
1826	{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1827	{ 8260, "frasl","fraction slash, U+2044 NEW" },
1828
1829	{ 8364, "euro", "euro sign, U+20AC NEW" },
1830
1831	{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1832	{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1833	{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1834	{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1835	{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1836	{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1837	{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1838	{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1839	{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1840	{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1841	{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1842	{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1843	{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1844	{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1845	{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1846	{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1847
1848	{ 8704, "forall","for all, U+2200 ISOtech" },
1849	{ 8706, "part", "partial differential, U+2202 ISOtech" },
1850	{ 8707, "exist","there exists, U+2203 ISOtech" },
1851	{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1852	{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1853	{ 8712, "isin", "element of, U+2208 ISOtech" },
1854	{ 8713, "notin","not an element of, U+2209 ISOtech" },
1855	{ 8715, "ni", "contains as member, U+220B ISOtech" },
1856	{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
1857	{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
1858	{ 8722, "minus","minus sign, U+2212 ISOtech" },
1859	{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1860	{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1861	{ 8733, "prop", "proportional to, U+221D ISOtech" },
1862	{ 8734, "infin","infinity, U+221E ISOtech" },
1863	{ 8736, "ang", "angle, U+2220 ISOamso" },
1864	{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1865	{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1866	{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1867	{ 8746, "cup", "union = cup, U+222A ISOtech" },
1868	{ 8747, "int", "integral, U+222B ISOtech" },
1869	{ 8756, "there4","therefore, U+2234 ISOtech" },
1870	{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1871	{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1872	{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1873	{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1874	{ 8801, "equiv","identical to, U+2261 ISOtech" },
1875	{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1876	{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1877	{ 8834, "sub", "subset of, U+2282 ISOtech" },
1878	{ 8835, "sup", "superset of, U+2283 ISOtech" },
1879	{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1880	{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1881	{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1882	{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1883	{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1884	{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1885	{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1886	{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1887	{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1888	{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1889	{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1890	{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1891	{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1892	{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1893
1894	{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1895	{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1896	{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1897	{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1898
1899	};
1900
1901	/************************************************************************
1902	* *
1903	* Commodity functions to handle entities *
1904	* *
1905	************************************************************************/
1906
/*
 * Macro used to grow the current buffer.
 *
 * Doubles buffer##_size and reallocates buffer to hold that many xmlChar.
 * If the reallocation fails, the error is reported with htmlErrMemory(),
 * the old buffer is freed and the *enclosing function* returns NULL.
 * The macro therefore needs a variable named ctxt in scope and can only
 * be expanded inside a function returning a pointer.
 */
#define growBuffer(buffer) {						\
    xmlChar *tmp;							\
    buffer##_size *= 2;							\
    tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
    if (tmp == NULL) {							\
        htmlErrMemory(ctxt, "growing buffer\n");			\
        xmlFree(buffer);						\
        return(NULL);							\
    }									\
    buffer = tmp;							\
}
1921
1922	/**
1923	* htmlEntityLookup:
1924	* @name: the entity name
1925	*
1926	* Lookup the given entity in EntitiesTable
1927	*
1928	* TODO: the linear scan is really ugly, an hash table is really needed.
1929	*
1930	* Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1931	*/
1932	const htmlEntityDesc *
1933	htmlEntityLookup(const xmlChar *name) {
1934	unsigned int i;
1935
1936	for (i = 0;i < (sizeof(html40EntitiesTable)/
1937	sizeof(html40EntitiesTable[0]));i++) {
1938	if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1939	return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1940	}
1941	}
1942	return(NULL);
1943	}
1944
1945	/**
1946	* htmlEntityValueLookup:
1947	* @value: the entity's unicode value
1948	*
1949	* Lookup the given entity in EntitiesTable
1950	*
1951	* TODO: the linear scan is really ugly, an hash table is really needed.
1952	*
1953	* Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1954	*/
1955	const htmlEntityDesc *
1956	htmlEntityValueLookup(unsigned int value) {
1957	unsigned int i;
1958
1959	for (i = 0;i < (sizeof(html40EntitiesTable)/
1960	sizeof(html40EntitiesTable[0]));i++) {
1961	if (html40EntitiesTable[i].value >= value) {
1962	if (html40EntitiesTable[i].value > value)
1963	break;
1964	return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1965	}
1966	}
1967	return(NULL);
1968	}
1969
1970	/**
1971	* UTF8ToHtml:
1972	* @out: a pointer to an array of bytes to store the result
1973	* @outlen: the length of @out
1974	* @in: a pointer to an array of UTF-8 chars
1975	* @inlen: the length of @in
1976	*
1977	* Take a block of UTF-8 chars in and try to convert it to an ASCII
1978	* plus HTML entities block of chars out.
1979	*
1980	* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1981	* The value of @inlen after return is the number of octets consumed
1982	* as the return value is positive, else unpredictable.
1983	* The value of @outlen after return is the number of octets consumed.
1984	*/
1985	int
1986	UTF8ToHtml(unsigned char* out, int *outlen,
1987	const unsigned char* in, int *inlen) {
1988	const unsigned char* processed = in;
1989	const unsigned char* outend;
1990	const unsigned char* outstart = out;
1991	const unsigned char* instart = in;
1992	const unsigned char* inend;
1993	unsigned int c, d;
1994	int trailing;
1995
1996	if ((out == NULL) \|\| (outlen == NULL) \|\| (inlen == NULL)) return(-1);
1997	if (in == NULL) {
1998	/*
1999	* initialization nothing to do
2000	*/
2001	*outlen = 0;
2002	*inlen = 0;
2003	return(0);
2004	}
2005	inend = in + (*inlen);
2006	outend = out + (*outlen);
2007	while (in < inend) {
2008	d = *in++;
2009	if (d < 0x80) { c= d; trailing= 0; }
2010	else if (d < 0xC0) {
2011	/* trailing byte in leading position */
2012	*outlen = out - outstart;
2013	*inlen = processed - instart;
2014	return(-2);
2015	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2016	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2017	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2018	else {
2019	/* no chance for this in Ascii */
2020	*outlen = out - outstart;
2021	*inlen = processed - instart;
2022	return(-2);
2023	}
2024
2025	if (inend - in < trailing) {
2026	break;
2027	}
2028
2029	for ( ; trailing; trailing--) {
2030	if ((in >= inend) \|\| (((d= *in++) & 0xC0) != 0x80))
2031	break;
2032	c <<= 6;
2033	c \|= d & 0x3F;
2034	}
2035
2036	/* assertion: c is a single UTF-4 value */
2037	if (c < 0x80) {
2038	if (out + 1 >= outend)
2039	break;
2040	*out++ = c;
2041	} else {
2042	int len;
2043	const htmlEntityDesc * ent;
2044	const char *cp;
2045	char nbuf[16];
2046
2047	/*
2048	* Try to lookup a predefined HTML entity for it
2049	*/
2050
2051	ent = htmlEntityValueLookup(c);
2052	if (ent == NULL) {
2053	snprintf(nbuf, sizeof(nbuf), "#%u", c);
2054	cp = nbuf;
2055	}
2056	else
2057	cp = ent->name;
2058	len = strlen(cp);
2059	if (out + 2 + len >= outend)
2060	break;
2061	*out++ = '&';
2062	memcpy(out, cp, len);
2063	out += len;
2064	*out++ = ';';
2065	}
2066	processed = in;
2067	}
2068	*outlen = out - outstart;
2069	*inlen = processed - instart;
2070	return(0);
2071	}
2072
2073	/**
2074	* htmlEncodeEntities:
2075	* @out: a pointer to an array of bytes to store the result
2076	* @outlen: the length of @out
2077	* @in: a pointer to an array of UTF-8 chars
2078	* @inlen: the length of @in
2079	* @quoteChar: the quote character to escape (' or ") or zero.
2080	*
2081	* Take a block of UTF-8 chars in and try to convert it to an ASCII
2082	* plus HTML entities block of chars out.
2083	*
2084	* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2085	* The value of @inlen after return is the number of octets consumed
2086	* as the return value is positive, else unpredictable.
2087	* The value of @outlen after return is the number of octets consumed.
2088	*/
2089	int
2090	htmlEncodeEntities(unsigned char* out, int *outlen,
2091	const unsigned char* in, int *inlen, int quoteChar) {
2092	const unsigned char* processed = in;
2093	const unsigned char* outend;
2094	const unsigned char* outstart = out;
2095	const unsigned char* instart = in;
2096	const unsigned char* inend;
2097	unsigned int c, d;
2098	int trailing;
2099
2100	if ((out == NULL) \|\| (outlen == NULL) \|\| (inlen == NULL) \|\| (in == NULL))
2101	return(-1);
2102	outend = out + (*outlen);
2103	inend = in + (*inlen);
2104	while (in < inend) {
2105	d = *in++;
2106	if (d < 0x80) { c= d; trailing= 0; }
2107	else if (d < 0xC0) {
2108	/* trailing byte in leading position */
2109	*outlen = out - outstart;
2110	*inlen = processed - instart;
2111	return(-2);
2112	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
2113	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
2114	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
2115	else {
2116	/* no chance for this in Ascii */
2117	*outlen = out - outstart;
2118	*inlen = processed - instart;
2119	return(-2);
2120	}
2121
2122	if (inend - in < trailing)
2123	break;
2124
2125	while (trailing--) {
2126	if (((d= *in++) & 0xC0) != 0x80) {
2127	*outlen = out - outstart;
2128	*inlen = processed - instart;
2129	return(-2);
2130	}
2131	c <<= 6;
2132	c \|= d & 0x3F;
2133	}
2134
2135	/* assertion: c is a single UTF-4 value */
2136	if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2137	(c != '&') && (c != '<') && (c != '>')) {
2138	if (out >= outend)
2139	break;
2140	*out++ = c;
2141	} else {
2142	const htmlEntityDesc * ent;
2143	const char *cp;
2144	char nbuf[16];
2145	int len;
2146
2147	/*
2148	* Try to lookup a predefined HTML entity for it
2149	*/
2150	ent = htmlEntityValueLookup(c);
2151	if (ent == NULL) {
2152	snprintf(nbuf, sizeof(nbuf), "#%u", c);
2153	cp = nbuf;
2154	}
2155	else
2156	cp = ent->name;
2157	len = strlen(cp);
2158	if (out + 2 + len > outend)
2159	break;
2160	*out++ = '&';
2161	memcpy(out, cp, len);
2162	out += len;
2163	*out++ = ';';
2164	}
2165	processed = in;
2166	}
2167	*outlen = out - outstart;
2168	*inlen = processed - instart;
2169	return(0);
2170	}
2171
2172	/************************************************************************
2173	* *
2174	* Commodity functions to handle streams *
2175	* *
2176	************************************************************************/
2177
2178	/**
2179	* htmlNewInputStream:
2180	* @ctxt: an HTML parser context
2181	*
2182	* Create a new input stream structure
2183	* Returns the new input stream or NULL
2184	*/
2185	static htmlParserInputPtr
2186	htmlNewInputStream(htmlParserCtxtPtr ctxt) {
2187	htmlParserInputPtr input;
2188
2189	input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2190	if (input == NULL) {
2191	htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
2192	return(NULL);
2193	}
2194	memset(input, 0, sizeof(htmlParserInput));
2195	input->filename = NULL;
2196	input->directory = NULL;
2197	input->base = NULL;
2198	input->cur = NULL;
2199	input->buf = NULL;
2200	input->line = 1;
2201	input->col = 1;
2202	input->buf = NULL;
2203	input->free = NULL;
2204	input->version = NULL;
2205	input->consumed = 0;
2206	input->length = 0;
2207	return(input);
2208	}
2209
2210
2211	/************************************************************************
2212	* *
2213	* Commodity functions, cleanup needed ? *
2214	* *
2215	************************************************************************/
/*
 * All tags allowing PCDATA, taken from the HTML 4.01 loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array, but that would risk a
 *       binary incompatibility.
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
2230
2231	/**
2232	* areBlanks:
2233	* @ctxt: an HTML parser context
2234	* @str: a xmlChar *
2235	* @len: the size of @str
2236	*
2237	* Is this a sequence of blank chars that one can ignore ?
2238	*
2239	* Returns 1 if ignorable 0 otherwise.
2240	*/
2241
2242	static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2243	unsigned int i;
2244	int j;
2245	xmlNodePtr lastChild;
2246	xmlDtdPtr dtd;
2247
2248	for (j = 0;j < len;j++)
2249	if (!(IS_BLANK_CH(str[j]))) return(0);
2250
2251	if (CUR == 0) return(1);
2252	if (CUR != '<') return(0);
2253	if (ctxt->name == NULL)
2254	return(1);
2255	if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2256	return(1);
2257	if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2258	return(1);
2259
2260	/* Only strip CDATA children of the body tag for strict HTML DTDs */
2261	if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2262	dtd = xmlGetIntSubset(ctxt->myDoc);
2263	if (dtd != NULL && dtd->ExternalID != NULL) {
2264	if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") \|\|
2265	!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2266	return(1);
2267	}
2268	}
2269
2270	if (ctxt->node == NULL) return(0);
2271	lastChild = xmlGetLastChild(ctxt->node);
2272	while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2273	lastChild = lastChild->prev;
2274	if (lastChild == NULL) {
2275	if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2276	(ctxt->node->content != NULL)) return(0);
2277	/* keep ws in constructs like ...<b> </b>...
2278	for all tags "b" allowing PCDATA */
2279	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2280	if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2281	return(0);
2282	}
2283	}
2284	} else if (xmlNodeIsText(lastChild)) {
2285	return(0);
2286	} else {
2287	/* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2288	for all tags "p" allowing PCDATA */
2289	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2290	if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2291	return(0);
2292	}
2293	}
2294	}
2295	return(1);
2296	}
2297
2298	/**
2299	* htmlNewDocNoDtD:
2300	* @URI: URI for the dtd, or NULL
2301	* @ExternalID: the external ID of the DTD, or NULL
2302	*
2303	* Creates a new HTML document without a DTD node if @URI and @ExternalID
2304	* are NULL
2305	*
2306	* Returns a new document, do not initialize the DTD if not provided
2307	*/
2308	htmlDocPtr
2309	htmlNewDocNoDtD(const xmlChar URI, const xmlChar ExternalID) {
2310	xmlDocPtr cur;
2311
2312	/*
2313	* Allocate a new document and fill the fields.
2314	*/
2315	cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2316	if (cur == NULL) {
2317	htmlErrMemory(NULL, "HTML document creation failed\n");
2318	return(NULL);
2319	}
2320	memset(cur, 0, sizeof(xmlDoc));
2321
2322	cur->type = XML_HTML_DOCUMENT_NODE;
2323	cur->version = NULL;
2324	cur->intSubset = NULL;
2325	cur->doc = cur;
2326	cur->name = NULL;
2327	cur->children = NULL;
2328	cur->extSubset = NULL;
2329	cur->oldNs = NULL;
2330	cur->encoding = NULL;
2331	cur->standalone = 1;
2332	cur->compression = 0;
2333	cur->ids = NULL;
2334	cur->refs = NULL;
2335	cur->_private = NULL;
2336	cur->charset = XML_CHAR_ENCODING_UTF8;
2337	cur->properties = XML_DOC_HTML \| XML_DOC_USERBUILT;
2338	if ((ExternalID != NULL) \|\|
2339	(URI != NULL))
2340	xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2341	return(cur);
2342	}
2343
2344	/**
2345	* htmlNewDoc:
2346	* @URI: URI for the dtd, or NULL
2347	* @ExternalID: the external ID of the DTD, or NULL
2348	*
2349	* Creates a new HTML document
2350	*
2351	* Returns a new document
2352	*/
2353	htmlDocPtr
2354	htmlNewDoc(const xmlChar URI, const xmlChar ExternalID) {
2355	if ((URI == NULL) && (ExternalID == NULL))
2356	return(htmlNewDocNoDtD(
2357	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2358	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2359
2360	return(htmlNewDocNoDtD(URI, ExternalID));
2361	}
2362
2363
2364	/************************************************************************
2365	* *
2366	* The parser itself *
2367	* Relates to http://www.w3.org/TR/html40 *
2368	* *
2369	************************************************************************/
2370
2371	/************************************************************************
2372	* *
2373	* The parser itself *
2374	* *
2375	************************************************************************/
2376
2377	static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2378
2379	/**
2380	* htmlParseHTMLName:
2381	* @ctxt: an HTML parser context
2382	*
2383	* parse an HTML tag or attribute name, note that we convert it to lowercase
2384	* since HTML names are not case-sensitive.
2385	*
2386	* Returns the Tag Name parsed or NULL
2387	*/
2388
2389	static const xmlChar *
2390	htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2391	int i = 0;
2392	xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2393
2394	if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2395	(CUR != ':') && (CUR != '.')) return(NULL);
2396
2397	while ((i < HTML_PARSER_BUFFER_SIZE) &&
2398	((IS_ASCII_LETTER(CUR)) \|\| (IS_ASCII_DIGIT(CUR)) \|\|
2399	(CUR == ':') \|\| (CUR == '-') \|\| (CUR == '_') \|\|
2400	(CUR == '.'))) {
2401	if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2402	else loc[i] = CUR;
2403	i++;
2404
2405	NEXT;
2406	}
2407
2408	return(xmlDictLookup(ctxt->dict, loc, i));
2409	}
2410
2411
2412	/**
2413	* htmlParseHTMLName_nonInvasive:
2414	* @ctxt: an HTML parser context
2415	*
2416	* parse an HTML tag or attribute name, note that we convert it to lowercase
2417	* since HTML names are not case-sensitive, this doesn't consume the data
2418	* from the stream, it's a look-ahead
2419	*
2420	* Returns the Tag Name parsed or NULL
2421	*/
2422
2423	static const xmlChar *
2424	htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2425	int i = 0;
2426	xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2427
2428	if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2429	(NXT(1) != ':')) return(NULL);
2430
2431	while ((i < HTML_PARSER_BUFFER_SIZE) &&
2432	((IS_ASCII_LETTER(NXT(1+i))) \|\| (IS_ASCII_DIGIT(NXT(1+i))) \|\|
2433	(NXT(1+i) == ':') \|\| (NXT(1+i) == '-') \|\| (NXT(1+i) == '_'))) {
2434	if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2435	else loc[i] = NXT(1+i);
2436	i++;
2437	}
2438
2439	return(xmlDictLookup(ctxt->dict, loc, i));
2440	}
2441
2442
2443	/**
2444	* htmlParseName:
2445	* @ctxt: an HTML parser context
2446	*
2447	* parse an HTML name, this routine is case sensitive.
2448	*
2449	* Returns the Name parsed or NULL
2450	*/
2451
2452	static const xmlChar *
2453	htmlParseName(htmlParserCtxtPtr ctxt) {
2454	const xmlChar *in;
2455	const xmlChar *ret;
2456	int count = 0;
2457
2458	GROW;
2459
2460	/*
2461	* Accelerator for simple ASCII names
2462	*/
2463	in = ctxt->input->cur;
2464	if (((in >= 0x61) && (in <= 0x7A)) \|\|
2465	((in >= 0x41) && (in <= 0x5A)) \|\|
2466	(in == '_') \|\| (in == ':')) {
2467	in++;
2468	while (((in >= 0x61) && (in <= 0x7A)) \|\|
2469	((in >= 0x41) && (in <= 0x5A)) \|\|
2470	((in >= 0x30) && (in <= 0x39)) \|\|
2471	(in == '_') \|\| (in == '-') \|\|
2472	(in == ':') \|\| (in == '.'))
2473	in++;
2474	if ((in > 0) && (in < 0x80)) {
2475	count = in - ctxt->input->cur;
2476	ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2477	ctxt->input->cur = in;
2478	ctxt->nbChars += count;
2479	ctxt->input->col += count;
2480	return(ret);
2481	}
2482	}
2483	return(htmlParseNameComplex(ctxt));
2484	}
2485
2486	static const xmlChar *
2487	htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2488	int len = 0, l;
2489	int c;
2490	int count = 0;
2491
2492	/*
2493	* Handler for more complex cases
2494	*/
2495	GROW;
2496	c = CUR_CHAR(l);
2497	if ((c == ' ') \|\| (c == '>') \|\| (c == '/') \|\| /* accelerators */
2498	(!IS_LETTER(c) && (c != '_') &&
2499	(c != ':'))) {
2500	return(NULL);
2501	}
2502
2503	while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2504	((IS_LETTER(c)) \|\| (IS_DIGIT(c)) \|\|
2505	(c == '.') \|\| (c == '-') \|\|
2506	(c == '_') \|\| (c == ':') \|\|
2507	(IS_COMBINING(c)) \|\|
2508	(IS_EXTENDER(c)))) {
2509	if (count++ > 100) {
2510	count = 0;
2511	GROW;
2512	}
2513	len += l;
2514	NEXTL(l);
2515	c = CUR_CHAR(l);
2516	}
2517	return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
2518	}
2519
2520
2521	/**
2522	* htmlParseHTMLAttribute:
2523	* @ctxt: an HTML parser context
2524	* @stop: a char stop value
2525	*
2526	* parse an HTML attribute value till the stop (quote), if
2527	* stop is 0 then it stops at the first space
2528	*
2529	* Returns the attribute parsed or NULL
2530	*/
2531
2532	static xmlChar *
2533	htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2534	xmlChar *buffer = NULL;
2535	int buffer_size = 0;
2536	xmlChar *out = NULL;
2537	const xmlChar *name = NULL;
2538	const xmlChar *cur = NULL;
2539	const htmlEntityDesc * ent;
2540
2541	/*
2542	* allocate a translation buffer.
2543	*/
2544	buffer_size = HTML_PARSER_BUFFER_SIZE;
2545	buffer = (xmlChar ) xmlMallocAtomic(buffer_size sizeof(xmlChar));
2546	if (buffer == NULL) {
2547	htmlErrMemory(ctxt, "buffer allocation failed\n");
2548	return(NULL);
2549	}
2550	out = buffer;
2551
2552	/*
2553	* Ok loop until we reach one of the ending chars
2554	*/
2555	while ((CUR != 0) && (CUR != stop)) {
2556	if ((stop == 0) && (CUR == '>')) break;
2557	if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2558	if (CUR == '&') {
2559	if (NXT(1) == '#') {
2560	unsigned int c;
2561	int bits;
2562
2563	c = htmlParseCharRef(ctxt);
2564	if (c < 0x80)
2565	{ *out++ = c; bits= -6; }
2566	else if (c < 0x800)
2567	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
2568	else if (c < 0x10000)
2569	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
2570	else
2571	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
2572
2573	for ( ; bits >= 0; bits-= 6) {
2574	*out++ = ((c >> bits) & 0x3F) \| 0x80;
2575	}
2576
2577	if (out - buffer > buffer_size - 100) {
2578	int indx = out - buffer;
2579
2580	growBuffer(buffer);
2581	out = &buffer[indx];
2582	}
2583	} else {
2584	ent = htmlParseEntityRef(ctxt, &name);
2585	if (name == NULL) {
2586	*out++ = '&';
2587	if (out - buffer > buffer_size - 100) {
2588	int indx = out - buffer;
2589
2590	growBuffer(buffer);
2591	out = &buffer[indx];
2592	}
2593	} else if (ent == NULL) {
2594	*out++ = '&';
2595	cur = name;
2596	while (*cur != 0) {
2597	if (out - buffer > buffer_size - 100) {
2598	int indx = out - buffer;
2599
2600	growBuffer(buffer);
2601	out = &buffer[indx];
2602	}
2603	out++ = cur++;
2604	}
2605	} else {
2606	unsigned int c;
2607	int bits;
2608
2609	if (out - buffer > buffer_size - 100) {
2610	int indx = out - buffer;
2611
2612	growBuffer(buffer);
2613	out = &buffer[indx];
2614	}
2615	c = ent->value;
2616	if (c < 0x80)
2617	{ *out++ = c; bits= -6; }
2618	else if (c < 0x800)
2619	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
2620	else if (c < 0x10000)
2621	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
2622	else
2623	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
2624
2625	for ( ; bits >= 0; bits-= 6) {
2626	*out++ = ((c >> bits) & 0x3F) \| 0x80;
2627	}
2628	}
2629	}
2630	} else {
2631	unsigned int c;
2632	int bits, l;
2633
2634	if (out - buffer > buffer_size - 100) {
2635	int indx = out - buffer;
2636
2637	growBuffer(buffer);
2638	out = &buffer[indx];
2639	}
2640	c = CUR_CHAR(l);
2641	if (c < 0x80)
2642	{ *out++ = c; bits= -6; }
2643	else if (c < 0x800)
2644	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
2645	else if (c < 0x10000)
2646	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
2647	else
2648	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
2649
2650	for ( ; bits >= 0; bits-= 6) {
2651	*out++ = ((c >> bits) & 0x3F) \| 0x80;
2652	}
2653	NEXT;
2654	}
2655	}
2656	*out = 0;
2657	return(buffer);
2658	}
2659
2660	/**
2661	* htmlParseEntityRef:
2662	* @ctxt: an HTML parser context
2663	* @str: location to store the entity name
2664	*
2665	* parse an HTML ENTITY references
2666	*
2667	* [68] EntityRef ::= '&' Name ';'
2668	*
2669	* Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2670	* if non-NULL *str will have to be freed by the caller.
2671	*/
2672	const htmlEntityDesc *
2673	htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2674	const xmlChar *name;
2675	const htmlEntityDesc * ent = NULL;
2676
2677	if (str != NULL) *str = NULL;
2678	if ((ctxt == NULL) \|\| (ctxt->input == NULL)) return(NULL);
2679
2680	if (CUR == '&') {
2681	NEXT;
2682	name = htmlParseName(ctxt);
2683	if (name == NULL) {
2684	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2685	"htmlParseEntityRef: no name\n", NULL, NULL);
2686	} else {
2687	GROW;
2688	if (CUR == ';') {
2689	if (str != NULL)
2690	*str = name;
2691
2692	/*
2693	* Lookup the entity in the table.
2694	*/
2695	ent = htmlEntityLookup(name);
2696	if (ent != NULL) /* OK that's ugly !!! */
2697	NEXT;
2698	} else {
2699	htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2700	"htmlParseEntityRef: expecting ';'\n",
2701	NULL, NULL);
2702	if (str != NULL)
2703	*str = name;
2704	}
2705	}
2706	}
2707	return(ent);
2708	}
2709
2710	/**
2711	* htmlParseAttValue:
2712	* @ctxt: an HTML parser context
2713	*
2714	* parse a value for an attribute
2715	* Note: the parser won't do substitution of entities here, this
2716	* will be handled later in xmlStringGetNodeList, unless it was
2717	* asked for ctxt->replaceEntities != 0
2718	*
2719	* Returns the AttValue parsed or NULL.
2720	*/
2721
2722	static xmlChar *
2723	htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2724	xmlChar *ret = NULL;
2725
2726	if (CUR == '"') {
2727	NEXT;
2728	ret = htmlParseHTMLAttribute(ctxt, '"');
2729	if (CUR != '"') {
2730	htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2731	"AttValue: \" expected\n", NULL, NULL);
2732	} else
2733	NEXT;
2734	} else if (CUR == '\'') {
2735	NEXT;
2736	ret = htmlParseHTMLAttribute(ctxt, '\'');
2737	if (CUR != '\'') {
2738	htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2739	"AttValue: ' expected\n", NULL, NULL);
2740	} else
2741	NEXT;
2742	} else {
2743	/*
2744	* That's an HTMLism, the attribute value may not be quoted
2745	*/
2746	ret = htmlParseHTMLAttribute(ctxt, 0);
2747	if (ret == NULL) {
2748	htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2749	"AttValue: no value found\n", NULL, NULL);
2750	}
2751	}
2752	return(ret);
2753	}
2754
2755	/**
2756	* htmlParseSystemLiteral:
2757	* @ctxt: an HTML parser context
2758	*
2759	* parse an HTML Literal
2760	*
2761	* [11] SystemLiteral ::= ('"' [^"]* '"') \| ("'" [^']* "'")
2762	*
2763	* Returns the SystemLiteral parsed or NULL
2764	*/
2765
2766	static xmlChar *
2767	htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2768	const xmlChar *q;
2769	xmlChar *ret = NULL;
2770
2771	if (CUR == '"') {
2772	NEXT;
2773	q = CUR_PTR;
2774	while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
2775	NEXT;
2776	if (!IS_CHAR_CH(CUR)) {
2777	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2778	"Unfinished SystemLiteral\n", NULL, NULL);
2779	} else {
2780	ret = xmlStrndup(q, CUR_PTR - q);
2781	NEXT;
2782	}
2783	} else if (CUR == '\'') {
2784	NEXT;
2785	q = CUR_PTR;
2786	while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
2787	NEXT;
2788	if (!IS_CHAR_CH(CUR)) {
2789	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2790	"Unfinished SystemLiteral\n", NULL, NULL);
2791	} else {
2792	ret = xmlStrndup(q, CUR_PTR - q);
2793	NEXT;
2794	}
2795	} else {
2796	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2797	" or ' expected\n", NULL, NULL);
2798	}
2799
2800	return(ret);
2801	}
2802
2803	/**
2804	* htmlParsePubidLiteral:
2805	* @ctxt: an HTML parser context
2806	*
2807	* parse an HTML public literal
2808	*
2809	* [12] PubidLiteral ::= '"' PubidChar* '"' \| "'" (PubidChar - "'")* "'"
2810	*
2811	* Returns the PubidLiteral parsed or NULL.
2812	*/
2813
2814	static xmlChar *
2815	htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2816	const xmlChar *q;
2817	xmlChar *ret = NULL;
2818	/*
2819	* Name ::= (Letter \| '_') (NameChar)*
2820	*/
2821	if (CUR == '"') {
2822	NEXT;
2823	q = CUR_PTR;
2824	while (IS_PUBIDCHAR_CH(CUR)) NEXT;
2825	if (CUR != '"') {
2826	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2827	"Unfinished PubidLiteral\n", NULL, NULL);
2828	} else {
2829	ret = xmlStrndup(q, CUR_PTR - q);
2830	NEXT;
2831	}
2832	} else if (CUR == '\'') {
2833	NEXT;
2834	q = CUR_PTR;
2835	while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
2836	NEXT;
2837	if (CUR != '\'') {
2838	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2839	"Unfinished PubidLiteral\n", NULL, NULL);
2840	} else {
2841	ret = xmlStrndup(q, CUR_PTR - q);
2842	NEXT;
2843	}
2844	} else {
2845	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2846	"PubidLiteral \" or ' expected\n", NULL, NULL);
2847	}
2848
2849	return(ret);
2850	}
2851
2852	/**
2853	* htmlParseScript:
2854	* @ctxt: an HTML parser context
2855	*
2856	* parse the content of an HTML SCRIPT or STYLE element
2857	* http://www.w3.org/TR/html4/sgml/dtd.html#Script
2858	* http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2859	* http://www.w3.org/TR/html4/types.html#type-script
2860	* http://www.w3.org/TR/html4/types.html#h-6.15
2861	* http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2862	*
2863	* Script data ( %Script; in the DTD) can be the content of the SCRIPT
2864	* element and the value of intrinsic event attributes. User agents must
2865	* not evaluate script data as HTML markup but instead must pass it on as
2866	* data to a script engine.
2867	* NOTES:
2868	* - The content is passed like CDATA
2869	* - the attributes for style and scripting "onXXX" are also described
2870	* as CDATA but SGML allows entities references in attributes so their
2871	* processing is identical as other attributes
2872	*/
2873	static void
2874	htmlParseScript(htmlParserCtxtPtr ctxt) {
2875	xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2876	int nbchar = 0;
2877	int cur,l;
2878
2879	SHRINK;
2880	cur = CUR_CHAR(l);
2881	while (IS_CHAR_CH(cur)) {
2882	if ((cur == '<') && (NXT(1) == '/')) {
2883	/*
2884	* One should break here, the specification is clear:
2885	* Authors should therefore escape "</" within the content.
2886	* Escape mechanisms are specific to each scripting or
2887	* style sheet language.
2888	*
2889	* In recovery mode, only break if end tag match the
2890	* current tag, effectively ignoring all tags inside the
2891	* script/style block and treating the entire block as
2892	* CDATA.
2893	*/
2894	if (ctxt->recovery) {
2895	if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
2896	xmlStrlen(ctxt->name)) == 0)
2897	{
2898	break; /* while */
2899	} else {
2900	htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
2901	"Element %s embeds close tag\n",
2902	ctxt->name, NULL);
2903	}
2904	} else {
2905	if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) \|\|
2906	((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2907	{
2908	break; /* while */
2909	}
2910	}
2911	}
2912	COPY_BUF(l,buf,nbchar,cur);
2913	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2914	if (ctxt->sax->cdataBlock!= NULL) {
2915	/*
2916	* Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2917	*/
2918	ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2919	} else if (ctxt->sax->characters != NULL) {
2920	ctxt->sax->characters(ctxt->userData, buf, nbchar);
2921	}
2922	nbchar = 0;
2923	}
2924	GROW;
2925	NEXTL(l);
2926	cur = CUR_CHAR(l);
2927	}
2928
2929	if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
2930	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2931	"Invalid char in CDATA 0x%X\n", cur);
2932	if (ctxt->input->cur < ctxt->input->end) {
2933	NEXT;
2934	}
2935	}
2936
2937	if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2938	if (ctxt->sax->cdataBlock!= NULL) {
2939	/*
2940	* Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2941	*/
2942	ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2943	} else if (ctxt->sax->characters != NULL) {
2944	ctxt->sax->characters(ctxt->userData, buf, nbchar);
2945	}
2946	}
2947	}
2948
2949
2950	/**
2951	* htmlParseCharData:
2952	* @ctxt: an HTML parser context
2953	*
2954	* parse a CharData section.
2955	* if we are within a CDATA section ']]>' marks an end of section.
2956	*
2957	* [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2958	*/
2959
2960	static void
2961	htmlParseCharData(htmlParserCtxtPtr ctxt) {
2962	xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2963	int nbchar = 0;
2964	int cur, l;
2965	int chunk = 0;
2966
2967	SHRINK;
2968	cur = CUR_CHAR(l);
2969	while (((cur != '<') \|\| (ctxt->token == '<')) &&
2970	((cur != '&') \|\| (ctxt->token == '&')) &&
2971	(cur != 0)) {
2972	if (!(IS_CHAR(cur))) {
2973	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2974	"Invalid char in CDATA 0x%X\n", cur);
2975	} else {
2976	COPY_BUF(l,buf,nbchar,cur);
2977	}
2978	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2979	/*
2980	* Ok the segment is to be consumed as chars.
2981	*/
2982	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2983	if (areBlanks(ctxt, buf, nbchar)) {
2984	if (ctxt->keepBlanks) {
2985	if (ctxt->sax->characters != NULL)
2986	ctxt->sax->characters(ctxt->userData, buf, nbchar);
2987	} else {
2988	if (ctxt->sax->ignorableWhitespace != NULL)
2989	ctxt->sax->ignorableWhitespace(ctxt->userData,
2990	buf, nbchar);
2991	}
2992	} else {
2993	htmlCheckParagraph(ctxt);
2994	if (ctxt->sax->characters != NULL)
2995	ctxt->sax->characters(ctxt->userData, buf, nbchar);
2996	}
2997	}
2998	nbchar = 0;
2999	}
3000	NEXTL(l);
3001	chunk++;
3002	if (chunk > HTML_PARSER_BUFFER_SIZE) {
3003	chunk = 0;
3004	SHRINK;
3005	GROW;
3006	}
3007	cur = CUR_CHAR(l);
3008	if (cur == 0) {
3009	SHRINK;
3010	GROW;
3011	cur = CUR_CHAR(l);
3012	}
3013	}
3014	if (nbchar != 0) {
3015	buf[nbchar] = 0;
3016
3017	/*
3018	* Ok the segment is to be consumed as chars.
3019	*/
3020	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3021	if (areBlanks(ctxt, buf, nbchar)) {
3022	if (ctxt->keepBlanks) {
3023	if (ctxt->sax->characters != NULL)
3024	ctxt->sax->characters(ctxt->userData, buf, nbchar);
3025	} else {
3026	if (ctxt->sax->ignorableWhitespace != NULL)
3027	ctxt->sax->ignorableWhitespace(ctxt->userData,
3028	buf, nbchar);
3029	}
3030	} else {
3031	htmlCheckParagraph(ctxt);
3032	if (ctxt->sax->characters != NULL)
3033	ctxt->sax->characters(ctxt->userData, buf, nbchar);
3034	}
3035	}
3036	} else {
3037	/*
3038	* Loop detection
3039	*/
3040	if (cur == 0)
3041	ctxt->instate = XML_PARSER_EOF;
3042	}
3043	}
3044
3045	/**
3046	* htmlParseExternalID:
3047	* @ctxt: an HTML parser context
3048	* @publicID: a xmlChar** receiving PubidLiteral
3049	*
3050	* Parse an External ID or a Public ID
3051	*
3052	* [75] ExternalID ::= 'SYSTEM' S SystemLiteral
3053	* \| 'PUBLIC' S PubidLiteral S SystemLiteral
3054	*
3055	* [83] PublicID ::= 'PUBLIC' S PubidLiteral
3056	*
3057	* Returns the function returns SystemLiteral and in the second
3058	* case publicID receives PubidLiteral, is strict is off
3059	* it is possible to return NULL and have publicID set.
3060	*/
3061
3062	static xmlChar *
3063	htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
3064	xmlChar *URI = NULL;
3065
3066	if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3067	(UPP(2) == 'S') && (UPP(3) == 'T') &&
3068	(UPP(4) == 'E') && (UPP(5) == 'M')) {
3069	SKIP(6);
3070	if (!IS_BLANK_CH(CUR)) {
3071	htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3072	"Space required after 'SYSTEM'\n", NULL, NULL);
3073	}
3074	SKIP_BLANKS;
3075	URI = htmlParseSystemLiteral(ctxt);
3076	if (URI == NULL) {
3077	htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
3078	"htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
3079	}
3080	} else if ((UPPER == 'P') && (UPP(1) == 'U') &&
3081	(UPP(2) == 'B') && (UPP(3) == 'L') &&
3082	(UPP(4) == 'I') && (UPP(5) == 'C')) {
3083	SKIP(6);
3084	if (!IS_BLANK_CH(CUR)) {
3085	htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3086	"Space required after 'PUBLIC'\n", NULL, NULL);
3087	}
3088	SKIP_BLANKS;
3089	*publicID = htmlParsePubidLiteral(ctxt);
3090	if (*publicID == NULL) {
3091	htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
3092	"htmlParseExternalID: PUBLIC, no Public Identifier\n",
3093	NULL, NULL);
3094	}
3095	SKIP_BLANKS;
3096	if ((CUR == '"') \|\| (CUR == '\'')) {
3097	URI = htmlParseSystemLiteral(ctxt);
3098	}
3099	}
3100	return(URI);
3101	}
3102
3103	/**
3104	* xmlParsePI:
3105	* @ctxt: an XML parser context
3106	*
3107	* parse an XML Processing Instruction.
3108	*
3109	* [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
3110	*/
3111	static void
3112	htmlParsePI(htmlParserCtxtPtr ctxt) {
3113	xmlChar *buf = NULL;
3114	int len = 0;
3115	int size = HTML_PARSER_BUFFER_SIZE;
3116	int cur, l;
3117	const xmlChar *target;
3118	xmlParserInputState state;
3119	int count = 0;
3120
3121	if ((RAW == '<') && (NXT(1) == '?')) {
3122	state = ctxt->instate;
3123	ctxt->instate = XML_PARSER_PI;
3124	/*
3125	* this is a Processing Instruction.
3126	*/
3127	SKIP(2);
3128	SHRINK;
3129
3130	/*
3131	* Parse the target name and check for special support like
3132	* namespace.
3133	*/
3134	target = htmlParseName(ctxt);
3135	if (target != NULL) {
3136	if (RAW == '>') {
3137	SKIP(1);
3138
3139	/*
3140	* SAX: PI detected.
3141	*/
3142	if ((ctxt->sax) && (!ctxt->disableSAX) &&
3143	(ctxt->sax->processingInstruction != NULL))
3144	ctxt->sax->processingInstruction(ctxt->userData,
3145	target, NULL);
3146	ctxt->instate = state;
3147	return;
3148	}
3149	buf = (xmlChar ) xmlMallocAtomic(size sizeof(xmlChar));
3150	if (buf == NULL) {
3151	htmlErrMemory(ctxt, NULL);
3152	ctxt->instate = state;
3153	return;
3154	}
3155	cur = CUR;
3156	if (!IS_BLANK(cur)) {
3157	htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3158	"ParsePI: PI %s space expected\n", target, NULL);
3159	}
3160	SKIP_BLANKS;
3161	cur = CUR_CHAR(l);
3162	while (IS_CHAR(cur) && (cur != '>')) {
3163	if (len + 5 >= size) {
3164	xmlChar *tmp;
3165
3166	size *= 2;
3167	tmp = (xmlChar ) xmlRealloc(buf, size sizeof(xmlChar));
3168	if (tmp == NULL) {
3169	htmlErrMemory(ctxt, NULL);
3170	xmlFree(buf);
3171	ctxt->instate = state;
3172	return;
3173	}
3174	buf = tmp;
3175	}
3176	count++;
3177	if (count > 50) {
3178	GROW;
3179	count = 0;
3180	}
3181	COPY_BUF(l,buf,len,cur);
3182	NEXTL(l);
3183	cur = CUR_CHAR(l);
3184	if (cur == 0) {
3185	SHRINK;
3186	GROW;
3187	cur = CUR_CHAR(l);
3188	}
3189	}
3190	buf[len] = 0;
3191	if (cur != '>') {
3192	htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3193	"ParsePI: PI %s never end ...\n", target, NULL);
3194	} else {
3195	SKIP(1);
3196
3197	/*
3198	* SAX: PI detected.
3199	*/
3200	if ((ctxt->sax) && (!ctxt->disableSAX) &&
3201	(ctxt->sax->processingInstruction != NULL))
3202	ctxt->sax->processingInstruction(ctxt->userData,
3203	target, buf);
3204	}
3205	xmlFree(buf);
3206	} else {
3207	htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
3208	"PI is not started correctly", NULL, NULL);
3209	}
3210	ctxt->instate = state;
3211	}
3212	}
3213
3214	/**
3215	* htmlParseComment:
3216	* @ctxt: an HTML parser context
3217	*
3218	* Parse an XML (SGML) comment <!-- .... -->
3219	*
3220	* [15] Comment ::= '<!--' ((Char - '-') \| ('-' (Char - '-')))* '-->'
3221	*/
3222	static void
3223	htmlParseComment(htmlParserCtxtPtr ctxt) {
3224	xmlChar *buf = NULL;
3225	int len;
3226	int size = HTML_PARSER_BUFFER_SIZE;
3227	int q, ql;
3228	int r, rl;
3229	int cur, l;
3230	xmlParserInputState state;
3231
3232	/*
3233	* Check that there is a comment right here.
3234	*/
3235	if ((RAW != '<') \|\| (NXT(1) != '!') \|\|
3236	(NXT(2) != '-') \|\| (NXT(3) != '-')) return;
3237
3238	state = ctxt->instate;
3239	ctxt->instate = XML_PARSER_COMMENT;
3240	SHRINK;
3241	SKIP(4);
3242	buf = (xmlChar ) xmlMallocAtomic(size sizeof(xmlChar));
3243	if (buf == NULL) {
3244	htmlErrMemory(ctxt, "buffer allocation failed\n");
3245	ctxt->instate = state;
3246	return;
3247	}
3248	q = CUR_CHAR(ql);
3249	NEXTL(ql);
3250	r = CUR_CHAR(rl);
3251	NEXTL(rl);
3252	cur = CUR_CHAR(l);
3253	len = 0;
3254	while (IS_CHAR(cur) &&
3255	((cur != '>') \|\|
3256	(r != '-') \|\| (q != '-'))) {
3257	if (len + 5 >= size) {
3258	xmlChar *tmp;
3259
3260	size *= 2;
3261	tmp = (xmlChar ) xmlRealloc(buf, size sizeof(xmlChar));
3262	if (tmp == NULL) {
3263	xmlFree(buf);
3264	htmlErrMemory(ctxt, "growing buffer failed\n");
3265	ctxt->instate = state;
3266	return;
3267	}
3268	buf = tmp;
3269	}
3270	COPY_BUF(ql,buf,len,q);
3271	q = r;
3272	ql = rl;
3273	r = cur;
3274	rl = l;
3275	NEXTL(l);
3276	cur = CUR_CHAR(l);
3277	if (cur == 0) {
3278	SHRINK;
3279	GROW;
3280	cur = CUR_CHAR(l);
3281	}
3282	}
3283	buf[len] = 0;
3284	if (!IS_CHAR(cur)) {
3285	htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3286	"Comment not terminated \n<!--%.50s\n", buf, NULL);
3287	xmlFree(buf);
3288	} else {
3289	NEXT;
3290	if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3291	(!ctxt->disableSAX))
3292	ctxt->sax->comment(ctxt->userData, buf);
3293	xmlFree(buf);
3294	}
3295	ctxt->instate = state;
3296	}
3297
3298	/**
3299	* htmlParseCharRef:
3300	* @ctxt: an HTML parser context
3301	*
3302	* parse Reference declarations
3303	*
3304	* [66] CharRef ::= '&#' [0-9]+ ';' \|
3305	* '&#x' [0-9a-fA-F]+ ';'
3306	*
3307	* Returns the value parsed (as an int)
3308	*/
3309	int
3310	htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3311	int val = 0;
3312
3313	if ((ctxt == NULL) \|\| (ctxt->input == NULL)) {
3314	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3315	"htmlParseCharRef: context error\n",
3316	NULL, NULL);
3317	return(0);
3318	}
3319	if ((CUR == '&') && (NXT(1) == '#') &&
3320	((NXT(2) == 'x') \|\| NXT(2) == 'X')) {
3321	SKIP(3);
3322	while (CUR != ';') {
3323	if ((CUR >= '0') && (CUR <= '9'))
3324	val = val * 16 + (CUR - '0');
3325	else if ((CUR >= 'a') && (CUR <= 'f'))
3326	val = val * 16 + (CUR - 'a') + 10;
3327	else if ((CUR >= 'A') && (CUR <= 'F'))
3328	val = val * 16 + (CUR - 'A') + 10;
3329	else {
3330	htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3331	"htmlParseCharRef: missing semicolon\n",
3332	NULL, NULL);
3333	break;
3334	}
3335	NEXT;
3336	}
3337	if (CUR == ';')
3338	NEXT;
3339	} else if ((CUR == '&') && (NXT(1) == '#')) {
3340	SKIP(2);
3341	while (CUR != ';') {
3342	if ((CUR >= '0') && (CUR <= '9'))
3343	val = val * 10 + (CUR - '0');
3344	else {
3345	htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3346	"htmlParseCharRef: missing semicolon\n",
3347	NULL, NULL);
3348	break;
3349	}
3350	NEXT;
3351	}
3352	if (CUR == ';')
3353	NEXT;
3354	} else {
3355	htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3356	"htmlParseCharRef: invalid value\n", NULL, NULL);
3357	}
3358	/*
3359	* Check the value IS_CHAR ...
3360	*/
3361	if (IS_CHAR(val)) {
3362	return(val);
3363	} else {
3364	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3365	"htmlParseCharRef: invalid xmlChar value %d\n",
3366	val);
3367	}
3368	return(0);
3369	}
3370
3371
3372	/**
3373	* htmlParseDocTypeDecl:
3374	* @ctxt: an HTML parser context
3375	*
3376	* parse a DOCTYPE declaration
3377	*
3378	* [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3379	* ('[' (markupdecl \| PEReference \| S)* ']' S?)? '>'
3380	*/
3381
3382	static void
3383	htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3384	const xmlChar *name;
3385	xmlChar *ExternalID = NULL;
3386	xmlChar *URI = NULL;
3387
3388	/*
3389	* We know that '<!DOCTYPE' has been detected.
3390	*/
3391	SKIP(9);
3392
3393	SKIP_BLANKS;
3394
3395	/*
3396	* Parse the DOCTYPE name.
3397	*/
3398	name = htmlParseName(ctxt);
3399	if (name == NULL) {
3400	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3401	"htmlParseDocTypeDecl : no DOCTYPE name !\n",
3402	NULL, NULL);
3403	}
3404	/*
3405	* Check that upper(name) == "HTML" !!!!!!!!!!!!!
3406	*/
3407
3408	SKIP_BLANKS;
3409
3410	/*
3411	* Check for SystemID and ExternalID
3412	*/
3413	URI = htmlParseExternalID(ctxt, &ExternalID);
3414	SKIP_BLANKS;
3415
3416	/*
3417	* We should be at the end of the DOCTYPE declaration.
3418	*/
3419	if (CUR != '>') {
3420	htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3421	"DOCTYPE improperly terminated\n", NULL, NULL);
3422	/* We shouldn't try to resynchronize ... */
3423	}
3424	NEXT;
3425
3426	/*
3427	* Create or update the document accordingly to the DOCTYPE
3428	*/
3429	if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3430	(!ctxt->disableSAX))
3431	ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3432
3433	/*
3434	* Cleanup, since we don't use all those identifiers
3435	*/
3436	if (URI != NULL) xmlFree(URI);
3437	if (ExternalID != NULL) xmlFree(ExternalID);
3438	}
3439
3440	/**
3441	* htmlParseAttribute:
3442	* @ctxt: an HTML parser context
3443	* @value: a xmlChar ** used to store the value of the attribute
3444	*
3445	* parse an attribute
3446	*
3447	* [41] Attribute ::= Name Eq AttValue
3448	*
3449	* [25] Eq ::= S? '=' S?
3450	*
3451	* With namespace:
3452	*
3453	* [NS 11] Attribute ::= QName Eq AttValue
3454	*
3455	* Also the case QName == xmlns:??? is handled independently as a namespace
3456	* definition.
3457	*
3458	* Returns the attribute name, and the value in *value.
3459	*/
3460
3461	static const xmlChar *
3462	htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3463	const xmlChar *name;
3464	xmlChar *val = NULL;
3465
3466	*value = NULL;
3467	name = htmlParseHTMLName(ctxt);
3468	if (name == NULL) {
3469	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3470	"error parsing attribute name\n", NULL, NULL);
3471	return(NULL);
3472	}
3473
3474	/*
3475	* read the value
3476	*/
3477	SKIP_BLANKS;
3478	if (CUR == '=') {
3479	NEXT;
3480	SKIP_BLANKS;
3481	val = htmlParseAttValue(ctxt);
3482	}
3483
3484	*value = val;
3485	return(name);
3486	}
3487
3488	/**
3489	* htmlCheckEncodingDirect:
3490	* @ctxt: an HTML parser context
3491	* @attvalue: the attribute value
3492	*
3493	* Checks an attribute value to detect
3494	* the encoding
3495	* If a new encoding is detected the parser is switched to decode
3496	* it and pass UTF8
3497	*/
3498	static void
3499	htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) {
3500
3501	if ((ctxt == NULL) \|\| (encoding == NULL) \|\|
3502	(ctxt->options & HTML_PARSE_IGNORE_ENC))
3503	return;
3504
3505	/* do not change encoding */
3506	if (ctxt->input->encoding != NULL)
3507	return;
3508
3509	if (encoding != NULL) {
3510	xmlCharEncoding enc;
3511	xmlCharEncodingHandlerPtr handler;
3512
3513	while ((encoding == ' ') \|\| (encoding == '\t')) encoding++;
3514
3515	if (ctxt->input->encoding != NULL)
3516	xmlFree((xmlChar *) ctxt->input->encoding);
3517	ctxt->input->encoding = xmlStrdup(encoding);
3518
3519	enc = xmlParseCharEncoding((const char *) encoding);
3520	/*
3521	* registered set of known encodings
3522	*/
3523	if (enc != XML_CHAR_ENCODING_ERROR) {
3524	if (((enc == XML_CHAR_ENCODING_UTF16LE) \|\|
3525	(enc == XML_CHAR_ENCODING_UTF16BE) \|\|
3526	(enc == XML_CHAR_ENCODING_UCS4LE) \|\|
3527	(enc == XML_CHAR_ENCODING_UCS4BE)) &&
3528	(ctxt->input->buf != NULL) &&
3529	(ctxt->input->buf->encoder == NULL)) {
3530	htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3531	"htmlCheckEncoding: wrong encoding meta\n",
3532	NULL, NULL);
3533	} else {
3534	xmlSwitchEncoding(ctxt, enc);
3535	}
3536	ctxt->charset = XML_CHAR_ENCODING_UTF8;
3537	} else {
3538	/*
3539	* fallback for unknown encodings
3540	*/
3541	handler = xmlFindCharEncodingHandler((const char *) encoding);
3542	if (handler != NULL) {
3543	xmlSwitchToEncoding(ctxt, handler);
3544	ctxt->charset = XML_CHAR_ENCODING_UTF8;
3545	} else {
3546	htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
3547	"htmlCheckEncoding: unknown encoding %s\n",
3548	encoding, NULL);
3549	}
3550	}
3551
3552	if ((ctxt->input->buf != NULL) &&
3553	(ctxt->input->buf->encoder != NULL) &&
3554	(ctxt->input->buf->raw != NULL) &&
3555	(ctxt->input->buf->buffer != NULL)) {
3556	int nbchars;
3557	int processed;
3558
3559	/*
3560	* convert as much as possible to the parser reading buffer.
3561	*/
3562	processed = ctxt->input->cur - ctxt->input->base;
3563	xmlBufShrink(ctxt->input->buf->buffer, processed);
3564	nbchars = xmlCharEncInput(ctxt->input->buf, 1);
3565	if (nbchars < 0) {
3566	htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3567	"htmlCheckEncoding: encoder error\n",
3568	NULL, NULL);
3569	}
3570	xmlBufResetInput(ctxt->input->buf->buffer, ctxt->input);
3571	}
3572	}
3573	}
3574
3575	/**
3576	* htmlCheckEncoding:
3577	* @ctxt: an HTML parser context
3578	* @attvalue: the attribute value
3579	*
3580	* Checks an http-equiv attribute from a Meta tag to detect
3581	* the encoding
3582	* If a new encoding is detected the parser is switched to decode
3583	* it and pass UTF8
3584	*/
3585	static void
3586	htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3587	const xmlChar *encoding;
3588
3589	if (!attvalue)
3590	return;
3591
3592	encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
3593	if (encoding != NULL) {
3594	encoding += 7;
3595	}
3596	/*
3597	* skip blank
3598	*/
3599	if (encoding && IS_BLANK_CH(*encoding))
3600	encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
3601	if (encoding && *encoding == '=') {
3602	encoding ++;
3603	htmlCheckEncodingDirect(ctxt, encoding);
3604	}
3605	}
3606
3607	/**
3608	* htmlCheckMeta:
3609	* @ctxt: an HTML parser context
3610	* @atts: the attributes values
3611	*
3612	* Checks an attributes from a Meta tag
3613	*/
3614	static void
3615	htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3616	int i;
3617	const xmlChar att, value;
3618	int http = 0;
3619	const xmlChar *content = NULL;
3620
3621	if ((ctxt == NULL) \|\| (atts == NULL))
3622	return;
3623
3624	i = 0;
3625	att = atts[i++];
3626	while (att != NULL) {
3627	value = atts[i++];
3628	if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3629	&& (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3630	http = 1;
3631	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset")))
3632	htmlCheckEncodingDirect(ctxt, value);
3633	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3634	content = value;
3635	att = atts[i++];
3636	}
3637	if ((http) && (content != NULL))
3638	htmlCheckEncoding(ctxt, content);
3639
3640	}
3641
3642	/**
3643	* htmlParseStartTag:
3644	* @ctxt: an HTML parser context
3645	*
3646	* parse a start of tag either for rule element or
3647	* EmptyElement. In both case we don't parse the tag closing chars.
3648	*
3649	* [40] STag ::= '<' Name (S Attribute)* S? '>'
3650	*
3651	* [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3652	*
3653	* With namespace:
3654	*
3655	* [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3656	*
3657	* [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3658	*
3659	* Returns 0 in case of success, -1 in case of error and 1 if discarded
3660	*/
3661
3662	static int
3663	htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3664	const xmlChar *name;
3665	const xmlChar *attname;
3666	xmlChar *attvalue;
3667	const xmlChar **atts;
3668	int nbatts = 0;
3669	int maxatts;
3670	int meta = 0;
3671	int i;
3672	int discardtag = 0;
3673
3674	if ((ctxt == NULL) \|\| (ctxt->input == NULL)) {
3675	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3676	"htmlParseStartTag: context error\n", NULL, NULL);
3677	return -1;
3678	}
3679	if (ctxt->instate == XML_PARSER_EOF)
3680	return(-1);
3681	if (CUR != '<') return -1;
3682	NEXT;
3683
3684	atts = ctxt->atts;
3685	maxatts = ctxt->maxatts;
3686
3687	GROW;
3688	name = htmlParseHTMLName(ctxt);
3689	if (name == NULL) {
3690	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3691	"htmlParseStartTag: invalid element name\n",
3692	NULL, NULL);
3693	/* Dump the bogus tag like browsers do */
3694	while ((IS_CHAR_CH(CUR)) && (CUR != '>') &&
3695	(ctxt->instate != XML_PARSER_EOF))
3696	NEXT;
3697	return -1;
3698	}
3699	if (xmlStrEqual(name, BAD_CAST"meta"))
3700	meta = 1;
3701
3702	/*
3703	* Check for auto-closure of HTML elements.
3704	*/
3705	htmlAutoClose(ctxt, name);
3706
3707	/*
3708	* Check for implied HTML elements.
3709	*/
3710	htmlCheckImplied(ctxt, name);
3711
3712	/*
3713	* Avoid html at any level > 0, head at any level != 1
3714	* or any attempt to recurse body
3715	*/
3716	if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3717	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3718	"htmlParseStartTag: misplaced <html> tag\n",
3719	name, NULL);
3720	discardtag = 1;
3721	ctxt->depth++;
3722	}
3723	if ((ctxt->nameNr != 1) &&
3724	(xmlStrEqual(name, BAD_CAST"head"))) {
3725	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3726	"htmlParseStartTag: misplaced <head> tag\n",
3727	name, NULL);
3728	discardtag = 1;
3729	ctxt->depth++;
3730	}
3731	if (xmlStrEqual(name, BAD_CAST"body")) {
3732	int indx;
3733	for (indx = 0;indx < ctxt->nameNr;indx++) {
3734	if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
3735	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3736	"htmlParseStartTag: misplaced <body> tag\n",
3737	name, NULL);
3738	discardtag = 1;
3739	ctxt->depth++;
3740	}
3741	}
3742	}
3743
3744	/*
3745	* Now parse the attributes, it ends up with the ending
3746	*
3747	* (S Attribute)* S?
3748	*/
3749	SKIP_BLANKS;
3750	while ((IS_CHAR_CH(CUR)) &&
3751	(CUR != '>') &&
3752	((CUR != '/') \|\| (NXT(1) != '>'))) {
3753	long cons = ctxt->nbChars;
3754
3755	GROW;
3756	attname = htmlParseAttribute(ctxt, &attvalue);
3757	if (attname != NULL) {
3758
3759	/*
3760	* Well formedness requires at most one declaration of an attribute
3761	*/
3762	for (i = 0; i < nbatts;i += 2) {
3763	if (xmlStrEqual(atts[i], attname)) {
3764	htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3765	"Attribute %s redefined\n", attname, NULL);
3766	if (attvalue != NULL)
3767	xmlFree(attvalue);
3768	goto failed;
3769	}
3770	}
3771
3772	/*
3773	* Add the pair to atts
3774	*/
3775	if (atts == NULL) {
3776	maxatts = 22; /* allow for 10 attrs by default */
3777	atts = (const xmlChar **)
3778	xmlMalloc(maxatts * sizeof(xmlChar *));
3779	if (atts == NULL) {
3780	htmlErrMemory(ctxt, NULL);
3781	if (attvalue != NULL)
3782	xmlFree(attvalue);
3783	goto failed;
3784	}
3785	ctxt->atts = atts;
3786	ctxt->maxatts = maxatts;
3787	} else if (nbatts + 4 > maxatts) {
3788	const xmlChar **n;
3789
3790	maxatts *= 2;
3791	n = (const xmlChar *) xmlRealloc((void ) atts,
3792	maxatts * sizeof(const xmlChar *));
3793	if (n == NULL) {
3794	htmlErrMemory(ctxt, NULL);
3795	if (attvalue != NULL)
3796	xmlFree(attvalue);
3797	goto failed;
3798	}
3799	atts = n;
3800	ctxt->atts = atts;
3801	ctxt->maxatts = maxatts;
3802	}
3803	atts[nbatts++] = attname;
3804	atts[nbatts++] = attvalue;
3805	atts[nbatts] = NULL;
3806	atts[nbatts + 1] = NULL;
3807	}
3808	else {
3809	if (attvalue != NULL)
3810	xmlFree(attvalue);
3811	/* Dump the bogus attribute string up to the next blank or
3812	* the end of the tag. */
3813	while ((IS_CHAR_CH(CUR)) &&
3814	!(IS_BLANK_CH(CUR)) && (CUR != '>') &&
3815	((CUR != '/') \|\| (NXT(1) != '>')))
3816	NEXT;
3817	}
3818
3819	failed:
3820	SKIP_BLANKS;
3821	if (cons == ctxt->nbChars) {
3822	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3823	"htmlParseStartTag: problem parsing attributes\n",
3824	NULL, NULL);
3825	break;
3826	}
3827	}
3828
3829	/*
3830	* Handle specific association to the META tag
3831	*/
3832	if (meta && (nbatts != 0))
3833	htmlCheckMeta(ctxt, atts);
3834
3835	/*
3836	* SAX: Start of Element !
3837	*/
3838	if (!discardtag) {
3839	htmlnamePush(ctxt, name);
3840	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3841	if (nbatts != 0)
3842	ctxt->sax->startElement(ctxt->userData, name, atts);
3843	else
3844	ctxt->sax->startElement(ctxt->userData, name, NULL);
3845	}
3846	}
3847
3848	if (atts != NULL) {
3849	for (i = 1;i < nbatts;i += 2) {
3850	if (atts[i] != NULL)
3851	xmlFree((xmlChar *) atts[i]);
3852	}
3853	}
3854
3855	return(discardtag);
3856	}
3857
3858	/**
3859	* htmlParseEndTag:
3860	* @ctxt: an HTML parser context
3861	*
3862	* parse an end of tag
3863	*
3864	* [42] ETag ::= '</' Name S? '>'
3865	*
3866	* With namespace
3867	*
3868	* [NS 9] ETag ::= '</' QName S? '>'
3869	*
3870	* Returns 1 if the current level should be closed.
3871	*/
3872
3873	static int
3874	htmlParseEndTag(htmlParserCtxtPtr ctxt)
3875	{
3876	const xmlChar *name;
3877	const xmlChar *oldname;
3878	int i, ret;
3879
3880	if ((CUR != '<') \|\| (NXT(1) != '/')) {
3881	htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3882	"htmlParseEndTag: '</' not found\n", NULL, NULL);
3883	return (0);
3884	}
3885	SKIP(2);
3886
3887	name = htmlParseHTMLName(ctxt);
3888	if (name == NULL)
3889	return (0);
3890	/*
3891	* We should definitely be at the ending "S? '>'" part
3892	*/
3893	SKIP_BLANKS;
3894	if ((!IS_CHAR_CH(CUR)) \|\| (CUR != '>')) {
3895	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3896	"End tag : expected '>'\n", NULL, NULL);
3897	if (ctxt->recovery) {
3898	/*
3899	* We're not at the ending > !!
3900	* Error, unless in recover mode where we search forwards
3901	* until we find a >
3902	*/
3903	while (CUR != '\0' && CUR != '>') NEXT;
3904	NEXT;
3905	}
3906	} else
3907	NEXT;
3908
3909	/*
3910	* if we ignored misplaced tags in htmlParseStartTag don't pop them
3911	* out now.
3912	*/
3913	if ((ctxt->depth > 0) &&
3914	(xmlStrEqual(name, BAD_CAST "html") \|\|
3915	xmlStrEqual(name, BAD_CAST "body") \|\|
3916	xmlStrEqual(name, BAD_CAST "head"))) {
3917	ctxt->depth--;
3918	return (0);
3919	}
3920
3921	/*
3922	* If the name read is not one of the element in the parsing stack
3923	* then return, it's just an error.
3924	*/
3925	for (i = (ctxt->nameNr - 1); i >= 0; i--) {
3926	if (xmlStrEqual(name, ctxt->nameTab[i]))
3927	break;
3928	}
3929	if (i < 0) {
3930	htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3931	"Unexpected end tag : %s\n", name, NULL);
3932	return (0);
3933	}
3934
3935
3936	/*
3937	* Check for auto-closure of HTML elements.
3938	*/
3939
3940	htmlAutoCloseOnClose(ctxt, name);
3941
3942	/*
3943	* Well formedness constraints, opening and closing must match.
3944	* With the exception that the autoclose may have popped stuff out
3945	* of the stack.
3946	*/
3947	if (!xmlStrEqual(name, ctxt->name)) {
3948	if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
3949	htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3950	"Opening and ending tag mismatch: %s and %s\n",
3951	name, ctxt->name);
3952	}
3953	}
3954
3955	/*
3956	* SAX: End of Tag
3957	*/
3958	oldname = ctxt->name;
3959	if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
3960	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3961	ctxt->sax->endElement(ctxt->userData, name);
3962	htmlNodeInfoPop(ctxt);
3963	htmlnamePop(ctxt);
3964	ret = 1;
3965	} else {
3966	ret = 0;
3967	}
3968
3969	return (ret);
3970	}
3971
3972
3973	/**
3974	* htmlParseReference:
3975	* @ctxt: an HTML parser context
3976	*
3977	* parse and handle entity references in content,
3978	* this will end-up in a call to character() since this is either a
3979	* CharRef, or a predefined entity.
3980	*/
3981	static void
3982	htmlParseReference(htmlParserCtxtPtr ctxt) {
3983	const htmlEntityDesc * ent;
3984	xmlChar out[6];
3985	const xmlChar *name;
3986	if (CUR != '&') return;
3987
3988	if (NXT(1) == '#') {
3989	unsigned int c;
3990	int bits, i = 0;
3991
3992	c = htmlParseCharRef(ctxt);
3993	if (c == 0)
3994	return;
3995
3996	if (c < 0x80) { out[i++]= c; bits= -6; }
3997	else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
3998	else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
3999	else { out[i++]=((c >> 18) & 0x07) \| 0xF0; bits= 12; }
4000
4001	for ( ; bits >= 0; bits-= 6) {
4002	out[i++]= ((c >> bits) & 0x3F) \| 0x80;
4003	}
4004	out[i] = 0;
4005
4006	htmlCheckParagraph(ctxt);
4007	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4008	ctxt->sax->characters(ctxt->userData, out, i);
4009	} else {
4010	ent = htmlParseEntityRef(ctxt, &name);
4011	if (name == NULL) {
4012	htmlCheckParagraph(ctxt);
4013	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4014	ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4015	return;
4016	}
4017	if ((ent == NULL) \|\| !(ent->value > 0)) {
4018	htmlCheckParagraph(ctxt);
4019	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
4020	ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4021	ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
4022	/* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
4023	}
4024	} else {
4025	unsigned int c;
4026	int bits, i = 0;
4027
4028	c = ent->value;
4029	if (c < 0x80)
4030	{ out[i++]= c; bits= -6; }
4031	else if (c < 0x800)
4032	{ out[i++]=((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
4033	else if (c < 0x10000)
4034	{ out[i++]=((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
4035	else
4036	{ out[i++]=((c >> 18) & 0x07) \| 0xF0; bits= 12; }
4037
4038	for ( ; bits >= 0; bits-= 6) {
4039	out[i++]= ((c >> bits) & 0x3F) \| 0x80;
4040	}
4041	out[i] = 0;
4042
4043	htmlCheckParagraph(ctxt);
4044	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4045	ctxt->sax->characters(ctxt->userData, out, i);
4046	}
4047	}
4048	}
4049
4050	/**
4051	* htmlParseContent:
4052	* @ctxt: an HTML parser context
4053	*
4054	* Parse a content: comment, sub-element, reference or text.
4055	* Kept for compatibility with old code
4056	*/
4057
4058	static void
4059	htmlParseContent(htmlParserCtxtPtr ctxt) {
4060	xmlChar *currentNode;
4061	int depth;
4062	const xmlChar *name;
4063
4064	currentNode = xmlStrdup(ctxt->name);
4065	depth = ctxt->nameNr;
4066	while (1) {
4067	long cons = ctxt->nbChars;
4068
4069	GROW;
4070
4071	if (ctxt->instate == XML_PARSER_EOF)
4072	break;
4073
4074	/*
4075	* Our tag or one of it's parent or children is ending.
4076	*/
4077	if ((CUR == '<') && (NXT(1) == '/')) {
4078	if (htmlParseEndTag(ctxt) &&
4079	((currentNode != NULL) \|\| (ctxt->nameNr == 0))) {
4080	if (currentNode != NULL)
4081	xmlFree(currentNode);
4082	return;
4083	}
4084	continue; /* while */
4085	}
4086
4087	else if ((CUR == '<') &&
4088	((IS_ASCII_LETTER(NXT(1))) \|\|
4089	(NXT(1) == '_') \|\| (NXT(1) == ':'))) {
4090	name = htmlParseHTMLName_nonInvasive(ctxt);
4091	if (name == NULL) {
4092	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4093	"htmlParseStartTag: invalid element name\n",
4094	NULL, NULL);
4095	/* Dump the bogus tag like browsers do */
4096	while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
4097	NEXT;
4098
4099	if (currentNode != NULL)
4100	xmlFree(currentNode);
4101	return;
4102	}
4103
4104	if (ctxt->name != NULL) {
4105	if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4106	htmlAutoClose(ctxt, name);
4107	continue;
4108	}
4109	}
4110	}
4111
4112	/*
4113	* Has this node been popped out during parsing of
4114	* the next element
4115	*/
4116	if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4117	(!xmlStrEqual(currentNode, ctxt->name)))
4118	{
4119	if (currentNode != NULL) xmlFree(currentNode);
4120	return;
4121	}
4122
4123	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) \|\|
4124	(xmlStrEqual(currentNode, BAD_CAST"style")))) {
4125	/*
4126	* Handle SCRIPT/STYLE separately
4127	*/
4128	htmlParseScript(ctxt);
4129	} else {
4130	/*
4131	* Sometimes DOCTYPE arrives in the middle of the document
4132	*/
4133	if ((CUR == '<') && (NXT(1) == '!') &&
4134	(UPP(2) == 'D') && (UPP(3) == 'O') &&
4135	(UPP(4) == 'C') && (UPP(5) == 'T') &&
4136	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
4137	(UPP(8) == 'E')) {
4138	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4139	"Misplaced DOCTYPE declaration\n",
4140	BAD_CAST "DOCTYPE" , NULL);
4141	htmlParseDocTypeDecl(ctxt);
4142	}
4143
4144	/*
4145	* First case : a comment
4146	*/
4147	if ((CUR == '<') && (NXT(1) == '!') &&
4148	(NXT(2) == '-') && (NXT(3) == '-')) {
4149	htmlParseComment(ctxt);
4150	}
4151
4152	/*
4153	* Second case : a Processing Instruction.
4154	*/
4155	else if ((CUR == '<') && (NXT(1) == '?')) {
4156	htmlParsePI(ctxt);
4157	}
4158
4159	/*
4160	* Third case : a sub-element.
4161	*/
4162	else if (CUR == '<') {
4163	htmlParseElement(ctxt);
4164	}
4165
4166	/*
4167	* Fourth case : a reference. If if has not been resolved,
4168	* parsing returns it's Name, create the node
4169	*/
4170	else if (CUR == '&') {
4171	htmlParseReference(ctxt);
4172	}
4173
4174	/*
4175	* Fifth case : end of the resource
4176	*/
4177	else if (CUR == 0) {
4178	htmlAutoCloseOnEnd(ctxt);
4179	break;
4180	}
4181
4182	/*
4183	* Last case, text. Note that References are handled directly.
4184	*/
4185	else {
4186	htmlParseCharData(ctxt);
4187	}
4188
4189	if (cons == ctxt->nbChars) {
4190	if (ctxt->node != NULL) {
4191	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4192	"detected an error in element content\n",
4193	NULL, NULL);
4194	}
4195	break;
4196	}
4197	}
4198	GROW;
4199	}
4200	if (currentNode != NULL) xmlFree(currentNode);
4201	}
4202
4203	/**
4204	* htmlParseElement:
4205	* @ctxt: an HTML parser context
4206	*
4207	* parse an HTML element, this is highly recursive
4208	* this is kept for compatibility with previous code versions
4209	*
4210	* [39] element ::= EmptyElemTag \| STag content ETag
4211	*
4212	* [41] Attribute ::= Name Eq AttValue
4213	*/
4214
4215	void
4216	htmlParseElement(htmlParserCtxtPtr ctxt) {
4217	const xmlChar *name;
4218	xmlChar *currentNode = NULL;
4219	const htmlElemDesc * info;
4220	htmlParserNodeInfo node_info;
4221	int failed;
4222	int depth;
4223	const xmlChar *oldptr;
4224
4225	if ((ctxt == NULL) \|\| (ctxt->input == NULL)) {
4226	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4227	"htmlParseElement: context error\n", NULL, NULL);
4228	return;
4229	}
4230
4231	if (ctxt->instate == XML_PARSER_EOF)
4232	return;
4233
4234	/* Capture start position */
4235	if (ctxt->record_info) {
4236	node_info.begin_pos = ctxt->input->consumed +
4237	(CUR_PTR - ctxt->input->base);
4238	node_info.begin_line = ctxt->input->line;
4239	}
4240
4241	failed = htmlParseStartTag(ctxt);
4242	name = ctxt->name;
4243	if ((failed == -1) \|\| (name == NULL)) {
4244	if (CUR == '>')
4245	NEXT;
4246	return;
4247	}
4248
4249	/*
4250	* Lookup the info for that element.
4251	*/
4252	info = htmlTagLookup(name);
4253	if (info == NULL) {
4254	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4255	"Tag %s invalid\n", name, NULL);
4256	}
4257
4258	/*
4259	* Check for an Empty Element labeled the XML/SGML way
4260	*/
4261	if ((CUR == '/') && (NXT(1) == '>')) {
4262	SKIP(2);
4263	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4264	ctxt->sax->endElement(ctxt->userData, name);
4265	htmlnamePop(ctxt);
4266	return;
4267	}
4268
4269	if (CUR == '>') {
4270	NEXT;
4271	} else {
4272	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4273	"Couldn't find end of Start Tag %s\n", name, NULL);
4274
4275	/*
4276	* end of parsing of this node.
4277	*/
4278	if (xmlStrEqual(name, ctxt->name)) {
4279	nodePop(ctxt);
4280	htmlnamePop(ctxt);
4281	}
4282
4283	/*
4284	* Capture end position and add node
4285	*/
4286	if (ctxt->record_info) {
4287	node_info.end_pos = ctxt->input->consumed +
4288	(CUR_PTR - ctxt->input->base);
4289	node_info.end_line = ctxt->input->line;
4290	node_info.node = ctxt->node;
4291	xmlParserAddNodeInfo(ctxt, &node_info);
4292	}
4293	return;
4294	}
4295
4296	/*
4297	* Check for an Empty Element from DTD definition
4298	*/
4299	if ((info != NULL) && (info->empty)) {
4300	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4301	ctxt->sax->endElement(ctxt->userData, name);
4302	htmlnamePop(ctxt);
4303	return;
4304	}
4305
4306	/*
4307	* Parse the content of the element:
4308	*/
4309	currentNode = xmlStrdup(ctxt->name);
4310	depth = ctxt->nameNr;
4311	while (IS_CHAR_CH(CUR)) {
4312	oldptr = ctxt->input->cur;
4313	htmlParseContent(ctxt);
4314	if (oldptr==ctxt->input->cur) break;
4315	if (ctxt->nameNr < depth) break;
4316	}
4317
4318	/*
4319	* Capture end position and add node
4320	*/
4321	if ( currentNode != NULL && ctxt->record_info ) {
4322	node_info.end_pos = ctxt->input->consumed +
4323	(CUR_PTR - ctxt->input->base);
4324	node_info.end_line = ctxt->input->line;
4325	node_info.node = ctxt->node;
4326	xmlParserAddNodeInfo(ctxt, &node_info);
4327	}
4328	if (!IS_CHAR_CH(CUR)) {
4329	htmlAutoCloseOnEnd(ctxt);
4330	}
4331
4332	if (currentNode != NULL)
4333	xmlFree(currentNode);
4334	}
4335
4336	static void
4337	htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
4338	/*
4339	* Capture end position and add node
4340	*/
4341	if ( ctxt->node != NULL && ctxt->record_info ) {
4342	ctxt->nodeInfo->end_pos = ctxt->input->consumed +
4343	(CUR_PTR - ctxt->input->base);
4344	ctxt->nodeInfo->end_line = ctxt->input->line;
4345	ctxt->nodeInfo->node = ctxt->node;
4346	xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
4347	htmlNodeInfoPop(ctxt);
4348	}
4349	if (!IS_CHAR_CH(CUR)) {
4350	htmlAutoCloseOnEnd(ctxt);
4351	}
4352	}
4353
4354	/**
4355	* htmlParseElementInternal:
4356	* @ctxt: an HTML parser context
4357	*
4358	* parse an HTML element, new version, non recursive
4359	*
4360	* [39] element ::= EmptyElemTag \| STag content ETag
4361	*
4362	* [41] Attribute ::= Name Eq AttValue
4363	*/
4364
4365	static void
4366	htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4367	const xmlChar *name;
4368	const htmlElemDesc * info;
4369	htmlParserNodeInfo node_info = { 0, };
4370	int failed;
4371
4372	if ((ctxt == NULL) \|\| (ctxt->input == NULL)) {
4373	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4374	"htmlParseElementInternal: context error\n", NULL, NULL);
4375	return;
4376	}
4377
4378	if (ctxt->instate == XML_PARSER_EOF)
4379	return;
4380
4381	/* Capture start position */
4382	if (ctxt->record_info) {
4383	node_info.begin_pos = ctxt->input->consumed +
4384	(CUR_PTR - ctxt->input->base);
4385	node_info.begin_line = ctxt->input->line;
4386	}
4387
4388	failed = htmlParseStartTag(ctxt);
4389	name = ctxt->name;
4390	if ((failed == -1) \|\| (name == NULL)) {
4391	if (CUR == '>')
4392	NEXT;
4393	return;
4394	}
4395
4396	/*
4397	* Lookup the info for that element.
4398	*/
4399	info = htmlTagLookup(name);
4400	if (info == NULL) {
4401	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4402	"Tag %s invalid\n", name, NULL);
4403	}
4404
4405	/*
4406	* Check for an Empty Element labeled the XML/SGML way
4407	*/
4408	if ((CUR == '/') && (NXT(1) == '>')) {
4409	SKIP(2);
4410	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4411	ctxt->sax->endElement(ctxt->userData, name);
4412	htmlnamePop(ctxt);
4413	return;
4414	}
4415
4416	if (CUR == '>') {
4417	NEXT;
4418	} else {
4419	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4420	"Couldn't find end of Start Tag %s\n", name, NULL);
4421
4422	/*
4423	* end of parsing of this node.
4424	*/
4425	if (xmlStrEqual(name, ctxt->name)) {
4426	nodePop(ctxt);
4427	htmlnamePop(ctxt);
4428	}
4429
4430	if (ctxt->record_info)
4431	htmlNodeInfoPush(ctxt, &node_info);
4432	htmlParserFinishElementParsing(ctxt);
4433	return;
4434	}
4435
4436	/*
4437	* Check for an Empty Element from DTD definition
4438	*/
4439	if ((info != NULL) && (info->empty)) {
4440	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4441	ctxt->sax->endElement(ctxt->userData, name);
4442	htmlnamePop(ctxt);
4443	return;
4444	}
4445
4446	if (ctxt->record_info)
4447	htmlNodeInfoPush(ctxt, &node_info);
4448	}
4449
4450	/**
4451	* htmlParseContentInternal:
4452	* @ctxt: an HTML parser context
4453	*
4454	* Parse a content: comment, sub-element, reference or text.
4455	* New version for non recursive htmlParseElementInternal
4456	*/
4457
4458	static void
4459	htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
4460	xmlChar *currentNode;
4461	int depth;
4462	const xmlChar *name;
4463
4464	currentNode = xmlStrdup(ctxt->name);
4465	depth = ctxt->nameNr;
4466	while (1) {
4467	long cons = ctxt->nbChars;
4468
4469	GROW;
4470
4471	if (ctxt->instate == XML_PARSER_EOF)
4472	break;
4473
4474	/*
4475	* Our tag or one of it's parent or children is ending.
4476	*/
4477	if ((CUR == '<') && (NXT(1) == '/')) {
4478	if (htmlParseEndTag(ctxt) &&
4479	((currentNode != NULL) \|\| (ctxt->nameNr == 0))) {
4480	if (currentNode != NULL)
4481	xmlFree(currentNode);
4482
4483	currentNode = xmlStrdup(ctxt->name);
4484	depth = ctxt->nameNr;
4485	}
4486	continue; /* while */
4487	}
4488
4489	else if ((CUR == '<') &&
4490	((IS_ASCII_LETTER(NXT(1))) \|\|
4491	(NXT(1) == '_') \|\| (NXT(1) == ':'))) {
4492	name = htmlParseHTMLName_nonInvasive(ctxt);
4493	if (name == NULL) {
4494	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4495	"htmlParseStartTag: invalid element name\n",
4496	NULL, NULL);
4497	/* Dump the bogus tag like browsers do */
4498	while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
4499	NEXT;
4500
4501	htmlParserFinishElementParsing(ctxt);
4502	if (currentNode != NULL)
4503	xmlFree(currentNode);
4504
4505	currentNode = xmlStrdup(ctxt->name);
4506	depth = ctxt->nameNr;
4507	continue;
4508	}
4509
4510	if (ctxt->name != NULL) {
4511	if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4512	htmlAutoClose(ctxt, name);
4513	continue;
4514	}
4515	}
4516	}
4517
4518	/*
4519	* Has this node been popped out during parsing of
4520	* the next element
4521	*/
4522	if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4523	(!xmlStrEqual(currentNode, ctxt->name)))
4524	{
4525	htmlParserFinishElementParsing(ctxt);
4526	if (currentNode != NULL) xmlFree(currentNode);
4527
4528	currentNode = xmlStrdup(ctxt->name);
4529	depth = ctxt->nameNr;
4530	continue;
4531	}
4532
4533	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) \|\|
4534	(xmlStrEqual(currentNode, BAD_CAST"style")))) {
4535	/*
4536	* Handle SCRIPT/STYLE separately
4537	*/
4538	htmlParseScript(ctxt);
4539	} else {
4540	/*
4541	* Sometimes DOCTYPE arrives in the middle of the document
4542	*/
4543	if ((CUR == '<') && (NXT(1) == '!') &&
4544	(UPP(2) == 'D') && (UPP(3) == 'O') &&
4545	(UPP(4) == 'C') && (UPP(5) == 'T') &&
4546	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
4547	(UPP(8) == 'E')) {
4548	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4549	"Misplaced DOCTYPE declaration\n",
4550	BAD_CAST "DOCTYPE" , NULL);
4551	htmlParseDocTypeDecl(ctxt);
4552	}
4553
4554	/*
4555	* First case : a comment
4556	*/
4557	if ((CUR == '<') && (NXT(1) == '!') &&
4558	(NXT(2) == '-') && (NXT(3) == '-')) {
4559	htmlParseComment(ctxt);
4560	}
4561
4562	/*
4563	* Second case : a Processing Instruction.
4564	*/
4565	else if ((CUR == '<') && (NXT(1) == '?')) {
4566	htmlParsePI(ctxt);
4567	}
4568
4569	/*
4570	* Third case : a sub-element.
4571	*/
4572	else if (CUR == '<') {
4573	htmlParseElementInternal(ctxt);
4574	if (currentNode != NULL) xmlFree(currentNode);
4575
4576	currentNode = xmlStrdup(ctxt->name);
4577	depth = ctxt->nameNr;
4578	}
4579
4580	/*
4581	* Fourth case : a reference. If if has not been resolved,
4582	* parsing returns it's Name, create the node
4583	*/
4584	else if (CUR == '&') {
4585	htmlParseReference(ctxt);
4586	}
4587
4588	/*
4589	* Fifth case : end of the resource
4590	*/
4591	else if (CUR == 0) {
4592	htmlAutoCloseOnEnd(ctxt);
4593	break;
4594	}
4595
4596	/*
4597	* Last case, text. Note that References are handled directly.
4598	*/
4599	else {
4600	htmlParseCharData(ctxt);
4601	}
4602
4603	if (cons == ctxt->nbChars) {
4604	if (ctxt->node != NULL) {
4605	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4606	"detected an error in element content\n",
4607	NULL, NULL);
4608	}
4609	break;
4610	}
4611	}
4612	GROW;
4613	}
4614	if (currentNode != NULL) xmlFree(currentNode);
4615	}
4616
4617	/**
4618	* htmlParseContent:
4619	* @ctxt: an HTML parser context
4620	*
4621	* Parse a content: comment, sub-element, reference or text.
4622	* This is the entry point when called from parser.c
4623	*/
4624
4625	void
4626	__htmlParseContent(void *ctxt) {
4627	if (ctxt != NULL)
4628	htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
4629	}
4630
4631	/**
4632	* htmlParseDocument:
4633	* @ctxt: an HTML parser context
4634	*
4635	* parse an HTML document (and build a tree if using the standard SAX
4636	* interface).
4637	*
4638	* Returns 0, -1 in case of error. the parser context is augmented
4639	* as a result of the parsing.
4640	*/
4641
4642	int
4643	htmlParseDocument(htmlParserCtxtPtr ctxt) {
4644	xmlChar start[4];
4645	xmlCharEncoding enc;
4646	xmlDtdPtr dtd;
4647
4648	xmlInitParser();
4649
4650	htmlDefaultSAXHandlerInit();
4651
4652	if ((ctxt == NULL) \|\| (ctxt->input == NULL)) {
4653	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4654	"htmlParseDocument: context error\n", NULL, NULL);
4655	return(XML_ERR_INTERNAL_ERROR);
4656	}
4657	ctxt->html = 1;
4658	ctxt->linenumbers = 1;
4659	GROW;
4660	/*
4661	* SAX: beginning of the document processing.
4662	*/
4663	if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4664	ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4665
4666	if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
4667	((ctxt->input->end - ctxt->input->cur) >= 4)) {
4668	/*
4669	* Get the 4 first bytes and decode the charset
4670	* if enc != XML_CHAR_ENCODING_NONE
4671	* plug some encoding conversion routines.
4672	*/
4673	start[0] = RAW;
4674	start[1] = NXT(1);
4675	start[2] = NXT(2);
4676	start[3] = NXT(3);
4677	enc = xmlDetectCharEncoding(&start[0], 4);
4678	if (enc != XML_CHAR_ENCODING_NONE) {
4679	xmlSwitchEncoding(ctxt, enc);
4680	}
4681	}
4682
4683	/*
4684	* Wipe out everything which is before the first '<'
4685	*/
4686	SKIP_BLANKS;
4687	if (CUR == 0) {
4688	htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4689	"Document is empty\n", NULL, NULL);
4690	}
4691
4692	if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4693	ctxt->sax->startDocument(ctxt->userData);
4694
4695
4696	/*
4697	* Parse possible comments and PIs before any content
4698	*/
4699	while (((CUR == '<') && (NXT(1) == '!') &&
4700	(NXT(2) == '-') && (NXT(3) == '-')) \|\|
4701	((CUR == '<') && (NXT(1) == '?'))) {
4702	htmlParseComment(ctxt);
4703	htmlParsePI(ctxt);
4704	SKIP_BLANKS;
4705	}
4706
4707
4708	/*
4709	* Then possibly doc type declaration(s) and more Misc
4710	* (doctypedecl Misc*)?
4711	*/
4712	if ((CUR == '<') && (NXT(1) == '!') &&
4713	(UPP(2) == 'D') && (UPP(3) == 'O') &&
4714	(UPP(4) == 'C') && (UPP(5) == 'T') &&
4715	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
4716	(UPP(8) == 'E')) {
4717	htmlParseDocTypeDecl(ctxt);
4718	}
4719	SKIP_BLANKS;
4720
4721	/*
4722	* Parse possible comments and PIs before any content
4723	*/
4724	while (((CUR == '<') && (NXT(1) == '!') &&
4725	(NXT(2) == '-') && (NXT(3) == '-')) \|\|
4726	((CUR == '<') && (NXT(1) == '?'))) {
4727	htmlParseComment(ctxt);
4728	htmlParsePI(ctxt);
4729	SKIP_BLANKS;
4730	}
4731
4732	/*
4733	* Time to start parsing the tree itself
4734	*/
4735	htmlParseContentInternal(ctxt);
4736
4737	/*
4738	* autoclose
4739	*/
4740	if (CUR == 0)
4741	htmlAutoCloseOnEnd(ctxt);
4742
4743
4744	/*
4745	* SAX: end of the document processing.
4746	*/
4747	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4748	ctxt->sax->endDocument(ctxt->userData);
4749
4750	if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
4751	dtd = xmlGetIntSubset(ctxt->myDoc);
4752	if (dtd == NULL)
4753	ctxt->myDoc->intSubset =
4754	xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
4755	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4756	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4757	}
4758	if (! ctxt->wellFormed) return(-1);
4759	return(0);
4760	}
4761
4762
4763	/************************************************************************
4764	* *
4765	* Parser contexts handling *
4766	* *
4767	************************************************************************/
4768
4769	/**
4770	* htmlInitParserCtxt:
4771	* @ctxt: an HTML parser context
4772	*
4773	* Initialize a parser context
4774	*
4775	* Returns 0 in case of success and -1 in case of error
4776	*/
4777
4778	static int
4779	htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4780	{
4781	htmlSAXHandler *sax;
4782
4783	if (ctxt == NULL) return(-1);
4784	memset(ctxt, 0, sizeof(htmlParserCtxt));
4785
4786	ctxt->dict = xmlDictCreate();
4787	if (ctxt->dict == NULL) {
4788	htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4789	return(-1);
4790	}
4791	sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4792	if (sax == NULL) {
4793	htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4794	return(-1);
4795	}
4796	else
4797	memset(sax, 0, sizeof(htmlSAXHandler));
4798
4799	/* Allocate the Input stack */
4800	ctxt->inputTab = (htmlParserInputPtr *)
4801	xmlMalloc(5 * sizeof(htmlParserInputPtr));
4802	if (ctxt->inputTab == NULL) {
4803	htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4804	ctxt->inputNr = 0;
4805	ctxt->inputMax = 0;
4806	ctxt->input = NULL;
4807	return(-1);
4808	}
4809	ctxt->inputNr = 0;
4810	ctxt->inputMax = 5;
4811	ctxt->input = NULL;
4812	ctxt->version = NULL;
4813	ctxt->encoding = NULL;
4814	ctxt->standalone = -1;
4815	ctxt->instate = XML_PARSER_START;
4816
4817	/* Allocate the Node stack */
4818	ctxt->nodeTab = (htmlNodePtr ) xmlMalloc(10 sizeof(htmlNodePtr));
4819	if (ctxt->nodeTab == NULL) {
4820	htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4821	ctxt->nodeNr = 0;
4822	ctxt->nodeMax = 0;
4823	ctxt->node = NULL;
4824	ctxt->inputNr = 0;
4825	ctxt->inputMax = 0;
4826	ctxt->input = NULL;
4827	return(-1);
4828	}
4829	ctxt->nodeNr = 0;
4830	ctxt->nodeMax = 10;
4831	ctxt->node = NULL;
4832
4833	/* Allocate the Name stack */
4834	ctxt->nameTab = (const xmlChar *) xmlMalloc(10 sizeof(xmlChar *));
4835	if (ctxt->nameTab == NULL) {
4836	htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4837	ctxt->nameNr = 0;
4838	ctxt->nameMax = 0;
4839	ctxt->name = NULL;
4840	ctxt->nodeNr = 0;
4841	ctxt->nodeMax = 0;
4842	ctxt->node = NULL;
4843	ctxt->inputNr = 0;
4844	ctxt->inputMax = 0;
4845	ctxt->input = NULL;
4846	return(-1);
4847	}
4848	ctxt->nameNr = 0;
4849	ctxt->nameMax = 10;
4850	ctxt->name = NULL;
4851
4852	ctxt->nodeInfoTab = NULL;
4853	ctxt->nodeInfoNr = 0;
4854	ctxt->nodeInfoMax = 0;
4855
4856	if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
4857	else {
4858	ctxt->sax = sax;
4859	memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
4860	}
4861	ctxt->userData = ctxt;
4862	ctxt->myDoc = NULL;
4863	ctxt->wellFormed = 1;
4864	ctxt->replaceEntities = 0;
4865	ctxt->linenumbers = xmlLineNumbersDefaultValue;
4866	ctxt->html = 1;
4867	ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
4868	ctxt->vctxt.userData = ctxt;
4869	ctxt->vctxt.error = xmlParserValidityError;
4870	ctxt->vctxt.warning = xmlParserValidityWarning;
4871	ctxt->record_info = 0;
4872	ctxt->validate = 0;
4873	ctxt->nbChars = 0;
4874	ctxt->checkIndex = 0;
4875	ctxt->catalogs = NULL;
4876	xmlInitNodeInfoSeq(&ctxt->node_seq);
4877	return(0);
4878	}
4879
4880	/**
4881	* htmlFreeParserCtxt:
4882	* @ctxt: an HTML parser context
4883	*
4884	* Free all the memory used by a parser context. However the parsed
4885	* document in ctxt->myDoc is not freed.
4886	*/
4887
4888	void
4889	htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4890	{
4891	xmlFreeParserCtxt(ctxt);
4892	}
4893
4894	/**
4895	* htmlNewParserCtxt:
4896	*
4897	* Allocate and initialize a new parser context.
4898	*
4899	* Returns the htmlParserCtxtPtr or NULL in case of allocation error
4900	*/
4901
4902	htmlParserCtxtPtr
4903	htmlNewParserCtxt(void)
4904	{
4905	xmlParserCtxtPtr ctxt;
4906
4907	ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4908	if (ctxt == NULL) {
4909	htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
4910	return(NULL);
4911	}
4912	memset(ctxt, 0, sizeof(xmlParserCtxt));
4913	if (htmlInitParserCtxt(ctxt) < 0) {
4914	htmlFreeParserCtxt(ctxt);
4915	return(NULL);
4916	}
4917	return(ctxt);
4918	}
4919
4920	/**
4921	* htmlCreateMemoryParserCtxt:
4922	* @buffer: a pointer to a char array
4923	* @size: the size of the array
4924	*
4925	* Create a parser context for an HTML in-memory document.
4926	*
4927	* Returns the new parser context or NULL
4928	*/
4929	htmlParserCtxtPtr
4930	htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4931	xmlParserCtxtPtr ctxt;
4932	xmlParserInputPtr input;
4933	xmlParserInputBufferPtr buf;
4934
4935	if (buffer == NULL)
4936	return(NULL);
4937	if (size <= 0)
4938	return(NULL);
4939
4940	ctxt = htmlNewParserCtxt();
4941	if (ctxt == NULL)
4942	return(NULL);
4943
4944	buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
4945	if (buf == NULL) return(NULL);
4946
4947	input = xmlNewInputStream(ctxt);
4948	if (input == NULL) {
4949	xmlFreeParserCtxt(ctxt);
4950	return(NULL);
4951	}
4952
4953	input->filename = NULL;
4954	input->buf = buf;
4955	xmlBufResetInput(buf->buffer, input);
4956
4957	inputPush(ctxt, input);
4958	return(ctxt);
4959	}
4960
4961	/**
4962	* htmlCreateDocParserCtxt:
4963	* @cur: a pointer to an array of xmlChar
4964	* @encoding: a free form C string describing the HTML document encoding, or NULL
4965	*
4966	* Create a parser context for an HTML document.
4967	*
4968	* TODO: check the need to add encoding handling there
4969	*
4970	* Returns the new parser context or NULL
4971	*/
4972	static htmlParserCtxtPtr
4973	htmlCreateDocParserCtxt(const xmlChar cur, const char encoding) {
4974	int len;
4975	htmlParserCtxtPtr ctxt;
4976
4977	if (cur == NULL)
4978	return(NULL);
4979	len = xmlStrlen(cur);
4980	ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
4981	if (ctxt == NULL)
4982	return(NULL);
4983
4984	if (encoding != NULL) {
4985	xmlCharEncoding enc;
4986	xmlCharEncodingHandlerPtr handler;
4987
4988	if (ctxt->input->encoding != NULL)
4989	xmlFree((xmlChar *) ctxt->input->encoding);
4990	ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
4991
4992	enc = xmlParseCharEncoding(encoding);
4993	/*
4994	* registered set of known encodings
4995	*/
4996	if (enc != XML_CHAR_ENCODING_ERROR) {
4997	xmlSwitchEncoding(ctxt, enc);
4998	if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
4999	htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5000	"Unsupported encoding %s\n",
5001	(const xmlChar *) encoding, NULL);
5002	}
5003	} else {
5004	/*
5005	* fallback for unknown encodings
5006	*/
5007	handler = xmlFindCharEncodingHandler((const char *) encoding);
5008	if (handler != NULL) {
5009	xmlSwitchToEncoding(ctxt, handler);
5010	} else {
5011	htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5012	"Unsupported encoding %s\n",
5013	(const xmlChar *) encoding, NULL);
5014	}
5015	}
5016	}
5017	return(ctxt);
5018	}
5019
5020	#ifdef LIBXML_PUSH_ENABLED
5021	/************************************************************************
5022	* *
5023	* Progressive parsing interfaces *
5024	* *
5025	************************************************************************/
5026
5027	/**
5028	* htmlParseLookupSequence:
5029	* @ctxt: an HTML parser context
5030	* @first: the first char to lookup
5031	* @next: the next char to lookup or zero
5032	* @third: the next char to lookup or zero
5033	* @comment: flag to force checking inside comments
5034	*
5035	* Try to find if a sequence (first, next, third) or just (first next) or
5036	* (first) is available in the input stream.
5037	* This function has a side effect of (possibly) incrementing ctxt->checkIndex
5038	* to avoid rescanning sequences of bytes, it DOES change the state of the
5039	* parser, do not use liberally.
5040	* This is basically similar to xmlParseLookupSequence()
5041	*
5042	* Returns the index to the current parsing point if the full sequence
5043	* is available, -1 otherwise.
5044	*/
5045	static int
5046	htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
5047	xmlChar next, xmlChar third, int iscomment,
5048	int ignoreattrval)
5049	{
5050	int base, len;
5051	htmlParserInputPtr in;
5052	const xmlChar *buf;
5053	int incomment = 0;
5054	int invalue = 0;
5055	char valdellim = 0x0;
5056
5057	in = ctxt->input;
5058	if (in == NULL)
5059	return (-1);
5060
5061	base = in->cur - in->base;
5062	if (base < 0)
5063	return (-1);
5064
5065	if (ctxt->checkIndex > base)
5066	base = ctxt->checkIndex;
5067
5068	if (in->buf == NULL) {
5069	buf = in->base;
5070	len = in->length;
5071	} else {
5072	buf = xmlBufContent(in->buf->buffer);
5073	len = xmlBufUse(in->buf->buffer);
5074	}
5075
5076	/* take into account the sequence length */
5077	if (third)
5078	len -= 2;
5079	else if (next)
5080	len--;
5081	for (; base < len; base++) {
5082	if ((!incomment) && (base + 4 < len) && (!iscomment)) {
5083	if ((buf[base] == '<') && (buf[base + 1] == '!') &&
5084	(buf[base + 2] == '-') && (buf[base + 3] == '-')) {
5085	incomment = 1;
5086	/* do not increment past <! - some people use <!--> */
5087	base += 2;
5088	}
5089	}
5090	if (ignoreattrval) {
5091	if (buf[base] == '"' \|\| buf[base] == '\'') {
5092	if (invalue) {
5093	if (buf[base] == valdellim) {
5094	invalue = 0;
5095	continue;
5096	}
5097	} else {
5098	valdellim = buf[base];
5099	invalue = 1;
5100	continue;
5101	}
5102	} else if (invalue) {
5103	continue;
5104	}
5105	}
5106	if (incomment) {
5107	if (base + 3 > len)
5108	return (-1);
5109	if ((buf[base] == '-') && (buf[base + 1] == '-') &&
5110	(buf[base + 2] == '>')) {
5111	incomment = 0;
5112	base += 2;
5113	}
5114	continue;
5115	}
5116	if (buf[base] == first) {
5117	if (third != 0) {
5118	if ((buf[base + 1] != next) \|\| (buf[base + 2] != third))
5119	continue;
5120	} else if (next != 0) {
5121	if (buf[base + 1] != next)
5122	continue;
5123	}
5124	ctxt->checkIndex = 0;
5125	#ifdef DEBUG_PUSH
5126	if (next == 0)
5127	xmlGenericError(xmlGenericErrorContext,
5128	"HPP: lookup '%c' found at %d\n",
5129	first, base);
5130	else if (third == 0)
5131	xmlGenericError(xmlGenericErrorContext,
5132	"HPP: lookup '%c%c' found at %d\n",
5133	first, next, base);
5134	else
5135	xmlGenericError(xmlGenericErrorContext,
5136	"HPP: lookup '%c%c%c' found at %d\n",
5137	first, next, third, base);
5138	#endif
5139	return (base - (in->cur - in->base));
5140	}
5141	}
5142	if ((!incomment) && (!invalue))
5143	ctxt->checkIndex = base;
5144	#ifdef DEBUG_PUSH
5145	if (next == 0)
5146	xmlGenericError(xmlGenericErrorContext,
5147	"HPP: lookup '%c' failed\n", first);
5148	else if (third == 0)
5149	xmlGenericError(xmlGenericErrorContext,
5150	"HPP: lookup '%c%c' failed\n", first, next);
5151	else
5152	xmlGenericError(xmlGenericErrorContext,
5153	"HPP: lookup '%c%c%c' failed\n", first, next,
5154	third);
5155	#endif
5156	return (-1);
5157	}
5158
5159	/**
5160	* htmlParseLookupChars:
5161	* @ctxt: an HTML parser context
5162	* @stop: Array of chars, which stop the lookup.
5163	* @stopLen: Length of stop-Array
5164	*
5165	* Try to find if any char of the stop-Array is available in the input
5166	* stream.
5167	* This function has a side effect of (possibly) incrementing ctxt->checkIndex
5168	* to avoid rescanning sequences of bytes, it DOES change the state of the
5169	* parser, do not use liberally.
5170	*
5171	* Returns the index to the current parsing point if a stopChar
5172	* is available, -1 otherwise.
5173	*/
5174	static int
5175	htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop,
5176	int stopLen)
5177	{
5178	int base, len;
5179	htmlParserInputPtr in;
5180	const xmlChar *buf;
5181	int incomment = 0;
5182	int i;
5183
5184	in = ctxt->input;
5185	if (in == NULL)
5186	return (-1);
5187
5188	base = in->cur - in->base;
5189	if (base < 0)
5190	return (-1);
5191
5192	if (ctxt->checkIndex > base)
5193	base = ctxt->checkIndex;
5194
5195	if (in->buf == NULL) {
5196	buf = in->base;
5197	len = in->length;
5198	} else {
5199	buf = xmlBufContent(in->buf->buffer);
5200	len = xmlBufUse(in->buf->buffer);
5201	}
5202
5203	for (; base < len; base++) {
5204	if (!incomment && (base + 4 < len)) {
5205	if ((buf[base] == '<') && (buf[base + 1] == '!') &&
5206	(buf[base + 2] == '-') && (buf[base + 3] == '-')) {
5207	incomment = 1;
5208	/* do not increment past <! - some people use <!--> */
5209	base += 2;
5210	}
5211	}
5212	if (incomment) {
5213	if (base + 3 > len)
5214	return (-1);
5215	if ((buf[base] == '-') && (buf[base + 1] == '-') &&
5216	(buf[base + 2] == '>')) {
5217	incomment = 0;
5218	base += 2;
5219	}
5220	continue;
5221	}
5222	for (i = 0; i < stopLen; ++i) {
5223	if (buf[base] == stop[i]) {
5224	ctxt->checkIndex = 0;
5225	return (base - (in->cur - in->base));
5226	}
5227	}
5228	}
5229	ctxt->checkIndex = base;
5230	return (-1);
5231	}
5232
5233	/**
5234	* htmlParseTryOrFinish:
5235	* @ctxt: an HTML parser context
5236	* @terminate: last chunk indicator
5237	*
5238	* Try to progress on parsing
5239	*
5240	* Returns zero if no parsing was possible
5241	*/
5242	static int
5243	htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
5244	int ret = 0;
5245	htmlParserInputPtr in;
5246	int avail = 0;
5247	xmlChar cur, next;
5248
5249	htmlParserNodeInfo node_info;
5250
5251	#ifdef DEBUG_PUSH
5252	switch (ctxt->instate) {
5253	case XML_PARSER_EOF:
5254	xmlGenericError(xmlGenericErrorContext,
5255	"HPP: try EOF\n"); break;
5256	case XML_PARSER_START:
5257	xmlGenericError(xmlGenericErrorContext,
5258	"HPP: try START\n"); break;
5259	case XML_PARSER_MISC:
5260	xmlGenericError(xmlGenericErrorContext,
5261	"HPP: try MISC\n");break;
5262	case XML_PARSER_COMMENT:
5263	xmlGenericError(xmlGenericErrorContext,
5264	"HPP: try COMMENT\n");break;
5265	case XML_PARSER_PROLOG:
5266	xmlGenericError(xmlGenericErrorContext,
5267	"HPP: try PROLOG\n");break;
5268	case XML_PARSER_START_TAG:
5269	xmlGenericError(xmlGenericErrorContext,
5270	"HPP: try START_TAG\n");break;
5271	case XML_PARSER_CONTENT:
5272	xmlGenericError(xmlGenericErrorContext,
5273	"HPP: try CONTENT\n");break;
5274	case XML_PARSER_CDATA_SECTION:
5275	xmlGenericError(xmlGenericErrorContext,
5276	"HPP: try CDATA_SECTION\n");break;
5277	case XML_PARSER_END_TAG:
5278	xmlGenericError(xmlGenericErrorContext,
5279	"HPP: try END_TAG\n");break;
5280	case XML_PARSER_ENTITY_DECL:
5281	xmlGenericError(xmlGenericErrorContext,
5282	"HPP: try ENTITY_DECL\n");break;
5283	case XML_PARSER_ENTITY_VALUE:
5284	xmlGenericError(xmlGenericErrorContext,
5285	"HPP: try ENTITY_VALUE\n");break;
5286	case XML_PARSER_ATTRIBUTE_VALUE:
5287	xmlGenericError(xmlGenericErrorContext,
5288	"HPP: try ATTRIBUTE_VALUE\n");break;
5289	case XML_PARSER_DTD:
5290	xmlGenericError(xmlGenericErrorContext,
5291	"HPP: try DTD\n");break;
5292	case XML_PARSER_EPILOG:
5293	xmlGenericError(xmlGenericErrorContext,
5294	"HPP: try EPILOG\n");break;
5295	case XML_PARSER_PI:
5296	xmlGenericError(xmlGenericErrorContext,
5297	"HPP: try PI\n");break;
5298	case XML_PARSER_SYSTEM_LITERAL:
5299	xmlGenericError(xmlGenericErrorContext,
5300	"HPP: try SYSTEM_LITERAL\n");break;
5301	}
5302	#endif
5303
5304	while (1) {
5305
5306	in = ctxt->input;
5307	if (in == NULL) break;
5308	if (in->buf == NULL)
5309	avail = in->length - (in->cur - in->base);
5310	else
5311	avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
5312	if ((avail == 0) && (terminate)) {
5313	htmlAutoCloseOnEnd(ctxt);
5314	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5315	/*
5316	* SAX: end of the document processing.
5317	*/
5318	ctxt->instate = XML_PARSER_EOF;
5319	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5320	ctxt->sax->endDocument(ctxt->userData);
5321	}
5322	}
5323	if (avail < 1)
5324	goto done;
5325	cur = in->cur[0];
5326	if (cur == 0) {
5327	SKIP(1);
5328	continue;
5329	}
5330
5331	switch (ctxt->instate) {
5332	case XML_PARSER_EOF:
5333	/*
5334	* Document parsing is done !
5335	*/
5336	goto done;
5337	case XML_PARSER_START:
5338	/*
5339	* Very first chars read from the document flow.
5340	*/
5341	cur = in->cur[0];
5342	if (IS_BLANK_CH(cur)) {
5343	SKIP_BLANKS;
5344	if (in->buf == NULL)
5345	avail = in->length - (in->cur - in->base);
5346	else
5347	avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
5348	}
5349	if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
5350	ctxt->sax->setDocumentLocator(ctxt->userData,
5351	&xmlDefaultSAXLocator);
5352	if ((ctxt->sax) && (ctxt->sax->startDocument) &&
5353	(!ctxt->disableSAX))
5354	ctxt->sax->startDocument(ctxt->userData);
5355
5356	cur = in->cur[0];
5357	next = in->cur[1];
5358	if ((cur == '<') && (next == '!') &&
5359	(UPP(2) == 'D') && (UPP(3) == 'O') &&
5360	(UPP(4) == 'C') && (UPP(5) == 'T') &&
5361	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
5362	(UPP(8) == 'E')) {
5363	if ((!terminate) &&
5364	(htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5365	goto done;
5366	#ifdef DEBUG_PUSH
5367	xmlGenericError(xmlGenericErrorContext,
5368	"HPP: Parsing internal subset\n");
5369	#endif
5370	htmlParseDocTypeDecl(ctxt);
5371	ctxt->instate = XML_PARSER_PROLOG;
5372	#ifdef DEBUG_PUSH
5373	xmlGenericError(xmlGenericErrorContext,
5374	"HPP: entering PROLOG\n");
5375	#endif
5376	} else {
5377	ctxt->instate = XML_PARSER_MISC;
5378	#ifdef DEBUG_PUSH
5379	xmlGenericError(xmlGenericErrorContext,
5380	"HPP: entering MISC\n");
5381	#endif
5382	}
5383	break;
5384	case XML_PARSER_MISC:
5385	SKIP_BLANKS;
5386	if (in->buf == NULL)
5387	avail = in->length - (in->cur - in->base);
5388	else
5389	avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
5390	/*
5391	* no chars in buffer
5392	*/
5393	if (avail < 1)
5394	goto done;
5395	/*
5396	* not enouth chars in buffer
5397	*/
5398	if (avail < 2) {
5399	if (!terminate)
5400	goto done;
5401	else
5402	next = ' ';
5403	} else {
5404	next = in->cur[1];
5405	}
5406	cur = in->cur[0];
5407	if ((cur == '<') && (next == '!') &&
5408	(in->cur[2] == '-') && (in->cur[3] == '-')) {
5409	if ((!terminate) &&
5410	(htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
5411	goto done;
5412	#ifdef DEBUG_PUSH
5413	xmlGenericError(xmlGenericErrorContext,
5414	"HPP: Parsing Comment\n");
5415	#endif
5416	htmlParseComment(ctxt);
5417	ctxt->instate = XML_PARSER_MISC;
5418	} else if ((cur == '<') && (next == '?')) {
5419	if ((!terminate) &&
5420	(htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5421	goto done;
5422	#ifdef DEBUG_PUSH
5423	xmlGenericError(xmlGenericErrorContext,
5424	"HPP: Parsing PI\n");
5425	#endif
5426	htmlParsePI(ctxt);
5427	ctxt->instate = XML_PARSER_MISC;
5428	} else if ((cur == '<') && (next == '!') &&
5429	(UPP(2) == 'D') && (UPP(3) == 'O') &&
5430	(UPP(4) == 'C') && (UPP(5) == 'T') &&
5431	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
5432	(UPP(8) == 'E')) {
5433	if ((!terminate) &&
5434	(htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5435	goto done;
5436	#ifdef DEBUG_PUSH
5437	xmlGenericError(xmlGenericErrorContext,
5438	"HPP: Parsing internal subset\n");
5439	#endif
5440	htmlParseDocTypeDecl(ctxt);
5441	ctxt->instate = XML_PARSER_PROLOG;
5442	#ifdef DEBUG_PUSH
5443	xmlGenericError(xmlGenericErrorContext,
5444	"HPP: entering PROLOG\n");
5445	#endif
5446	} else if ((cur == '<') && (next == '!') &&
5447	(avail < 9)) {
5448	goto done;
5449	} else {
5450	ctxt->instate = XML_PARSER_START_TAG;
5451	#ifdef DEBUG_PUSH
5452	xmlGenericError(xmlGenericErrorContext,
5453	"HPP: entering START_TAG\n");
5454	#endif
5455	}
5456	break;
5457	case XML_PARSER_PROLOG:
5458	SKIP_BLANKS;
5459	if (in->buf == NULL)
5460	avail = in->length - (in->cur - in->base);
5461	else
5462	avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
5463	if (avail < 2)
5464	goto done;
5465	cur = in->cur[0];
5466	next = in->cur[1];
5467	if ((cur == '<') && (next == '!') &&
5468	(in->cur[2] == '-') && (in->cur[3] == '-')) {
5469	if ((!terminate) &&
5470	(htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
5471	goto done;
5472	#ifdef DEBUG_PUSH
5473	xmlGenericError(xmlGenericErrorContext,
5474	"HPP: Parsing Comment\n");
5475	#endif
5476	htmlParseComment(ctxt);
5477	ctxt->instate = XML_PARSER_PROLOG;
5478	} else if ((cur == '<') && (next == '?')) {
5479	if ((!terminate) &&
5480	(htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5481	goto done;
5482	#ifdef DEBUG_PUSH
5483	xmlGenericError(xmlGenericErrorContext,
5484	"HPP: Parsing PI\n");
5485	#endif
5486	htmlParsePI(ctxt);
5487	ctxt->instate = XML_PARSER_PROLOG;
5488	} else if ((cur == '<') && (next == '!') &&
5489	(avail < 4)) {
5490	goto done;
5491	} else {
5492	ctxt->instate = XML_PARSER_START_TAG;
5493	#ifdef DEBUG_PUSH
5494	xmlGenericError(xmlGenericErrorContext,
5495	"HPP: entering START_TAG\n");
5496	#endif
5497	}
5498	break;
5499	case XML_PARSER_EPILOG:
5500	if (in->buf == NULL)
5501	avail = in->length - (in->cur - in->base);
5502	else
5503	avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
5504	if (avail < 1)
5505	goto done;
5506	cur = in->cur[0];
5507	if (IS_BLANK_CH(cur)) {
5508	htmlParseCharData(ctxt);
5509	goto done;
5510	}
5511	if (avail < 2)
5512	goto done;
5513	next = in->cur[1];
5514	if ((cur == '<') && (next == '!') &&
5515	(in->cur[2] == '-') && (in->cur[3] == '-')) {
5516	if ((!terminate) &&
5517	(htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
5518	goto done;
5519	#ifdef DEBUG_PUSH
5520	xmlGenericError(xmlGenericErrorContext,
5521	"HPP: Parsing Comment\n");
5522	#endif
5523	htmlParseComment(ctxt);
5524	ctxt->instate = XML_PARSER_EPILOG;
5525	} else if ((cur == '<') && (next == '?')) {
5526	if ((!terminate) &&
5527	(htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5528	goto done;
5529	#ifdef DEBUG_PUSH
5530	xmlGenericError(xmlGenericErrorContext,
5531	"HPP: Parsing PI\n");
5532	#endif
5533	htmlParsePI(ctxt);
5534	ctxt->instate = XML_PARSER_EPILOG;
5535	} else if ((cur == '<') && (next == '!') &&
5536	(avail < 4)) {
5537	goto done;
5538	} else {
5539	ctxt->errNo = XML_ERR_DOCUMENT_END;
5540	ctxt->wellFormed = 0;
5541	ctxt->instate = XML_PARSER_EOF;
5542	#ifdef DEBUG_PUSH
5543	xmlGenericError(xmlGenericErrorContext,
5544	"HPP: entering EOF\n");
5545	#endif
5546	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5547	ctxt->sax->endDocument(ctxt->userData);
5548	goto done;
5549	}
5550	break;
5551	case XML_PARSER_START_TAG: {
5552	const xmlChar *name;
5553	int failed;
5554	const htmlElemDesc * info;
5555
5556	/*
5557	* no chars in buffer
5558	*/
5559	if (avail < 1)
5560	goto done;
5561	/*
5562	* not enouth chars in buffer
5563	*/
5564	if (avail < 2) {
5565	if (!terminate)
5566	goto done;
5567	else
5568	next = ' ';
5569	} else {
5570	next = in->cur[1];
5571	}
5572	cur = in->cur[0];
5573	if (cur != '<') {
5574	ctxt->instate = XML_PARSER_CONTENT;
5575	#ifdef DEBUG_PUSH
5576	xmlGenericError(xmlGenericErrorContext,
5577	"HPP: entering CONTENT\n");
5578	#endif
5579	break;
5580	}
5581	if (next == '/') {
5582	ctxt->instate = XML_PARSER_END_TAG;
5583	ctxt->checkIndex = 0;
5584	#ifdef DEBUG_PUSH
5585	xmlGenericError(xmlGenericErrorContext,
5586	"HPP: entering END_TAG\n");
5587	#endif
5588	break;
5589	}
5590	if ((!terminate) &&
5591	(htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5592	goto done;
5593
5594	/* Capture start position */
5595	if (ctxt->record_info) {
5596	node_info.begin_pos = ctxt->input->consumed +
5597	(CUR_PTR - ctxt->input->base);
5598	node_info.begin_line = ctxt->input->line;
5599	}
5600
5601
5602	failed = htmlParseStartTag(ctxt);
5603	name = ctxt->name;
5604	if ((failed == -1) \|\|
5605	(name == NULL)) {
5606	if (CUR == '>')
5607	NEXT;
5608	break;
5609	}
5610
5611	/*
5612	* Lookup the info for that element.
5613	*/
5614	info = htmlTagLookup(name);
5615	if (info == NULL) {
5616	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5617	"Tag %s invalid\n", name, NULL);
5618	}
5619
5620	/*
5621	* Check for an Empty Element labeled the XML/SGML way
5622	*/
5623	if ((CUR == '/') && (NXT(1) == '>')) {
5624	SKIP(2);
5625	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5626	ctxt->sax->endElement(ctxt->userData, name);
5627	htmlnamePop(ctxt);
5628	ctxt->instate = XML_PARSER_CONTENT;
5629	#ifdef DEBUG_PUSH
5630	xmlGenericError(xmlGenericErrorContext,
5631	"HPP: entering CONTENT\n");
5632	#endif
5633	break;
5634	}
5635
5636	if (CUR == '>') {
5637	NEXT;
5638	} else {
5639	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5640	"Couldn't find end of Start Tag %s\n",
5641	name, NULL);
5642
5643	/*
5644	* end of parsing of this node.
5645	*/
5646	if (xmlStrEqual(name, ctxt->name)) {
5647	nodePop(ctxt);
5648	htmlnamePop(ctxt);
5649	}
5650
5651	if (ctxt->record_info)
5652	htmlNodeInfoPush(ctxt, &node_info);
5653
5654	ctxt->instate = XML_PARSER_CONTENT;
5655	#ifdef DEBUG_PUSH
5656	xmlGenericError(xmlGenericErrorContext,
5657	"HPP: entering CONTENT\n");
5658	#endif
5659	break;
5660	}
5661
5662	/*
5663	* Check for an Empty Element from DTD definition
5664	*/
5665	if ((info != NULL) && (info->empty)) {
5666	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5667	ctxt->sax->endElement(ctxt->userData, name);
5668	htmlnamePop(ctxt);
5669	}
5670
5671	if (ctxt->record_info)
5672	htmlNodeInfoPush(ctxt, &node_info);
5673
5674	ctxt->instate = XML_PARSER_CONTENT;
5675	#ifdef DEBUG_PUSH
5676	xmlGenericError(xmlGenericErrorContext,
5677	"HPP: entering CONTENT\n");
5678	#endif
5679	break;
5680	}
5681	case XML_PARSER_CONTENT: {
5682	long cons;
5683	/*
5684	* Handle preparsed entities and charRef
5685	*/
5686	if (ctxt->token != 0) {
5687	xmlChar chr[2] = { 0 , 0 } ;
5688
5689	chr[0] = (xmlChar) ctxt->token;
5690	htmlCheckParagraph(ctxt);
5691	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5692	ctxt->sax->characters(ctxt->userData, chr, 1);
5693	ctxt->token = 0;
5694	ctxt->checkIndex = 0;
5695	}
5696	if ((avail == 1) && (terminate)) {
5697	cur = in->cur[0];
5698	if ((cur != '<') && (cur != '&')) {
5699	if (ctxt->sax != NULL) {
5700	if (IS_BLANK_CH(cur)) {
5701	if (ctxt->keepBlanks) {
5702	if (ctxt->sax->characters != NULL)
5703	ctxt->sax->characters(
5704	ctxt->userData, &cur, 1);
5705	} else {
5706	if (ctxt->sax->ignorableWhitespace != NULL)
5707	ctxt->sax->ignorableWhitespace(
5708	ctxt->userData, &cur, 1);
5709	}
5710	} else {
5711	htmlCheckParagraph(ctxt);
5712	if (ctxt->sax->characters != NULL)
5713	ctxt->sax->characters(
5714	ctxt->userData, &cur, 1);
5715	}
5716	}
5717	ctxt->token = 0;
5718	ctxt->checkIndex = 0;
5719	in->cur++;
5720	break;
5721	}
5722	}
5723	if (avail < 2)
5724	goto done;
5725	cur = in->cur[0];
5726	next = in->cur[1];
5727	cons = ctxt->nbChars;
5728	if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) \|\|
5729	(xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5730	/*
5731	* Handle SCRIPT/STYLE separately
5732	*/
5733	if (!terminate) {
5734	int idx;
5735	xmlChar val;
5736
5737	idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0, 0);
5738	if (idx < 0)
5739	goto done;
5740	val = in->cur[idx + 2];
5741	if (val == 0) /* bad cut of input */
5742	goto done;
5743	}
5744	htmlParseScript(ctxt);
5745	if ((cur == '<') && (next == '/')) {
5746	ctxt->instate = XML_PARSER_END_TAG;
5747	ctxt->checkIndex = 0;
5748	#ifdef DEBUG_PUSH
5749	xmlGenericError(xmlGenericErrorContext,
5750	"HPP: entering END_TAG\n");
5751	#endif
5752	break;
5753	}
5754	} else {
5755	/*
5756	* Sometimes DOCTYPE arrives in the middle of the document
5757	*/
5758	if ((cur == '<') && (next == '!') &&
5759	(UPP(2) == 'D') && (UPP(3) == 'O') &&
5760	(UPP(4) == 'C') && (UPP(5) == 'T') &&
5761	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
5762	(UPP(8) == 'E')) {
5763	if ((!terminate) &&
5764	(htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5765	goto done;
5766	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5767	"Misplaced DOCTYPE declaration\n",
5768	BAD_CAST "DOCTYPE" , NULL);
5769	htmlParseDocTypeDecl(ctxt);
5770	} else if ((cur == '<') && (next == '!') &&
5771	(in->cur[2] == '-') && (in->cur[3] == '-')) {
5772	if ((!terminate) &&
5773	(htmlParseLookupSequence(
5774	ctxt, '-', '-', '>', 1, 1) < 0))
5775	goto done;
5776	#ifdef DEBUG_PUSH
5777	xmlGenericError(xmlGenericErrorContext,
5778	"HPP: Parsing Comment\n");
5779	#endif
5780	htmlParseComment(ctxt);
5781	ctxt->instate = XML_PARSER_CONTENT;
5782	} else if ((cur == '<') && (next == '?')) {
5783	if ((!terminate) &&
5784	(htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5785	goto done;
5786	#ifdef DEBUG_PUSH
5787	xmlGenericError(xmlGenericErrorContext,
5788	"HPP: Parsing PI\n");
5789	#endif
5790	htmlParsePI(ctxt);
5791	ctxt->instate = XML_PARSER_CONTENT;
5792	} else if ((cur == '<') && (next == '!') && (avail < 4)) {
5793	goto done;
5794	} else if ((cur == '<') && (next == '/')) {
5795	ctxt->instate = XML_PARSER_END_TAG;
5796	ctxt->checkIndex = 0;
5797	#ifdef DEBUG_PUSH
5798	xmlGenericError(xmlGenericErrorContext,
5799	"HPP: entering END_TAG\n");
5800	#endif
5801	break;
5802	} else if (cur == '<') {
5803	ctxt->instate = XML_PARSER_START_TAG;
5804	ctxt->checkIndex = 0;
5805	#ifdef DEBUG_PUSH
5806	xmlGenericError(xmlGenericErrorContext,
5807	"HPP: entering START_TAG\n");
5808	#endif
5809	break;
5810	} else if (cur == '&') {
5811	if ((!terminate) &&
5812	(htmlParseLookupChars(ctxt,
5813	BAD_CAST "; >/", 4) < 0))
5814	goto done;
5815	#ifdef DEBUG_PUSH
5816	xmlGenericError(xmlGenericErrorContext,
5817	"HPP: Parsing Reference\n");
5818	#endif
5819	/* TODO: check generation of subtrees if noent !!! */
5820	htmlParseReference(ctxt);
5821	} else {
5822	/*
5823	* check that the text sequence is complete
5824	* before handing out the data to the parser
5825	* to avoid problems with erroneous end of
5826	* data detection.
5827	*/
5828	if ((!terminate) &&
5829	(htmlParseLookupChars(ctxt, BAD_CAST "<&", 2) < 0))
5830	goto done;
5831	ctxt->checkIndex = 0;
5832	#ifdef DEBUG_PUSH
5833	xmlGenericError(xmlGenericErrorContext,
5834	"HPP: Parsing char data\n");
5835	#endif
5836	htmlParseCharData(ctxt);
5837	}
5838	}
5839	if (cons == ctxt->nbChars) {
5840	if (ctxt->node != NULL) {
5841	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5842	"detected an error in element content\n",
5843	NULL, NULL);
5844	}
5845	NEXT;
5846	break;
5847	}
5848
5849	break;
5850	}
5851	case XML_PARSER_END_TAG:
5852	if (avail < 2)
5853	goto done;
5854	if ((!terminate) &&
5855	(htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5856	goto done;
5857	htmlParseEndTag(ctxt);
5858	if (ctxt->nameNr == 0) {
5859	ctxt->instate = XML_PARSER_EPILOG;
5860	} else {
5861	ctxt->instate = XML_PARSER_CONTENT;
5862	}
5863	ctxt->checkIndex = 0;
5864	#ifdef DEBUG_PUSH
5865	xmlGenericError(xmlGenericErrorContext,
5866	"HPP: entering CONTENT\n");
5867	#endif
5868	break;
5869	case XML_PARSER_CDATA_SECTION:
5870	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5871	"HPP: internal error, state == CDATA\n",
5872	NULL, NULL);
5873	ctxt->instate = XML_PARSER_CONTENT;
5874	ctxt->checkIndex = 0;
5875	#ifdef DEBUG_PUSH
5876	xmlGenericError(xmlGenericErrorContext,
5877	"HPP: entering CONTENT\n");
5878	#endif
5879	break;
5880	case XML_PARSER_DTD:
5881	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5882	"HPP: internal error, state == DTD\n",
5883	NULL, NULL);
5884	ctxt->instate = XML_PARSER_CONTENT;
5885	ctxt->checkIndex = 0;
5886	#ifdef DEBUG_PUSH
5887	xmlGenericError(xmlGenericErrorContext,
5888	"HPP: entering CONTENT\n");
5889	#endif
5890	break;
5891	case XML_PARSER_COMMENT:
5892	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5893	"HPP: internal error, state == COMMENT\n",
5894	NULL, NULL);
5895	ctxt->instate = XML_PARSER_CONTENT;
5896	ctxt->checkIndex = 0;
5897	#ifdef DEBUG_PUSH
5898	xmlGenericError(xmlGenericErrorContext,
5899	"HPP: entering CONTENT\n");
5900	#endif
5901	break;
5902	case XML_PARSER_PI:
5903	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5904	"HPP: internal error, state == PI\n",
5905	NULL, NULL);
5906	ctxt->instate = XML_PARSER_CONTENT;
5907	ctxt->checkIndex = 0;
5908	#ifdef DEBUG_PUSH
5909	xmlGenericError(xmlGenericErrorContext,
5910	"HPP: entering CONTENT\n");
5911	#endif
5912	break;
5913	case XML_PARSER_ENTITY_DECL:
5914	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5915	"HPP: internal error, state == ENTITY_DECL\n",
5916	NULL, NULL);
5917	ctxt->instate = XML_PARSER_CONTENT;
5918	ctxt->checkIndex = 0;
5919	#ifdef DEBUG_PUSH
5920	xmlGenericError(xmlGenericErrorContext,
5921	"HPP: entering CONTENT\n");
5922	#endif
5923	break;
5924	case XML_PARSER_ENTITY_VALUE:
5925	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5926	"HPP: internal error, state == ENTITY_VALUE\n",
5927	NULL, NULL);
5928	ctxt->instate = XML_PARSER_CONTENT;
5929	ctxt->checkIndex = 0;
5930	#ifdef DEBUG_PUSH
5931	xmlGenericError(xmlGenericErrorContext,
5932	"HPP: entering DTD\n");
5933	#endif
5934	break;
5935	case XML_PARSER_ATTRIBUTE_VALUE:
5936	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5937	"HPP: internal error, state == ATTRIBUTE_VALUE\n",
5938	NULL, NULL);
5939	ctxt->instate = XML_PARSER_START_TAG;
5940	ctxt->checkIndex = 0;
5941	#ifdef DEBUG_PUSH
5942	xmlGenericError(xmlGenericErrorContext,
5943	"HPP: entering START_TAG\n");
5944	#endif
5945	break;
5946	case XML_PARSER_SYSTEM_LITERAL:
5947	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5948	"HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
5949	NULL, NULL);
5950	ctxt->instate = XML_PARSER_CONTENT;
5951	ctxt->checkIndex = 0;
5952	#ifdef DEBUG_PUSH
5953	xmlGenericError(xmlGenericErrorContext,
5954	"HPP: entering CONTENT\n");
5955	#endif
5956	break;
5957	case XML_PARSER_IGNORE:
5958	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5959	"HPP: internal error, state == XML_PARSER_IGNORE\n",
5960	NULL, NULL);
5961	ctxt->instate = XML_PARSER_CONTENT;
5962	ctxt->checkIndex = 0;
5963	#ifdef DEBUG_PUSH
5964	xmlGenericError(xmlGenericErrorContext,
5965	"HPP: entering CONTENT\n");
5966	#endif
5967	break;
5968	case XML_PARSER_PUBLIC_LITERAL:
5969	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5970	"HPP: internal error, state == XML_PARSER_LITERAL\n",
5971	NULL, NULL);
5972	ctxt->instate = XML_PARSER_CONTENT;
5973	ctxt->checkIndex = 0;
5974	#ifdef DEBUG_PUSH
5975	xmlGenericError(xmlGenericErrorContext,
5976	"HPP: entering CONTENT\n");
5977	#endif
5978	break;
5979
5980	}
5981	}
5982	done:
5983	if ((avail == 0) && (terminate)) {
5984	htmlAutoCloseOnEnd(ctxt);
5985	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5986	/*
5987	* SAX: end of the document processing.
5988	*/
5989	ctxt->instate = XML_PARSER_EOF;
5990	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5991	ctxt->sax->endDocument(ctxt->userData);
5992	}
5993	}
5994	if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL) &&
5995	((terminate) \|\| (ctxt->instate == XML_PARSER_EOF) \|\|
5996	(ctxt->instate == XML_PARSER_EPILOG))) {
5997	xmlDtdPtr dtd;
5998	dtd = xmlGetIntSubset(ctxt->myDoc);
5999	if (dtd == NULL)
6000	ctxt->myDoc->intSubset =
6001	xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
6002	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
6003	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
6004	}
6005	#ifdef DEBUG_PUSH
6006	xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
6007	#endif
6008	return(ret);
6009	}
6010
6011	/**
6012	* htmlParseChunk:
6013	* @ctxt: an HTML parser context
6014	* @chunk: an char array
6015	* @size: the size in byte of the chunk
6016	* @terminate: last chunk indicator
6017	*
6018	* Parse a Chunk of memory
6019	*
6020	* Returns zero if no error, the xmlParserErrors otherwise.
6021	*/
6022	int
6023	htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
6024	int terminate) {
6025	if ((ctxt == NULL) \|\| (ctxt->input == NULL)) {
6026	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6027	"htmlParseChunk: context error\n", NULL, NULL);
6028	return(XML_ERR_INTERNAL_ERROR);
6029	}
6030	if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6031	(ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
6032	size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6033	size_t cur = ctxt->input->cur - ctxt->input->base;
6034	int res;
6035
6036	res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6037	if (res < 0) {
6038	ctxt->errNo = XML_PARSER_EOF;
6039	ctxt->disableSAX = 1;
6040	return (XML_PARSER_EOF);
6041	}
6042	xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
6043	#ifdef DEBUG_PUSH
6044	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6045	#endif
6046
6047	#if 0
6048	if ((terminate) \|\| (ctxt->input->buf->buffer->use > 80))
6049	htmlParseTryOrFinish(ctxt, terminate);
6050	#endif
6051	} else if (ctxt->instate != XML_PARSER_EOF) {
6052	if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
6053	xmlParserInputBufferPtr in = ctxt->input->buf;
6054	if ((in->encoder != NULL) && (in->buffer != NULL) &&
6055	(in->raw != NULL)) {
6056	int nbchars;
6057	size_t base = xmlBufGetInputBase(in->buffer, ctxt->input);
6058	size_t current = ctxt->input->cur - ctxt->input->base;
6059
6060	nbchars = xmlCharEncInput(in, terminate);
6061	if (nbchars < 0) {
6062	htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
6063	"encoder error\n", NULL, NULL);
6064	return(XML_ERR_INVALID_ENCODING);
6065	}
6066	xmlBufSetInputBaseCur(in->buffer, ctxt->input, base, current);
6067	}
6068	}
6069	}
6070	htmlParseTryOrFinish(ctxt, terminate);
6071	if (terminate) {
6072	if ((ctxt->instate != XML_PARSER_EOF) &&
6073	(ctxt->instate != XML_PARSER_EPILOG) &&
6074	(ctxt->instate != XML_PARSER_MISC)) {
6075	ctxt->errNo = XML_ERR_DOCUMENT_END;
6076	ctxt->wellFormed = 0;
6077	}
6078	if (ctxt->instate != XML_PARSER_EOF) {
6079	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6080	ctxt->sax->endDocument(ctxt->userData);
6081	}
6082	ctxt->instate = XML_PARSER_EOF;
6083	}
6084	return((xmlParserErrors) ctxt->errNo);
6085	}
6086
6087	/************************************************************************
6088	* *
6089	* User entry points *
6090	* *
6091	************************************************************************/
6092
6093	/**
6094	* htmlCreatePushParserCtxt:
6095	* @sax: a SAX handler
6096	* @user_data: The user data returned on SAX callbacks
6097	* @chunk: a pointer to an array of chars
6098	* @size: number of chars in the array
6099	* @filename: an optional file name or URI
6100	* @enc: an optional encoding
6101	*
6102	* Create a parser context for using the HTML parser in push mode
6103	* The value of @filename is used for fetching external entities
6104	* and error/warning reports.
6105	*
6106	* Returns the new parser context or NULL
6107	*/
6108	htmlParserCtxtPtr
6109	htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
6110	const char chunk, int size, const char filename,
6111	xmlCharEncoding enc) {
6112	htmlParserCtxtPtr ctxt;
6113	htmlParserInputPtr inputStream;
6114	xmlParserInputBufferPtr buf;
6115
6116	xmlInitParser();
6117
6118	buf = xmlAllocParserInputBuffer(enc);
6119	if (buf == NULL) return(NULL);
6120
6121	ctxt = htmlNewParserCtxt();
6122	if (ctxt == NULL) {
6123	xmlFreeParserInputBuffer(buf);
6124	return(NULL);
6125	}
6126	if(enc==XML_CHAR_ENCODING_UTF8 \|\| buf->encoder)
6127	ctxt->charset=XML_CHAR_ENCODING_UTF8;
6128	if (sax != NULL) {
6129	if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
6130	xmlFree(ctxt->sax);
6131	ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
6132	if (ctxt->sax == NULL) {
6133	xmlFree(buf);
6134	xmlFree(ctxt);
6135	return(NULL);
6136	}
6137	memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
6138	if (user_data != NULL)
6139	ctxt->userData = user_data;
6140	}
6141	if (filename == NULL) {
6142	ctxt->directory = NULL;
6143	} else {
6144	ctxt->directory = xmlParserGetDirectory(filename);
6145	}
6146
6147	inputStream = htmlNewInputStream(ctxt);
6148	if (inputStream == NULL) {
6149	xmlFreeParserCtxt(ctxt);
6150	xmlFree(buf);
6151	return(NULL);
6152	}
6153
6154	if (filename == NULL)
6155	inputStream->filename = NULL;
6156	else
6157	inputStream->filename = (char *)
6158	xmlCanonicPath((const xmlChar *) filename);
6159	inputStream->buf = buf;
6160	xmlBufResetInput(buf->buffer, inputStream);
6161
6162	inputPush(ctxt, inputStream);
6163
6164	if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6165	(ctxt->input->buf != NULL)) {
6166	size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6167	size_t cur = ctxt->input->cur - ctxt->input->base;
6168
6169	xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6170
6171	xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
6172	#ifdef DEBUG_PUSH
6173	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6174	#endif
6175	}
6176	ctxt->progressive = 1;
6177
6178	return(ctxt);
6179	}
6180	#endif /* LIBXML_PUSH_ENABLED */
6181
6182	/**
6183	* htmlSAXParseDoc:
6184	* @cur: a pointer to an array of xmlChar
6185	* @encoding: a free form C string describing the HTML document encoding, or NULL
6186	* @sax: the SAX handler block
6187	* @userData: if using SAX, this pointer will be provided on callbacks.
6188	*
6189	* Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
6190	* to handle parse events. If sax is NULL, fallback to the default DOM
6191	* behavior and return a tree.
6192	*
6193	* Returns the resulting document tree unless SAX is NULL or the document is
6194	* not well formed.
6195	*/
6196
6197	htmlDocPtr
6198	htmlSAXParseDoc(xmlChar cur, const char encoding, htmlSAXHandlerPtr sax, void *userData) {
6199	htmlDocPtr ret;
6200	htmlParserCtxtPtr ctxt;
6201
6202	xmlInitParser();
6203
6204	if (cur == NULL) return(NULL);
6205
6206
6207	ctxt = htmlCreateDocParserCtxt(cur, encoding);
6208	if (ctxt == NULL) return(NULL);
6209	if (sax != NULL) {
6210	if (ctxt->sax != NULL) xmlFree (ctxt->sax);
6211	ctxt->sax = sax;
6212	ctxt->userData = userData;
6213	}
6214
6215	htmlParseDocument(ctxt);
6216	ret = ctxt->myDoc;
6217	if (sax != NULL) {
6218	ctxt->sax = NULL;
6219	ctxt->userData = NULL;
6220	}
6221	htmlFreeParserCtxt(ctxt);
6222
6223	return(ret);
6224	}
6225
6226	/**
6227	* htmlParseDoc:
6228	* @cur: a pointer to an array of xmlChar
6229	* @encoding: a free form C string describing the HTML document encoding, or NULL
6230	*
6231	* parse an HTML in-memory document and build a tree.
6232	*
6233	* Returns the resulting document tree
6234	*/
6235
6236	htmlDocPtr
6237	htmlParseDoc(xmlChar cur, const char encoding) {
6238	return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
6239	}
6240
6241
6242	/**
6243	* htmlCreateFileParserCtxt:
6244	* @filename: the filename
6245	* @encoding: a free form C string describing the HTML document encoding, or NULL
6246	*
6247	* Create a parser context for a file content.
6248	* Automatic support for ZLIB/Compress compressed document is provided
6249	* by default if found at compile-time.
6250	*
6251	* Returns the new parser context or NULL
6252	*/
6253	htmlParserCtxtPtr
6254	htmlCreateFileParserCtxt(const char filename, const char encoding)
6255	{
6256	htmlParserCtxtPtr ctxt;
6257	htmlParserInputPtr inputStream;
6258	char *canonicFilename;
6259	/* htmlCharEncoding enc; */
6260	xmlChar content, content_line = (xmlChar *) "charset=";
6261
6262	if (filename == NULL)
6263	return(NULL);
6264
6265	ctxt = htmlNewParserCtxt();
6266	if (ctxt == NULL) {
6267	return(NULL);
6268	}
6269	canonicFilename = (char ) xmlCanonicPath((const xmlChar ) filename);
6270	if (canonicFilename == NULL) {
6271	#ifdef LIBXML_SAX1_ENABLED
6272	if (xmlDefaultSAXHandler.error != NULL) {
6273	xmlDefaultSAXHandler.error(NULL, "out of memory\n");
6274	}
6275	#endif
6276	xmlFreeParserCtxt(ctxt);
6277	return(NULL);
6278	}
6279
6280	inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
6281	xmlFree(canonicFilename);
6282	if (inputStream == NULL) {
6283	xmlFreeParserCtxt(ctxt);
6284	return(NULL);
6285	}
6286
6287	inputPush(ctxt, inputStream);
6288
6289	/* set encoding */
6290	if (encoding) {
6291	size_t l = strlen(encoding);
6292
6293	if (l < 1000) {
6294	content = xmlMallocAtomic (xmlStrlen(content_line) + l + 1);
6295	if (content) {
6296	strcpy ((char )content, (char )content_line);
6297	strcat ((char )content, (char )encoding);
6298	htmlCheckEncoding (ctxt, content);
6299	xmlFree (content);
6300	}
6301	}
6302	}
6303
6304	return(ctxt);
6305	}
6306
6307	/**
6308	* htmlSAXParseFile:
6309	* @filename: the filename
6310	* @encoding: a free form C string describing the HTML document encoding, or NULL
6311	* @sax: the SAX handler block
6312	* @userData: if using SAX, this pointer will be provided on callbacks.
6313	*
6314	* parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6315	* compressed document is provided by default if found at compile-time.
6316	* It use the given SAX function block to handle the parsing callback.
6317	* If sax is NULL, fallback to the default DOM tree building routines.
6318	*
6319	* Returns the resulting document tree unless SAX is NULL or the document is
6320	* not well formed.
6321	*/
6322
6323	htmlDocPtr
6324	htmlSAXParseFile(const char filename, const char encoding, htmlSAXHandlerPtr sax,
6325	void *userData) {
6326	htmlDocPtr ret;
6327	htmlParserCtxtPtr ctxt;
6328	htmlSAXHandlerPtr oldsax = NULL;
6329
6330	xmlInitParser();
6331
6332	ctxt = htmlCreateFileParserCtxt(filename, encoding);
6333	if (ctxt == NULL) return(NULL);
6334	if (sax != NULL) {
6335	oldsax = ctxt->sax;
6336	ctxt->sax = sax;
6337	ctxt->userData = userData;
6338	}
6339
6340	htmlParseDocument(ctxt);
6341
6342	ret = ctxt->myDoc;
6343	if (sax != NULL) {
6344	ctxt->sax = oldsax;
6345	ctxt->userData = NULL;
6346	}
6347	htmlFreeParserCtxt(ctxt);
6348
6349	return(ret);
6350	}
6351
6352	/**
6353	* htmlParseFile:
6354	* @filename: the filename
6355	* @encoding: a free form C string describing the HTML document encoding, or NULL
6356	*
6357	* parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6358	* compressed document is provided by default if found at compile-time.
6359	*
6360	* Returns the resulting document tree
6361	*/
6362
6363	htmlDocPtr
6364	htmlParseFile(const char filename, const char encoding) {
6365	return(htmlSAXParseFile(filename, encoding, NULL, NULL));
6366	}
6367
6368	/**
6369	* htmlHandleOmittedElem:
6370	* @val: int 0 or 1
6371	*
6372	* Set and return the previous value for handling HTML omitted tags.
6373	*
6374	* Returns the last value for 0 for no handling, 1 for auto insertion.
6375	*/
6376
6377	int
6378	htmlHandleOmittedElem(int val) {
6379	int old = htmlOmittedDefaultValue;
6380
6381	htmlOmittedDefaultValue = val;
6382	return(old);
6383	}
6384
6385	/**
6386	* htmlElementAllowedHere:
6387	* @parent: HTML parent element
6388	* @elt: HTML element
6389	*
6390	* Checks whether an HTML element may be a direct child of a parent element.
6391	* Note - doesn't check for deprecated elements
6392	*
6393	* Returns 1 if allowed; 0 otherwise.
6394	*/
6395	int
6396	htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
6397	const char** p ;
6398
6399	if ( ! elt \|\| ! parent \|\| ! parent->subelts )
6400	return 0 ;
6401
6402	for ( p = parent->subelts; *p; ++p )
6403	if ( !xmlStrcmp((const xmlChar )p, elt) )
6404	return 1 ;
6405
6406	return 0 ;
6407	}
6408	/**
6409	* htmlElementStatusHere:
6410	* @parent: HTML parent element
6411	* @elt: HTML element
6412	*
6413	* Checks whether an HTML element may be a direct child of a parent element.
6414	* and if so whether it is valid or deprecated.
6415	*
6416	* Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6417	*/
6418	htmlStatus
6419	htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
6420	if ( ! parent \|\| ! elt )
6421	return HTML_INVALID ;
6422	if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
6423	return HTML_INVALID ;
6424
6425	return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
6426	}
6427	/**
6428	* htmlAttrAllowed:
6429	* @elt: HTML element
6430	* @attr: HTML attribute
6431	* @legacy: whether to allow deprecated attributes
6432	*
6433	* Checks whether an attribute is valid for an element
6434	* Has full knowledge of Required and Deprecated attributes
6435	*
6436	* Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6437	*/
6438	htmlStatus
6439	htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
6440	const char** p ;
6441
6442	if ( !elt \|\| ! attr )
6443	return HTML_INVALID ;
6444
6445	if ( elt->attrs_req )
6446	for ( p = elt->attrs_req; *p; ++p)
6447	if ( !xmlStrcmp((const xmlChar)p, attr) )
6448	return HTML_REQUIRED ;
6449
6450	if ( elt->attrs_opt )
6451	for ( p = elt->attrs_opt; *p; ++p)
6452	if ( !xmlStrcmp((const xmlChar)p, attr) )
6453	return HTML_VALID ;
6454
6455	if ( legacy && elt->attrs_depr )
6456	for ( p = elt->attrs_depr; *p; ++p)
6457	if ( !xmlStrcmp((const xmlChar)p, attr) )
6458	return HTML_DEPRECATED ;
6459
6460	return HTML_INVALID ;
6461	}
6462	/**
6463	* htmlNodeStatus:
6464	* @node: an htmlNodePtr in a tree
6465	* @legacy: whether to allow deprecated elements (YES is faster here
6466	* for Element nodes)
6467	*
6468	* Checks whether the tree node is valid. Experimental (the author
6469	* only uses the HTML enhancements in a SAX parser)
6470	*
6471	* Return: for Element nodes, a return from htmlElementAllowedHere (if
6472	* legacy allowed) or htmlElementStatusHere (otherwise).
6473	* for Attribute nodes, a return from htmlAttrAllowed
6474	* for other nodes, HTML_NA (no checks performed)
6475	*/
6476	htmlStatus
6477	htmlNodeStatus(const htmlNodePtr node, int legacy) {
6478	if ( ! node )
6479	return HTML_INVALID ;
6480
6481	switch ( node->type ) {
6482	case XML_ELEMENT_NODE:
6483	return legacy
6484	? ( htmlElementAllowedHere (
6485	htmlTagLookup(node->parent->name) , node->name
6486	) ? HTML_VALID : HTML_INVALID )
6487	: htmlElementStatusHere(
6488	htmlTagLookup(node->parent->name) ,
6489	htmlTagLookup(node->name) )
6490	;
6491	case XML_ATTRIBUTE_NODE:
6492	return htmlAttrAllowed(
6493	htmlTagLookup(node->parent->name) , node->name, legacy) ;
6494	default: return HTML_NA ;
6495	}
6496	}
6497	/************************************************************************
6498	* *
6499	* New set (2.6.0) of simpler and more flexible APIs *
6500	* *
6501	************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. A NULL @str is ignored, and a NULL "dict" means the
 * string is always freed.
 *
 * The body is wrapped in do { } while (0) so the macro expands to exactly
 * one statement: it must be followed by a semicolon and can be used safely
 * as the body of an if/else.
 */
#define DICT_FREE(str)						\
    do {							\
        if ((str) && ((!dict) ||				\
            (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
            xmlFree((char *)(str));				\
    } while (0)
6513
6514	/**
6515	* htmlCtxtReset:
6516	* @ctxt: an HTML parser context
6517	*
6518	* Reset a parser context
6519	*/
6520	void
6521	htmlCtxtReset(htmlParserCtxtPtr ctxt)
6522	{
6523	xmlParserInputPtr input;
6524	xmlDictPtr dict;
6525
6526	if (ctxt == NULL)
6527	return;
6528
6529	xmlInitParser();
6530	dict = ctxt->dict;
6531
6532	while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
6533	xmlFreeInputStream(input);
6534	}
6535	ctxt->inputNr = 0;
6536	ctxt->input = NULL;
6537
6538	ctxt->spaceNr = 0;
6539	if (ctxt->spaceTab != NULL) {
6540	ctxt->spaceTab[0] = -1;
6541	ctxt->space = &ctxt->spaceTab[0];
6542	} else {
6543	ctxt->space = NULL;
6544	}
6545
6546
6547	ctxt->nodeNr = 0;
6548	ctxt->node = NULL;
6549
6550	ctxt->nameNr = 0;
6551	ctxt->name = NULL;
6552
6553	DICT_FREE(ctxt->version);
6554	ctxt->version = NULL;
6555	DICT_FREE(ctxt->encoding);
6556	ctxt->encoding = NULL;
6557	DICT_FREE(ctxt->directory);
6558	ctxt->directory = NULL;
6559	DICT_FREE(ctxt->extSubURI);
6560	ctxt->extSubURI = NULL;
6561	DICT_FREE(ctxt->extSubSystem);
6562	ctxt->extSubSystem = NULL;
6563	if (ctxt->myDoc != NULL)
6564	xmlFreeDoc(ctxt->myDoc);
6565	ctxt->myDoc = NULL;
6566
6567	ctxt->standalone = -1;
6568	ctxt->hasExternalSubset = 0;
6569	ctxt->hasPErefs = 0;
6570	ctxt->html = 1;
6571	ctxt->external = 0;
6572	ctxt->instate = XML_PARSER_START;
6573	ctxt->token = 0;
6574
6575	ctxt->wellFormed = 1;
6576	ctxt->nsWellFormed = 1;
6577	ctxt->disableSAX = 0;
6578	ctxt->valid = 1;
6579	ctxt->vctxt.userData = ctxt;
6580	ctxt->vctxt.error = xmlParserValidityError;
6581	ctxt->vctxt.warning = xmlParserValidityWarning;
6582	ctxt->record_info = 0;
6583	ctxt->nbChars = 0;
6584	ctxt->checkIndex = 0;
6585	ctxt->inSubset = 0;
6586	ctxt->errNo = XML_ERR_OK;
6587	ctxt->depth = 0;
6588	ctxt->charset = XML_CHAR_ENCODING_NONE;
6589	ctxt->catalogs = NULL;
6590	xmlInitNodeInfoSeq(&ctxt->node_seq);
6591
6592	if (ctxt->attsDefault != NULL) {
6593	xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
6594	ctxt->attsDefault = NULL;
6595	}
6596	if (ctxt->attsSpecial != NULL) {
6597	xmlHashFree(ctxt->attsSpecial, NULL);
6598	ctxt->attsSpecial = NULL;
6599	}
6600	}
6601
6602	/**
6603	* htmlCtxtUseOptions:
6604	* @ctxt: an HTML parser context
6605	* @options: a combination of htmlParserOption(s)
6606	*
6607	* Applies the options to the parser context
6608	*
6609	* Returns 0 in case of success, the set of unknown or unimplemented options
6610	* in case of error.
6611	*/
6612	int
6613	htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6614	{
6615	if (ctxt == NULL)
6616	return(-1);
6617
6618	if (options & HTML_PARSE_NOWARNING) {
6619	ctxt->sax->warning = NULL;
6620	ctxt->vctxt.warning = NULL;
6621	options -= XML_PARSE_NOWARNING;
6622	ctxt->options \|= XML_PARSE_NOWARNING;
6623	}
6624	if (options & HTML_PARSE_NOERROR) {
6625	ctxt->sax->error = NULL;
6626	ctxt->vctxt.error = NULL;
6627	ctxt->sax->fatalError = NULL;
6628	options -= XML_PARSE_NOERROR;
6629	ctxt->options \|= XML_PARSE_NOERROR;
6630	}
6631	if (options & HTML_PARSE_PEDANTIC) {
6632	ctxt->pedantic = 1;
6633	options -= XML_PARSE_PEDANTIC;
6634	ctxt->options \|= XML_PARSE_PEDANTIC;
6635	} else
6636	ctxt->pedantic = 0;
6637	if (options & XML_PARSE_NOBLANKS) {
6638	ctxt->keepBlanks = 0;
6639	ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6640	options -= XML_PARSE_NOBLANKS;
6641	ctxt->options \|= XML_PARSE_NOBLANKS;
6642	} else
6643	ctxt->keepBlanks = 1;
6644	if (options & HTML_PARSE_RECOVER) {
6645	ctxt->recovery = 1;
6646	options -= HTML_PARSE_RECOVER;
6647	} else
6648	ctxt->recovery = 0;
6649	if (options & HTML_PARSE_COMPACT) {
6650	ctxt->options \|= HTML_PARSE_COMPACT;
6651	options -= HTML_PARSE_COMPACT;
6652	}
6653	if (options & XML_PARSE_HUGE) {
6654	ctxt->options \|= XML_PARSE_HUGE;
6655	options -= XML_PARSE_HUGE;
6656	}
6657	if (options & HTML_PARSE_NODEFDTD) {
6658	ctxt->options \|= HTML_PARSE_NODEFDTD;
6659	options -= HTML_PARSE_NODEFDTD;
6660	}
6661	if (options & HTML_PARSE_IGNORE_ENC) {
6662	ctxt->options \|= HTML_PARSE_IGNORE_ENC;
6663	options -= HTML_PARSE_IGNORE_ENC;
6664	}
6665	if (options & HTML_PARSE_NOIMPLIED) {
6666	ctxt->options \|= HTML_PARSE_NOIMPLIED;
6667	options -= HTML_PARSE_NOIMPLIED;
6668	}
6669	ctxt->dictNames = 0;
6670	return (options);
6671	}
6672
6673	/**
6674	* htmlDoRead:
6675	* @ctxt: an HTML parser context
6676	* @URL: the base URL to use for the document
6677	* @encoding: the document encoding, or NULL
6678	* @options: a combination of htmlParserOption(s)
6679	* @reuse: keep the context for reuse
6680	*
6681	* Common front-end for the htmlRead functions
6682	*
6683	* Returns the resulting document tree or NULL
6684	*/
6685	static htmlDocPtr
6686	htmlDoRead(htmlParserCtxtPtr ctxt, const char URL, const char encoding,
6687	int options, int reuse)
6688	{
6689	htmlDocPtr ret;
6690
6691	htmlCtxtUseOptions(ctxt, options);
6692	ctxt->html = 1;
6693	if (encoding != NULL) {
6694	xmlCharEncodingHandlerPtr hdlr;
6695
6696	hdlr = xmlFindCharEncodingHandler(encoding);
6697	if (hdlr != NULL) {
6698	xmlSwitchToEncoding(ctxt, hdlr);
6699	if (ctxt->input->encoding != NULL)
6700	xmlFree((xmlChar *) ctxt->input->encoding);
6701	ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
6702	}
6703	}
6704	if ((URL != NULL) && (ctxt->input != NULL) &&
6705	(ctxt->input->filename == NULL))
6706	ctxt->input->filename = (char ) xmlStrdup((const xmlChar ) URL);
6707	htmlParseDocument(ctxt);
6708	ret = ctxt->myDoc;
6709	ctxt->myDoc = NULL;
6710	if (!reuse) {
6711	if ((ctxt->dictNames) &&
6712	(ret != NULL) &&
6713	(ret->dict == ctxt->dict))
6714	ctxt->dict = NULL;
6715	xmlFreeParserCtxt(ctxt);
6716	}
6717	return (ret);
6718	}
6719
6720	/**
6721	* htmlReadDoc:
6722	* @cur: a pointer to a zero terminated string
6723	* @URL: the base URL to use for the document
6724	* @encoding: the document encoding, or NULL
6725	* @options: a combination of htmlParserOption(s)
6726	*
6727	* parse an XML in-memory document and build a tree.
6728	*
6729	* Returns the resulting document tree
6730	*/
6731	htmlDocPtr
6732	htmlReadDoc(const xmlChar * cur, const char URL, const char encoding, int options)
6733	{
6734	htmlParserCtxtPtr ctxt;
6735
6736	if (cur == NULL)
6737	return (NULL);
6738
6739	xmlInitParser();
6740	ctxt = htmlCreateDocParserCtxt(cur, NULL);
6741	if (ctxt == NULL)
6742	return (NULL);
6743	return (htmlDoRead(ctxt, URL, encoding, options, 0));
6744	}
6745
6746	/**
6747	* htmlReadFile:
6748	* @filename: a file or URL
6749	* @encoding: the document encoding, or NULL
6750	* @options: a combination of htmlParserOption(s)
6751	*
6752	* parse an XML file from the filesystem or the network.
6753	*
6754	* Returns the resulting document tree
6755	*/
6756	htmlDocPtr
6757	htmlReadFile(const char filename, const char encoding, int options)
6758	{
6759	htmlParserCtxtPtr ctxt;
6760
6761	xmlInitParser();
6762	ctxt = htmlCreateFileParserCtxt(filename, encoding);
6763	if (ctxt == NULL)
6764	return (NULL);
6765	return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6766	}
6767
6768	/**
6769	* htmlReadMemory:
6770	* @buffer: a pointer to a char array
6771	* @size: the size of the array
6772	* @URL: the base URL to use for the document
6773	* @encoding: the document encoding, or NULL
6774	* @options: a combination of htmlParserOption(s)
6775	*
6776	* parse an XML in-memory document and build a tree.
6777	*
6778	* Returns the resulting document tree
6779	*/
6780	htmlDocPtr
6781	htmlReadMemory(const char buffer, int size, const char URL, const char *encoding, int options)
6782	{
6783	htmlParserCtxtPtr ctxt;
6784
6785	xmlInitParser();
6786	ctxt = xmlCreateMemoryParserCtxt(buffer, size);
6787	if (ctxt == NULL)
6788	return (NULL);
6789	htmlDefaultSAXHandlerInit();
6790	if (ctxt->sax != NULL)
6791	memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
6792	return (htmlDoRead(ctxt, URL, encoding, options, 0));
6793	}
6794
6795	/**
6796	* htmlReadFd:
6797	* @fd: an open file descriptor
6798	* @URL: the base URL to use for the document
6799	* @encoding: the document encoding, or NULL
6800	* @options: a combination of htmlParserOption(s)
6801	*
6802	* parse an XML from a file descriptor and build a tree.
6803	*
6804	* Returns the resulting document tree
6805	*/
6806	htmlDocPtr
6807	htmlReadFd(int fd, const char URL, const char encoding, int options)
6808	{
6809	htmlParserCtxtPtr ctxt;
6810	xmlParserInputBufferPtr input;
6811	xmlParserInputPtr stream;
6812
6813	if (fd < 0)
6814	return (NULL);
6815	xmlInitParser();
6816
6817	xmlInitParser();
6818	input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6819	if (input == NULL)
6820	return (NULL);
6821	ctxt = xmlNewParserCtxt();
6822	if (ctxt == NULL) {
6823	xmlFreeParserInputBuffer(input);
6824	return (NULL);
6825	}
6826	stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6827	if (stream == NULL) {
6828	xmlFreeParserInputBuffer(input);
6829	xmlFreeParserCtxt(ctxt);
6830	return (NULL);
6831	}
6832	inputPush(ctxt, stream);
6833	return (htmlDoRead(ctxt, URL, encoding, options, 0));
6834	}
6835
6836	/**
6837	* htmlReadIO:
6838	* @ioread: an I/O read function
6839	* @ioclose: an I/O close function
6840	* @ioctx: an I/O handler
6841	* @URL: the base URL to use for the document
6842	* @encoding: the document encoding, or NULL
6843	* @options: a combination of htmlParserOption(s)
6844	*
6845	* parse an HTML document from I/O functions and source and build a tree.
6846	*
6847	* Returns the resulting document tree
6848	*/
6849	htmlDocPtr
6850	htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6851	void ioctx, const char URL, const char *encoding, int options)
6852	{
6853	htmlParserCtxtPtr ctxt;
6854	xmlParserInputBufferPtr input;
6855	xmlParserInputPtr stream;
6856
6857	if (ioread == NULL)
6858	return (NULL);
6859	xmlInitParser();
6860
6861	input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6862	XML_CHAR_ENCODING_NONE);
6863	if (input == NULL) {
6864	if (ioclose != NULL)
6865	ioclose(ioctx);
6866	return (NULL);
6867	}
6868	ctxt = htmlNewParserCtxt();
6869	if (ctxt == NULL) {
6870	xmlFreeParserInputBuffer(input);
6871	return (NULL);
6872	}
6873	stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6874	if (stream == NULL) {
6875	xmlFreeParserInputBuffer(input);
6876	xmlFreeParserCtxt(ctxt);
6877	return (NULL);
6878	}
6879	inputPush(ctxt, stream);
6880	return (htmlDoRead(ctxt, URL, encoding, options, 0));
6881	}
6882
6883	/**
6884	* htmlCtxtReadDoc:
6885	* @ctxt: an HTML parser context
6886	* @cur: a pointer to a zero terminated string
6887	* @URL: the base URL to use for the document
6888	* @encoding: the document encoding, or NULL
6889	* @options: a combination of htmlParserOption(s)
6890	*
6891	* parse an XML in-memory document and build a tree.
6892	* This reuses the existing @ctxt parser context
6893	*
6894	* Returns the resulting document tree
6895	*/
6896	htmlDocPtr
6897	htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
6898	const char URL, const char encoding, int options)
6899	{
6900	xmlParserInputPtr stream;
6901
6902	if (cur == NULL)
6903	return (NULL);
6904	if (ctxt == NULL)
6905	return (NULL);
6906	xmlInitParser();
6907
6908	htmlCtxtReset(ctxt);
6909
6910	stream = xmlNewStringInputStream(ctxt, cur);
6911	if (stream == NULL) {
6912	return (NULL);
6913	}
6914	inputPush(ctxt, stream);
6915	return (htmlDoRead(ctxt, URL, encoding, options, 1));
6916	}
6917
6918	/**
6919	* htmlCtxtReadFile:
6920	* @ctxt: an HTML parser context
6921	* @filename: a file or URL
6922	* @encoding: the document encoding, or NULL
6923	* @options: a combination of htmlParserOption(s)
6924	*
6925	* parse an XML file from the filesystem or the network.
6926	* This reuses the existing @ctxt parser context
6927	*
6928	* Returns the resulting document tree
6929	*/
6930	htmlDocPtr
6931	htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6932	const char *encoding, int options)
6933	{
6934	xmlParserInputPtr stream;
6935
6936	if (filename == NULL)
6937	return (NULL);
6938	if (ctxt == NULL)
6939	return (NULL);
6940	xmlInitParser();
6941
6942	htmlCtxtReset(ctxt);
6943
6944	stream = xmlLoadExternalEntity(filename, NULL, ctxt);
6945	if (stream == NULL) {
6946	return (NULL);
6947	}
6948	inputPush(ctxt, stream);
6949	return (htmlDoRead(ctxt, NULL, encoding, options, 1));
6950	}
6951
6952	/**
6953	* htmlCtxtReadMemory:
6954	* @ctxt: an HTML parser context
6955	* @buffer: a pointer to a char array
6956	* @size: the size of the array
6957	* @URL: the base URL to use for the document
6958	* @encoding: the document encoding, or NULL
6959	* @options: a combination of htmlParserOption(s)
6960	*
6961	* parse an XML in-memory document and build a tree.
6962	* This reuses the existing @ctxt parser context
6963	*
6964	* Returns the resulting document tree
6965	*/
6966	htmlDocPtr
6967	htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
6968	const char URL, const char encoding, int options)
6969	{
6970	xmlParserInputBufferPtr input;
6971	xmlParserInputPtr stream;
6972
6973	if (ctxt == NULL)
6974	return (NULL);
6975	if (buffer == NULL)
6976	return (NULL);
6977	xmlInitParser();
6978
6979	htmlCtxtReset(ctxt);
6980
6981	input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
6982	if (input == NULL) {
6983	return(NULL);
6984	}
6985
6986	stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6987	if (stream == NULL) {
6988	xmlFreeParserInputBuffer(input);
6989	return(NULL);
6990	}
6991
6992	inputPush(ctxt, stream);
6993	return (htmlDoRead(ctxt, URL, encoding, options, 1));
6994	}
6995
6996	/**
6997	* htmlCtxtReadFd:
6998	* @ctxt: an HTML parser context
6999	* @fd: an open file descriptor
7000	* @URL: the base URL to use for the document
7001	* @encoding: the document encoding, or NULL
7002	* @options: a combination of htmlParserOption(s)
7003	*
7004	* parse an XML from a file descriptor and build a tree.
7005	* This reuses the existing @ctxt parser context
7006	*
7007	* Returns the resulting document tree
7008	*/
7009	htmlDocPtr
7010	htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
7011	const char URL, const char encoding, int options)
7012	{
7013	xmlParserInputBufferPtr input;
7014	xmlParserInputPtr stream;
7015
7016	if (fd < 0)
7017	return (NULL);
7018	if (ctxt == NULL)
7019	return (NULL);
7020	xmlInitParser();
7021
7022	htmlCtxtReset(ctxt);
7023
7024
7025	input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
7026	if (input == NULL)
7027	return (NULL);
7028	stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7029	if (stream == NULL) {
7030	xmlFreeParserInputBuffer(input);
7031	return (NULL);
7032	}
7033	inputPush(ctxt, stream);
7034	return (htmlDoRead(ctxt, URL, encoding, options, 1));
7035	}
7036
7037	/**
7038	* htmlCtxtReadIO:
7039	* @ctxt: an HTML parser context
7040	* @ioread: an I/O read function
7041	* @ioclose: an I/O close function
7042	* @ioctx: an I/O handler
7043	* @URL: the base URL to use for the document
7044	* @encoding: the document encoding, or NULL
7045	* @options: a combination of htmlParserOption(s)
7046	*
7047	* parse an HTML document from I/O functions and source and build a tree.
7048	* This reuses the existing @ctxt parser context
7049	*
7050	* Returns the resulting document tree
7051	*/
7052	htmlDocPtr
7053	htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
7054	xmlInputCloseCallback ioclose, void *ioctx,
7055	const char *URL,
7056	const char *encoding, int options)
7057	{
7058	xmlParserInputBufferPtr input;
7059	xmlParserInputPtr stream;
7060
7061	if (ioread == NULL)
7062	return (NULL);
7063	if (ctxt == NULL)
7064	return (NULL);
7065	xmlInitParser();
7066
7067	htmlCtxtReset(ctxt);
7068
7069	input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
7070	XML_CHAR_ENCODING_NONE);
7071	if (input == NULL) {
7072	if (ioclose != NULL)
7073	ioclose(ioctx);
7074	return (NULL);
7075	}
7076	stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7077	if (stream == NULL) {
7078	xmlFreeParserInputBuffer(input);
7079	return (NULL);
7080	}
7081	inputPush(ctxt, stream);
7082	return (htmlDoRead(ctxt, URL, encoding, options, 1));
7083	}
7084
7085	#define bottom_HTMLparser
7086	#include "elfgcchack.h"
7087	#endif /* LIBXML_HTML_ENABLED */

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/libs/libxml2-2.9.2/HTMLparser.c@ 62281

Download in other formats: