HTMLparser.c@ 49482

Last change on this file since 49482 was 39915, checked in by vboxsync, 13 years ago
libxml-2.6.31 unmodified
Property svn:eol-style set to `native`
File size: 183.4 KB

Line
1	/*
2	* HTMLparser.c : an HTML 4.0 non-verifying parser
3	*
4	* See Copyright for the status of this software.
5	*
6	* daniel@veillard.com
7	*/
8
9	#define IN_LIBXML
10	#include "libxml.h"
11	#ifdef LIBXML_HTML_ENABLED
12
13	#include <string.h>
14	#ifdef HAVE_CTYPE_H
15	#include <ctype.h>
16	#endif
17	#ifdef HAVE_STDLIB_H
18	#include <stdlib.h>
19	#endif
20	#ifdef HAVE_SYS_STAT_H
21	#include <sys/stat.h>
22	#endif
23	#ifdef HAVE_FCNTL_H
24	#include <fcntl.h>
25	#endif
26	#ifdef HAVE_UNISTD_H
27	#include <unistd.h>
28	#endif
29	#ifdef HAVE_ZLIB_H
30	#include <zlib.h>
31	#endif
32
33	#include <libxml/xmlmemory.h>
34	#include <libxml/tree.h>
35	#include <libxml/parser.h>
36	#include <libxml/parserInternals.h>
37	#include <libxml/xmlerror.h>
38	#include <libxml/HTMLparser.h>
39	#include <libxml/HTMLtree.h>
40	#include <libxml/entities.h>
41	#include <libxml/encoding.h>
42	#include <libxml/valid.h>
43	#include <libxml/xmlIO.h>
44	#include <libxml/globals.h>
45	#include <libxml/uri.h>
46
/*
 * Size limits used by the HTML parser.
 * NOTE(review): the names suggest a cap on name length and two working
 * buffer sizes; the users of these constants are outside this section,
 * so confirm against the call sites.
 */
#define HTML_PARSER_BUFFER_SIZE 100
#define HTML_PARSER_BIG_BUFFER_SIZE 1000
#define HTML_MAX_NAMELEN 1000

/* Debugging switches, disabled by default. */
/* #define DEBUG */
/* #define DEBUG_PUSH */

/*
 * File-local flag, initialised to 1.
 * NOTE(review): presumably controls handling of omitted default values;
 * it is not read anywhere in this section.
 */
static int htmlOmittedDefaultValue = 1;
55
56	xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
57	xmlChar end, xmlChar end2, xmlChar end3);
58	static void htmlParseComment(htmlParserCtxtPtr ctxt);
59
60	/************************************************************************
61	* *
62	* Some factorized error routines *
63	* *
64	************************************************************************/
65
66	/**
67	* htmlErrMemory:
68	* @ctxt: an HTML parser context
69	* @extra: extra informations
70	*
71	* Handle a redefinition of attribute error
72	*/
73	static void
74	htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
75	{
76	if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
77	(ctxt->instate == XML_PARSER_EOF))
78	return;
79	if (ctxt != NULL) {
80	ctxt->errNo = XML_ERR_NO_MEMORY;
81	ctxt->instate = XML_PARSER_EOF;
82	ctxt->disableSAX = 1;
83	}
84	if (extra)
85	__xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
86	XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
87	NULL, NULL, 0, 0,
88	"Memory allocation failed : %s\n", extra);
89	else
90	__xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
91	XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
92	NULL, NULL, 0, 0, "Memory allocation failed\n");
93	}
94
95	/**
96	* htmlParseErr:
97	* @ctxt: an HTML parser context
98	* @error: the error number
99	* @msg: the error message
100	* @str1: string infor
101	* @str2: string infor
102	*
103	* Handle a fatal parser error, i.e. violating Well-Formedness constraints
104	*/
105	static void
106	htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
107	const char msg, const xmlChar str1, const xmlChar *str2)
108	{
109	if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
110	(ctxt->instate == XML_PARSER_EOF))
111	return;
112	if (ctxt != NULL)
113	ctxt->errNo = error;
114	__xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
115	XML_ERR_ERROR, NULL, 0,
116	(const char ) str1, (const char ) str2,
117	NULL, 0, 0,
118	msg, str1, str2);
119	if (ctxt != NULL)
120	ctxt->wellFormed = 0;
121	}
122
123	/**
124	* htmlParseErrInt:
125	* @ctxt: an HTML parser context
126	* @error: the error number
127	* @msg: the error message
128	* @val: integer info
129	*
130	* Handle a fatal parser error, i.e. violating Well-Formedness constraints
131	*/
132	static void
133	htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
134	const char *msg, int val)
135	{
136	if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
137	(ctxt->instate == XML_PARSER_EOF))
138	return;
139	if (ctxt != NULL)
140	ctxt->errNo = error;
141	__xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
142	XML_ERR_ERROR, NULL, 0, NULL, NULL,
143	NULL, val, 0, msg, val);
144	if (ctxt != NULL)
145	ctxt->wellFormed = 0;
146	}
147
148	/************************************************************************
149	* *
150	* Parser stacks related functions and macros *
151	* *
152	************************************************************************/
153
154	/**
155	* htmlnamePush:
156	* @ctxt: an HTML parser context
157	* @value: the element name
158	*
159	* Pushes a new element name on top of the name stack
160	*
161	* Returns 0 in case of error, the index in the stack otherwise
162	*/
163	static int
164	htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
165	{
166	if (ctxt->nameNr >= ctxt->nameMax) {
167	ctxt->nameMax *= 2;
168	ctxt->nameTab = (const xmlChar * *)
169	xmlRealloc((xmlChar * *)ctxt->nameTab,
170	ctxt->nameMax *
171	sizeof(ctxt->nameTab[0]));
172	if (ctxt->nameTab == NULL) {
173	htmlErrMemory(ctxt, NULL);
174	return (0);
175	}
176	}
177	ctxt->nameTab[ctxt->nameNr] = value;
178	ctxt->name = value;
179	return (ctxt->nameNr++);
180	}
181	/**
182	* htmlnamePop:
183	* @ctxt: an HTML parser context
184	*
185	* Pops the top element name from the name stack
186	*
187	* Returns the name just removed
188	*/
189	static const xmlChar *
190	htmlnamePop(htmlParserCtxtPtr ctxt)
191	{
192	const xmlChar *ret;
193
194	if (ctxt->nameNr <= 0)
195	return (NULL);
196	ctxt->nameNr--;
197	if (ctxt->nameNr < 0)
198	return (NULL);
199	if (ctxt->nameNr > 0)
200	ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
201	else
202	ctxt->name = NULL;
203	ret = ctxt->nameTab[ctxt->nameNr];
204	ctxt->nameTab[ctxt->nameNr] = NULL;
205	return (ret);
206	}
207
208	/*
209	* Macros for accessing the content. Those should be used only by the parser,
210	* and not exported.
211	*
212	* Dirty macros, i.e. one need to make assumption on the context to use them
213	*
214	* CUR_PTR return the current pointer to the xmlChar to be parsed.
215	* CUR returns the current xmlChar value, i.e. a 8 bit value if compiled
216	* in ISO-Latin or UTF-8, and the current 16 bit value if compiled
217	* in UNICODE mode. This should be used internally by the parser
218	* only to compare to ASCII values otherwise it would break when
219	* running with UTF-8 encoding.
220	* NXT(n) returns the n'th next xmlChar. Same as CUR it should be used only
221	* to compare on ASCII based substring.
222	* UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR
223	* it should be used only to compare on ASCII based substring.
224	* SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
225	* strings without newlines within the parser.
226	*
227	* Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
228	*
229	* CURRENT Returns the current char value, with the full decoding of
230	* UTF-8 if we are using this mode. It returns an int.
231	* NEXT Skip to the next character, this does the proper decoding
232	* in UTF-8 mode. It also pop-up unfinished entities on the fly.
233	* NEXTL(l) Skip the current unicode character of l xmlChars long.
234	* COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
235	*/
236
/*
 * Low-level accessors on the parser input. All of them expect a variable
 * named `ctxt` (the parser context) to be in scope at the point of use.
 */

/* Current byte, upper-cased with toupper(). */
#define UPPER (toupper(*ctxt->input->cur))

/* Advance the input pointer, the column and the char counter by val. */
#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val)

/* Byte located val positions after the current one. */
#define NXT(val) ctxt->input->cur[(val)]

/* Same as NXT(val), upper-cased with toupper(). */
#define UPP(val) (toupper(ctxt->input->cur[(val)]))

/* Pointer to the current position in the input. */
#define CUR_PTR ctxt->input->cur

/*
 * SHRINK and GROW expand to a bare `if` statement: use them only as a
 * full statement followed by `;`, never as the body of an if/else.
 */
#define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
                   (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
        xmlParserInputShrink(ctxt->input)

#define GROW if ((ctxt->progressive == 0) && \
                 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \
        xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

/* Current byte as an int. */
#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

/* Current byte, or -1 while a pending token is set on the context. */
#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))

/*
 * Skip the current character, which is l bytes long, keeping line and
 * column up to date and clearing any pending token.
 */
#define NEXTL(l) do { \
    if (*(ctxt->input->cur) == '\n') { \
        ctxt->input->line++; ctxt->input->col = 1; \
    } else ctxt->input->col++; \
    ctxt->token = 0; ctxt->input->cur += (l); ctxt->nbChars++; \
  } while (0)

/************
    Disabled reference handling:
    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt);
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

/* l must be an int lvalue: it receives the length of the char read. */
#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)

/*
 * Append char v (l bytes long once encoded) to buffer b at index i and
 * advance i. Expands to a bare if/else: use only as a full statement.
 */
#define COPY_BUF(l,b,i,v) \
    if (l == 1) b[i++] = (xmlChar) v; \
    else i += xmlCopyChar(l,&b[i],v)
289
290	/**
291	* htmlCurrentChar:
292	* @ctxt: the HTML parser context
293	* @len: pointer to the length of the char read
294	*
295	* The current char value, if using UTF-8 this may actually span multiple
296	* bytes in the input buffer. Implement the end of line normalization:
297	* 2.11 End-of-Line Handling
298	* If the encoding is unspecified, in the case we find an ISO-Latin-1
299	* char, then the encoding converter is plugged in automatically.
300	*
301	* Returns the current char value and its length
302	*/
303
304	static int
305	htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
306	if (ctxt->instate == XML_PARSER_EOF)
307	return(0);
308
309	if (ctxt->token != 0) {
310	*len = 0;
311	return(ctxt->token);
312	}
313	if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
314	/*
315	* We are supposed to handle UTF8, check it's valid
316	* From rfc2044: encoding of the Unicode values on UTF-8:
317	*
318	* UCS-4 range (hex.) UTF-8 octet sequence (binary)
319	* 0000 0000-0000 007F 0xxxxxxx
320	* 0000 0080-0000 07FF 110xxxxx 10xxxxxx
321	* 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
322	*
323	* Check for the 0x110000 limit too
324	*/
325	const unsigned char *cur = ctxt->input->cur;
326	unsigned char c;
327	unsigned int val;
328
329	c = *cur;
330	if (c & 0x80) {
331	if (cur[1] == 0)
332	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
333	if ((cur[1] & 0xc0) != 0x80)
334	goto encoding_error;
335	if ((c & 0xe0) == 0xe0) {
336
337	if (cur[2] == 0)
338	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
339	if ((cur[2] & 0xc0) != 0x80)
340	goto encoding_error;
341	if ((c & 0xf0) == 0xf0) {
342	if (cur[3] == 0)
343	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
344	if (((c & 0xf8) != 0xf0) \|\|
345	((cur[3] & 0xc0) != 0x80))
346	goto encoding_error;
347	/* 4-byte code */
348	*len = 4;
349	val = (cur[0] & 0x7) << 18;
350	val \|= (cur[1] & 0x3f) << 12;
351	val \|= (cur[2] & 0x3f) << 6;
352	val \|= cur[3] & 0x3f;
353	} else {
354	/* 3-byte code */
355	*len = 3;
356	val = (cur[0] & 0xf) << 12;
357	val \|= (cur[1] & 0x3f) << 6;
358	val \|= cur[2] & 0x3f;
359	}
360	} else {
361	/* 2-byte code */
362	*len = 2;
363	val = (cur[0] & 0x1f) << 6;
364	val \|= cur[1] & 0x3f;
365	}
366	if (!IS_CHAR(val)) {
367	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
368	"Char 0x%X out of allowed range\n", val);
369	}
370	return(val);
371	} else {
372	/* 1-byte code */
373	*len = 1;
374	return((int) *ctxt->input->cur);
375	}
376	}
377	/*
378	* Assume it's a fixed length encoding (1) with
379	* a compatible encoding for the ASCII set, since
380	* XML constructs only use < 128 chars
381	*/
382	*len = 1;
383	if ((int) *ctxt->input->cur < 0x80)
384	return((int) *ctxt->input->cur);
385
386	/*
387	* Humm this is bad, do an automatic flow conversion
388	*/
389	xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
390	ctxt->charset = XML_CHAR_ENCODING_UTF8;
391	return(xmlCurrentChar(ctxt, len));
392
393	encoding_error:
394	/*
395	* If we detect an UTF8 error that probably mean that the
396	* input encoding didn't get properly advertized in the
397	* declaration header. Report the error and switch the encoding
398	* to ISO-Latin-1 (if you don't like this policy, just declare the
399	* encoding !)
400	*/
401	{
402	char buffer[150];
403
404	if (ctxt->input->end - ctxt->input->cur >= 4) {
405	snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
406	ctxt->input->cur[0], ctxt->input->cur[1],
407	ctxt->input->cur[2], ctxt->input->cur[3]);
408	} else {
409	snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
410	}
411	htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
412	"Input is not proper UTF-8, indicate encoding !\n",
413	BAD_CAST buffer, NULL);
414	}
415
416	ctxt->charset = XML_CHAR_ENCODING_8859_1;
417	*len = 1;
418	return((int) *ctxt->input->cur);
419	}
420
421	/**
422	* htmlSkipBlankChars:
423	* @ctxt: the HTML parser context
424	*
425	* skip all blanks character found at that point in the input streams.
426	*
427	* Returns the number of space chars skipped
428	*/
429
430	static int
431	htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
432	int res = 0;
433
434	while (IS_BLANK_CH(*(ctxt->input->cur))) {
435	if ((*ctxt->input->cur == 0) &&
436	(xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
437	xmlPopInput(ctxt);
438	} else {
439	if (*(ctxt->input->cur) == '\n') {
440	ctxt->input->line++; ctxt->input->col = 1;
441	} else ctxt->input->col++;
442	ctxt->input->cur++;
443	ctxt->nbChars++;
444	if (*ctxt->input->cur == 0)
445	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
446	}
447	res++;
448	}
449	return(res);
450	}
451
452
453
454	/************************************************************************
455	* *
456	* The list of HTML elements and their properties *
457	* *
458	************************************************************************/
459
460	/*
461	* Start Tag: 1 means the start tag can be omitted
462	* End Tag: 1 means the end tag can be omitted
463	* 2 means it's forbidden (empty elements)
464	* 3 means the tag is stylistic and should be closed easily
465	* Depr: this element is deprecated
466	* DTD: 1 means that this element is valid only in the Loose DTD
467	* 2 means that this element is valid only in the Frameset DTD
468	*
469	* Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
470	, subElements , impliedsubelt , Attributes, userdata
471	*/
472
473	/* Definitions and a couple of vars for HTML Elements */
474
/*
 * Element name groups. Each group expands to a comma separated list of
 * string literals and has a matching NB_* entry count. Groups must always
 * be joined with commas: two adjacent string literals are silently
 * concatenated by the compiler (e.g. "menu" "pre" becomes "menupre").
 * PCDATA and MODIFIER are empty and therefore take no comma.
 */
#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 16
#define INLINE PCDATA FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define PCDATA
#define NB_PCDATA 0
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define MODIFIER
#define NB_MODIFIER 0
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL


/* All the lists below are NULL terminated. */
static const char* const html_flow[] = { FLOW, NULL } ;
static const char* const html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* const html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata


/* ... and for HTML Attributes */

#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1

static const char* const html_attrs[] = { ATTRS, NULL } ;
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* const core_attrs[] = { COREATTRS, NULL } ;
static const char* const i18n_attrs[] = { I18N, NULL } ;


/* Other declarations that should go inline ... */
static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
        "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
        "tabindex", "onfocus", "onblur", NULL } ;
static const char* const target_attr[] = { "target", NULL } ;
static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
static const char* const alt_attr[] = { "alt", NULL } ;
static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
static const char* const href_attrs[] = { "href", NULL } ;
static const char* const clear_attrs[] = { "clear", NULL } ;
static const char* const inline_p[] = { INLINE, "p", NULL } ;

static const char* const flow_param[] = { FLOW, "param", NULL } ;
static const char* const applet_attrs[] = { COREATTRS , "codebase",
        "archive", "alt", "name", "height", "width", "align",
        "hspace", "vspace", NULL } ;
static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
        "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
static const char* const basefont_attrs[] =
        { "id", "size", "color", "face", NULL } ;
static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
static const char* const body_depr[] = { "background", "bgcolor", "text",
        "link", "vlink", "alink", NULL } ;
static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
        "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;


static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
static const char* const col_elt[] = { "col", NULL } ;
static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
static const char* const dl_contents[] = { "dt", "dd", NULL } ;
static const char* const compact_attr[] = { "compact", NULL } ;
static const char* const label_attr[] = { "label", NULL } ;
/* The NULL terminator is required: readers stop on the first NULL. */
static const char* const fieldset_contents[] = { FLOW, "legend", NULL } ;
static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
static const char* const head_attrs[] = { I18N, "profile", NULL } ;
static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
static const char* const version_attr[] = { "version", NULL } ;
static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
static const char* const align_attr[] = { "align", NULL } ;
static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
static const char* const map_contents[] = { BLOCK, "area", NULL } ;
static const char* const name_attr[] = { "name", NULL } ;
static const char* const action_attr[] = { "action", NULL } ;
static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
static const char* const content_attr[] = { "content", NULL } ;
static const char* const type_attr[] = { "type", NULL } ;
static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
static const char* const object_contents[] = { FLOW, "param", NULL } ;
static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
static const char* const option_elt[] = { "option", NULL } ;
static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
static const char* const width_attr[] = { "width", NULL } ;
static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
static const char* const language_attr[] = { "language", NULL } ;
static const char* const select_content[] = { "optgroup", "option", NULL } ;
static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
/* The comma after ATTRS keeps "onkeyup" and "summary" separate entries. */
static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
static const char* const tr_elt[] = { "tr", NULL } ;
static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
static const char* const tr_contents[] = { "th", "td", NULL } ;
static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
static const char* const li_elt[] = { "li", NULL } ;
static const char* const ul_depr[] = { "type", "compact", NULL} ;
static const char* const dir_attr[] = { "dir", NULL} ;
622
623	#define DECL (const char**)
624
625	static const htmlElemDesc
626	html40ElementTable[] = {
627	{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
628	DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
629	},
630	{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
631	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
632	},
633	{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
634	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
635	},
636	{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
637	DECL inline_p , NULL , DECL html_attrs, NULL, NULL
638	},
639	{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
640	DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
641	},
642	{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
643	EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
644	},
645	{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
646	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
647	},
648	{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
649	EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
650	},
651	{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
652	EMPTY , NULL , NULL, DECL basefont_attrs, NULL
653	},
654	{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
655	DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
656	},
657	{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
658	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
659	},
660	{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
661	DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
662	},
663	{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
664	DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
665	},
666	{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
667	EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
668	},
669	{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
670	DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
671	},
672	{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
673	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
674	},
675	{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
676	DECL html_flow , NULL , NULL, DECL html_attrs, NULL
677	},
678	{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
679	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
680	},
681	{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
682	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
683	},
684	{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
685	EMPTY , NULL , DECL col_attrs , NULL, NULL
686	},
687	{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
688	DECL col_elt , "col" , DECL col_attrs , NULL, NULL
689	},
690	{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
691	DECL html_flow , NULL , DECL html_attrs, NULL, NULL
692	},
693	{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
694	DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
695	},
696	{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
697	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
698	},
699	{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
700	DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
701	},
702	{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
703	DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
704	},
705	{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
706	DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
707	},
708	{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
709	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
710	},
711	{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
712	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
713	},
714	{ "embed", 0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
715	EMPTY, NULL, DECL embed_attrs, NULL, NULL
716	},
717	{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
718	DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
719	},
720	{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
721	DECL html_inline, NULL, NULL, DECL font_attrs, NULL
722	},
723	{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
724	DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
725	},
726	{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
727	EMPTY, NULL, NULL, DECL frame_attrs, NULL
728	},
729	{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
730	DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
731	},
732	{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
733	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
734	},
735	{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
736	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
737	},
738	{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
739	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
740	},
741	{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
742	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
743	},
744	{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
745	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
746	},
747	{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
748	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
749	},
750	{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
751	DECL head_contents, NULL, DECL head_attrs, NULL, NULL
752	},
753	{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
754	EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
755	},
756	{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
757	DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
758	},
759	{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
760	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
761	},
762	{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
763	DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
764	},
765	{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
766	EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
767	},
768	{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
769	EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
770	},
771	{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
772	DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
773	},
774	{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
775	EMPTY, NULL, NULL, DECL prompt_attrs, NULL
776	},
777	{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
778	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
779	},
780	{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
781	DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
782	},
783	{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
784	DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
785	},
786	{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
787	DECL html_flow, NULL, DECL html_attrs, NULL, NULL
788	},
789	{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
790	EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
791	},
792	{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
793	DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
794	},
795	{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
796	DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
797	},
798	{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
799	EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
800	},
801	{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
802	DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
803	},
804	{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
805	DECL html_flow, "div", DECL html_attrs, NULL, NULL
806	},
807	{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
808	DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
809	},
810	{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
811	DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
812	},
813	{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
814	DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
815	},
816	{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
817	DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
818	},
819	{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
820	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
821	},
822	{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
823	EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
824	},
825	{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
826	DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
827	},
828	{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
829	DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
830	},
831	{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
832	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
833	},
834	{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
835	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
836	},
837	{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
838	DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
839	},
840	{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
841	DECL select_content, NULL, DECL select_attrs, NULL, NULL
842	},
843	{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
844	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
845	},
846	{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
847	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
848	},
849	{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
850	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
851	},
852	{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
853	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
854	},
855	{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
856	DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
857	},
858	{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
859	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
860	},
861	{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
862	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
863	},
864	{ "table", 0, 0, 0, 0, 0, 0, 0, "",
865	DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
866	},
867	{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
868	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
869	},
870	{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
871	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
872	},
873	{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
874	DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
875	},
876	{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
877	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
878	},
879	{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
880	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
881	},
882	{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
883	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
884	},
885	{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
886	DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
887	},
888	{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
889	DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
890	},
891	{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
892	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
893	},
894	{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
895	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
896	},
897	{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
898	DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
899	},
900	{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
901	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
902	}
903	};
904
/*
 * Auto-close table: start tags that imply the end of the current element.
 *
 * The array is a flat sequence of NULL-terminated groups. The first string
 * of each group is the tag being opened; the strings that follow it are the
 * currently open elements which that start tag implicitly closes. An extra
 * NULL after the last group terminates the whole table.
 */
static const char * const htmlStartClose[] = {
    "form",
        "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
        "dl", "ul", "ol", "menu", "dir", "address", "pre",
        "listing", "xmp", "head",
        NULL,
    "head",
        "p",
        NULL,
    "title",
        "p",
        NULL,
    "body",
        "head", "style", "link", "title", "p",
        NULL,
    "frameset",
        "head", "style", "link", "title", "p",
        NULL,
    "li",
        "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
        "pre", "listing", "xmp", "head", "li",
        NULL,
    "hr",
        "p", "head",
        NULL,
    "h1",
        "p", "head",
        NULL,
    "h2",
        "p", "head",
        NULL,
    "h3",
        "p", "head",
        NULL,
    "h4",
        "p", "head",
        NULL,
    "h5",
        "p", "head",
        NULL,
    "h6",
        "p", "head",
        NULL,
    "dir",
        "p", "head",
        NULL,
    "address",
        "p", "head", "ul",
        NULL,
    "pre",
        "p", "head", "ul",
        NULL,
    "listing",
        "p", "head",
        NULL,
    "xmp",
        "p", "head",
        NULL,
    "blockquote",
        "p", "head",
        NULL,
    "dl",
        "p", "dt", "menu", "dir", "address", "pre", "listing",
        "xmp", "head",
        NULL,
    "dt",
        "p", "menu", "dir", "address", "pre", "listing", "xmp",
        "head", "dd",
        NULL,
    "dd",
        "p", "menu", "dir", "address", "pre", "listing", "xmp",
        "head", "dt",
        NULL,
    "ul",
        "p", "head", "ol", "menu", "dir", "address", "pre",
        "listing", "xmp",
        NULL,
    "ol",
        "p", "head", "ul",
        NULL,
    "menu",
        "p", "head", "ul",
        NULL,
    "p",
        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
        NULL,
    "div",
        "p", "head",
        NULL,
    "noscript",
        "p", "head",
        NULL,
    "center",
        "font", "b", "i", "p", "head",
        NULL,
    "a",
        "a",
        NULL,
    "caption",
        "p",
        NULL,
    "colgroup",
        "caption", "colgroup", "col", "p",
        NULL,
    "col",
        "caption", "col", "p",
        NULL,
    "table",
        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
        "listing", "xmp", "a",
        NULL,
    "th",
        "th", "td", "p", "span", "font", "a", "b", "i", "u",
        NULL,
    "td",
        "th", "td", "p", "span", "font", "a", "b", "i", "u",
        NULL,
    "tr",
        "th", "td", "tr", "caption", "col", "colgroup", "p",
        NULL,
    "thead",
        "caption", "col", "colgroup",
        NULL,
    "tfoot",
        "th", "td", "tr", "caption", "col", "colgroup", "thead",
        "tbody", "p",
        NULL,
    "tbody",
        "th", "td", "tr", "caption", "col", "colgroup", "thead",
        "tfoot", "tbody", "p",
        NULL,
    "optgroup",
        "option",
        NULL,
    "option",
        "option",
        NULL,
    "fieldset",
        "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
        "pre", "listing", "xmp", "a",
        NULL,
    /* end of table */
    NULL
};
965
/*
 * NULL-terminated list of HTML elements which are not supposed to hold
 * character data directly; when text shows up inside one of them a p
 * element is implied (see htmlCheckParagraph()).
 *
 * TODO: extend this list by reading the HTML SGML DTD on implied
 *       paragraphs.
 */
static const char *const htmlNoContentElements[] = {
    "html", "head",
    NULL
};
978
/*
 * The list of HTML attributes whose content is of type %Script;
 * (the HTML 4 intrinsic event handlers).
 *
 * The array has no NULL terminator: users iterate over it with
 * sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]).
 *
 * NOTE: when adding entries, check htmlIsScriptAttribute() since
 *       it assumes every name starts with 'on'.
 */
static const char *const htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",      /* was misspelled "onrest", so onreset never matched */
    "onchange",
    "onselect"
};
1004
/*
 * End-tag priorities, used by the HTML parser to cope with broken pages.
 *
 * Each element is given a priority; an end tag is only allowed to close
 * open elements whose priority is lower than or equal to its own, which
 * lets the parser decide what to do with stray end tags.
 */

/* One (element name, priority) association. */
typedef struct {
    const char *name;   /* element name, NULL in the terminating entry */
    int priority;       /* end-tag priority of that element */
} elementPriority;

/*
 * The priority table. It ends with an entry whose name is NULL; that
 * entry carries the priority applied to every element not listed.
 */
static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }    /* Default priority */
};
1032
/*
 * Lookup index over htmlStartClose: htmlInitAutoClose() stores in each
 * slot a pointer to the first string of one group of that table.
 */
static const char **htmlStartCloseIndex[100];
/* Set to 1 once htmlInitAutoClose() has filled htmlStartCloseIndex. */
static int htmlStartCloseIndexinitialized = 0;
1035
1036	/************************************************************************
1037	* *
1038	* functions to handle HTML specific data *
1039	* *
1040	************************************************************************/
1041
1042	/**
1043	* htmlInitAutoClose:
1044	*
1045	* Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1046	* This is not reentrant. Call xmlInitParser() once before processing in
1047	* case of use in multithreaded programs.
1048	*/
1049	void
1050	htmlInitAutoClose(void) {
1051	int indx, i = 0;
1052
1053	if (htmlStartCloseIndexinitialized) return;
1054
1055	for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1056	indx = 0;
1057	while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
1058	htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
1059	while (htmlStartClose[i] != NULL) i++;
1060	i++;
1061	}
1062	htmlStartCloseIndexinitialized = 1;
1063	}
1064
1065	/**
1066	* htmlTagLookup:
1067	* @tag: The tag name in lowercase
1068	*
1069	* Lookup the HTML tag in the ElementTable
1070	*
1071	* Returns the related htmlElemDescPtr or NULL if not found.
1072	*/
1073	const htmlElemDesc *
1074	htmlTagLookup(const xmlChar *tag) {
1075	unsigned int i;
1076
1077	for (i = 0; i < (sizeof(html40ElementTable) /
1078	sizeof(html40ElementTable[0]));i++) {
1079	if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
1080	return((htmlElemDescPtr) &html40ElementTable[i]);
1081	}
1082	return(NULL);
1083	}
1084
1085	/**
1086	* htmlGetEndPriority:
1087	* @name: The name of the element to look up the priority for.
1088	*
1089	* Return value: The "endtag" priority.
1090	**/
1091	static int
1092	htmlGetEndPriority (const xmlChar *name) {
1093	int i = 0;
1094
1095	while ((htmlEndPriority[i].name != NULL) &&
1096	(!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1097	i++;
1098
1099	return(htmlEndPriority[i].priority);
1100	}
1101
1102
1103	/**
1104	* htmlCheckAutoClose:
1105	* @newtag: The new tag name
1106	* @oldtag: The old tag name
1107	*
1108	* Checks whether the new tag is one of the registered valid tags for
1109	* closing old.
1110	* Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1111	*
1112	* Returns 0 if no, 1 if yes.
1113	*/
1114	static int
1115	htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1116	{
1117	int i, indx;
1118	const char **closed = NULL;
1119
1120	if (htmlStartCloseIndexinitialized == 0)
1121	htmlInitAutoClose();
1122
1123	/* inefficient, but not a big deal */
1124	for (indx = 0; indx < 100; indx++) {
1125	closed = htmlStartCloseIndex[indx];
1126	if (closed == NULL)
1127	return (0);
1128	if (xmlStrEqual(BAD_CAST * closed, newtag))
1129	break;
1130	}
1131
1132	i = closed - htmlStartClose;
1133	i++;
1134	while (htmlStartClose[i] != NULL) {
1135	if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
1136	return (1);
1137	}
1138	i++;
1139	}
1140	return (0);
1141	}
1142
1143	/**
1144	* htmlAutoCloseOnClose:
1145	* @ctxt: an HTML parser context
1146	* @newtag: The new tag name
1147	* @force: force the tag closure
1148	*
1149	* The HTML DTD allows an ending tag to implicitly close other tags.
1150	*/
1151	static void
1152	htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1153	{
1154	const htmlElemDesc *info;
1155	int i, priority;
1156
1157	priority = htmlGetEndPriority(newtag);
1158
1159	for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1160
1161	if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1162	break;
1163	/*
1164	* A missplaced endtag can only close elements with lower
1165	* or equal priority, so if we find an element with higher
1166	* priority before we find an element with
1167	* matching name, we just ignore this endtag
1168	*/
1169	if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1170	return;
1171	}
1172	if (i < 0)
1173	return;
1174
1175	while (!xmlStrEqual(newtag, ctxt->name)) {
1176	info = htmlTagLookup(ctxt->name);
1177	if ((info != NULL) && (info->endTag == 3)) {
1178	htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1179	"Opening and ending tag mismatch: %s and %s\n",
1180	newtag, ctxt->name);
1181	}
1182	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1183	ctxt->sax->endElement(ctxt->userData, ctxt->name);
1184	htmlnamePop(ctxt);
1185	}
1186	}
1187
1188	/**
1189	* htmlAutoCloseOnEnd:
1190	* @ctxt: an HTML parser context
1191	*
1192	* Close all remaining tags at the end of the stream
1193	*/
1194	static void
1195	htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1196	{
1197	int i;
1198
1199	if (ctxt->nameNr == 0)
1200	return;
1201	for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1202	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1203	ctxt->sax->endElement(ctxt->userData, ctxt->name);
1204	htmlnamePop(ctxt);
1205	}
1206	}
1207
1208	/**
1209	* htmlAutoClose:
1210	* @ctxt: an HTML parser context
1211	* @newtag: The new tag name or NULL
1212	*
1213	* The HTML DTD allows a tag to implicitly close other tags.
1214	* The list is kept in htmlStartClose array. This function is
1215	* called when a new tag has been detected and generates the
1216	* appropriates closes if possible/needed.
1217	* If newtag is NULL this mean we are at the end of the resource
1218	* and we should check
1219	*/
1220	static void
1221	htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1222	{
1223	while ((newtag != NULL) && (ctxt->name != NULL) &&
1224	(htmlCheckAutoClose(newtag, ctxt->name))) {
1225	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1226	ctxt->sax->endElement(ctxt->userData, ctxt->name);
1227	htmlnamePop(ctxt);
1228	}
1229	if (newtag == NULL) {
1230	htmlAutoCloseOnEnd(ctxt);
1231	return;
1232	}
1233	while ((newtag == NULL) && (ctxt->name != NULL) &&
1234	((xmlStrEqual(ctxt->name, BAD_CAST "head")) \|\|
1235	(xmlStrEqual(ctxt->name, BAD_CAST "body")) \|\|
1236	(xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
1237	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1238	ctxt->sax->endElement(ctxt->userData, ctxt->name);
1239	htmlnamePop(ctxt);
1240	}
1241	}
1242
1243	/**
1244	* htmlAutoCloseTag:
1245	* @doc: the HTML document
1246	* @name: The tag name
1247	* @elem: the HTML element
1248	*
1249	* The HTML DTD allows a tag to implicitly close other tags.
1250	* The list is kept in htmlStartClose array. This function checks
1251	* if the element or one of it's children would autoclose the
1252	* given tag.
1253	*
1254	* Returns 1 if autoclose, 0 otherwise
1255	*/
1256	int
1257	htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1258	htmlNodePtr child;
1259
1260	if (elem == NULL) return(1);
1261	if (xmlStrEqual(name, elem->name)) return(0);
1262	if (htmlCheckAutoClose(elem->name, name)) return(1);
1263	child = elem->children;
1264	while (child != NULL) {
1265	if (htmlAutoCloseTag(doc, name, child)) return(1);
1266	child = child->next;
1267	}
1268	return(0);
1269	}
1270
1271	/**
1272	* htmlIsAutoClosed:
1273	* @doc: the HTML document
1274	* @elem: the HTML element
1275	*
1276	* The HTML DTD allows a tag to implicitly close other tags.
1277	* The list is kept in htmlStartClose array. This function checks
1278	* if a tag is autoclosed by one of it's child
1279	*
1280	* Returns 1 if autoclosed, 0 otherwise
1281	*/
1282	int
1283	htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1284	htmlNodePtr child;
1285
1286	if (elem == NULL) return(1);
1287	child = elem->children;
1288	while (child != NULL) {
1289	if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1290	child = child->next;
1291	}
1292	return(0);
1293	}
1294
1295	/**
1296	* htmlCheckImplied:
1297	* @ctxt: an HTML parser context
1298	* @newtag: The new tag name
1299	*
1300	* The HTML DTD allows a tag to exists only implicitly
1301	* called when a new tag has been detected and generates the
1302	* appropriates implicit tags if missing
1303	*/
1304	static void
1305	htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1306	if (!htmlOmittedDefaultValue)
1307	return;
1308	if (xmlStrEqual(newtag, BAD_CAST"html"))
1309	return;
1310	if (ctxt->nameNr <= 0) {
1311	htmlnamePush(ctxt, BAD_CAST"html");
1312	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1313	ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1314	}
1315	if ((xmlStrEqual(newtag, BAD_CAST"body")) \|\| (xmlStrEqual(newtag, BAD_CAST"head")))
1316	return;
1317	if ((ctxt->nameNr <= 1) &&
1318	((xmlStrEqual(newtag, BAD_CAST"script")) \|\|
1319	(xmlStrEqual(newtag, BAD_CAST"style")) \|\|
1320	(xmlStrEqual(newtag, BAD_CAST"meta")) \|\|
1321	(xmlStrEqual(newtag, BAD_CAST"link")) \|\|
1322	(xmlStrEqual(newtag, BAD_CAST"title")) \|\|
1323	(xmlStrEqual(newtag, BAD_CAST"base")))) {
1324	/*
1325	* dropped OBJECT ... i you put it first BODY will be
1326	* assumed !
1327	*/
1328	htmlnamePush(ctxt, BAD_CAST"head");
1329	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1330	ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1331	} else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1332	(!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1333	(!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1334	int i;
1335	for (i = 0;i < ctxt->nameNr;i++) {
1336	if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1337	return;
1338	}
1339	if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1340	return;
1341	}
1342	}
1343
1344	htmlnamePush(ctxt, BAD_CAST"body");
1345	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1346	ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1347	}
1348	}
1349
1350	/**
1351	* htmlCheckParagraph
1352	* @ctxt: an HTML parser context
1353	*
1354	* Check whether a p element need to be implied before inserting
1355	* characters in the current element.
1356	*
1357	* Returns 1 if a paragraph has been inserted, 0 if not and -1
1358	* in case of error.
1359	*/
1360
1361	static int
1362	htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1363	const xmlChar *tag;
1364	int i;
1365
1366	if (ctxt == NULL)
1367	return(-1);
1368	tag = ctxt->name;
1369	if (tag == NULL) {
1370	htmlAutoClose(ctxt, BAD_CAST"p");
1371	htmlCheckImplied(ctxt, BAD_CAST"p");
1372	htmlnamePush(ctxt, BAD_CAST"p");
1373	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1374	ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1375	return(1);
1376	}
1377	if (!htmlOmittedDefaultValue)
1378	return(0);
1379	for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1380	if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1381	htmlAutoClose(ctxt, BAD_CAST"p");
1382	htmlCheckImplied(ctxt, BAD_CAST"p");
1383	htmlnamePush(ctxt, BAD_CAST"p");
1384	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1385	ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1386	return(1);
1387	}
1388	}
1389	return(0);
1390	}
1391
1392	/**
1393	* htmlIsScriptAttribute:
1394	* @name: an attribute name
1395	*
1396	* Check if an attribute is of content type Script
1397	*
1398	* Returns 1 is the attribute is a script 0 otherwise
1399	*/
1400	int
1401	htmlIsScriptAttribute(const xmlChar *name) {
1402	unsigned int i;
1403
1404	if (name == NULL)
1405	return(0);
1406	/*
1407	* all script attributes start with 'on'
1408	*/
1409	if ((name[0] != 'o') \|\| (name[1] != 'n'))
1410	return(0);
1411	for (i = 0;
1412	i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1413	i++) {
1414	if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1415	return(1);
1416	}
1417	return(0);
1418	}
1419
1420	/************************************************************************
1421	* *
1422	* The list of HTML predefined entities *
1423	* *
1424	************************************************************************/
1425
1426
1427	static const htmlEntityDesc html40EntitiesTable[] = {
1428	/*
1429	* the 4 absolute ones, plus apostrophe.
1430	*/
1431	{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1432	{ 38, "amp", "ampersand, U+0026 ISOnum" },
1433	{ 39, "apos", "single quote" },
1434	{ 60, "lt", "less-than sign, U+003C ISOnum" },
1435	{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1436
1437	/*
1438	* A bunch still in the 128-255 range
1439	* Replacing them depend really on the charset used.
1440	*/
1441	{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1442	{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1443	{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1444	{ 163, "pound","pound sign, U+00A3 ISOnum" },
1445	{ 164, "curren","currency sign, U+00A4 ISOnum" },
1446	{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1447	{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1448	{ 167, "sect", "section sign, U+00A7 ISOnum" },
1449	{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1450	{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1451	{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1452	{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1453	{ 172, "not", "not sign, U+00AC ISOnum" },
1454	{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1455	{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1456	{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1457	{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1458	{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1459	{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1460	{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1461	{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1462	{ 181, "micro","micro sign, U+00B5 ISOnum" },
1463	{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1464	{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1465	{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1466	{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1467	{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1468	{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1469	{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1470	{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1471	{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1472	{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1473	{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1474	{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1475	{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1476	{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1477	{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1478	{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1479	{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1480	{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1481	{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1482	{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1483	{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1484	{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1485	{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1486	{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1487	{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1488	{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1489	{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1490	{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1491	{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1492	{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1493	{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1494	{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1495	{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1496	{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1497	{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1498	{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1499	{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1500	{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1501	{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1502	{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1503	{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1504	{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1505	{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1506	{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1507	{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1508	{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1509	{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1510	{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1511	{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1512	{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1513	{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1514	{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1515	{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1516	{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1517	{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1518	{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1519	{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1520	{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1521	{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1522	{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1523	{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1524	{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1525	{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1526	{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1527	{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1528	{ 247, "divide","division sign, U+00F7 ISOnum" },
1529	{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1530	{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1531	{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1532	{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1533	{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1534	{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1535	{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1536	{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1537
1538	{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1539	{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1540	{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1541	{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1542	{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1543
1544	/*
1545	* Anything below should really be kept as entities references
1546	*/
1547	{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1548
1549	{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1550	{ 732, "tilde","small tilde, U+02DC ISOdia" },
1551
1552	{ 913, "Alpha","greek capital letter alpha, U+0391" },
1553	{ 914, "Beta", "greek capital letter beta, U+0392" },
1554	{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1555	{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1556	{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1557	{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1558	{ 919, "Eta", "greek capital letter eta, U+0397" },
1559	{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1560	{ 921, "Iota", "greek capital letter iota, U+0399" },
1561	{ 922, "Kappa","greek capital letter kappa, U+039A" },
1562	{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
1563	{ 924, "Mu", "greek capital letter mu, U+039C" },
1564	{ 925, "Nu", "greek capital letter nu, U+039D" },
1565	{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1566	{ 927, "Omicron","greek capital letter omicron, U+039F" },
1567	{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1568	{ 929, "Rho", "greek capital letter rho, U+03A1" },
1569	{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1570	{ 932, "Tau", "greek capital letter tau, U+03A4" },
1571	{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1572	{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1573	{ 935, "Chi", "greek capital letter chi, U+03A7" },
1574	{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1575	{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1576
1577	{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1578	{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1579	{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1580	{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1581	{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1582	{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1583	{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1584	{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1585	{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1586	{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1587	{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1588	{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1589	{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1590	{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1591	{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1592	{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1593	{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1594	{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1595	{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1596	{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1597	{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1598	{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1599	{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1600	{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1601	{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1602	{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1603	{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1604	{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1605
1606	{ 8194, "ensp", "en space, U+2002 ISOpub" },
1607	{ 8195, "emsp", "em space, U+2003 ISOpub" },
1608	{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1609	{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1610	{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1611	{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1612	{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1613	{ 8211, "ndash","en dash, U+2013 ISOpub" },
1614	{ 8212, "mdash","em dash, U+2014 ISOpub" },
1615	{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1616	{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1617	{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1618	{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1619	{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1620	{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1621	{ 8224, "dagger","dagger, U+2020 ISOpub" },
1622	{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1623
1624	{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1625	{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1626
1627	{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1628
1629	{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1630	{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1631
1632	{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1633	{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1634
1635	{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1636	{ 8260, "frasl","fraction slash, U+2044 NEW" },
1637
1638	{ 8364, "euro", "euro sign, U+20AC NEW" },
1639
1640	{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1641	{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1642	{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1643	{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1644	{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1645	{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1646	{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1647	{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1648	{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1649	{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1650	{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1651	{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1652	{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1653	{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1654	{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1655	{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1656
1657	{ 8704, "forall","for all, U+2200 ISOtech" },
1658	{ 8706, "part", "partial differential, U+2202 ISOtech" },
1659	{ 8707, "exist","there exists, U+2203 ISOtech" },
1660	{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1661	{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1662	{ 8712, "isin", "element of, U+2208 ISOtech" },
1663	{ 8713, "notin","not an element of, U+2209 ISOtech" },
1664	{ 8715, "ni", "contains as member, U+220B ISOtech" },
1665	{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
1666	{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
1667	{ 8722, "minus","minus sign, U+2212 ISOtech" },
1668	{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1669	{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1670	{ 8733, "prop", "proportional to, U+221D ISOtech" },
1671	{ 8734, "infin","infinity, U+221E ISOtech" },
1672	{ 8736, "ang", "angle, U+2220 ISOamso" },
1673	{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1674	{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1675	{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1676	{ 8746, "cup", "union = cup, U+222A ISOtech" },
1677	{ 8747, "int", "integral, U+222B ISOtech" },
1678	{ 8756, "there4","therefore, U+2234 ISOtech" },
1679	{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1680	{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1681	{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1682	{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1683	{ 8801, "equiv","identical to, U+2261 ISOtech" },
1684	{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1685	{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1686	{ 8834, "sub", "subset of, U+2282 ISOtech" },
1687	{ 8835, "sup", "superset of, U+2283 ISOtech" },
1688	{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1689	{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1690	{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1691	{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1692	{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1693	{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1694	{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1695	{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1696	{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1697	{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1698	{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1699	{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1700	{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1701	{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1702
1703	{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1704	{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1705	{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1706	{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1707
1708	};
1709
1710	/************************************************************************
1711	* *
1712	* Commodity functions to handle entities *
1713	* *
1714	************************************************************************/
1715
/*
 * growBuffer:
 * @buffer: name of an xmlChar * variable; an int variable named
 *          <buffer>_size holding its capacity must be in scope too
 *
 * Macro used to grow the current buffer: <buffer>_size is doubled and
 * @buffer is reallocated to hold that many xmlChar.
 *
 * The macro also expects a variable named ctxt in the calling scope.
 * When the reallocation fails, the error is reported through
 * htmlErrMemory(), the old buffer is freed and the *calling function*
 * returns NULL, so it can only be used in functions returning a pointer.
 *
 * The expansion is a plain brace block (not do/while) so existing call
 * sites keep compiling unchanged.
 */
#define growBuffer(buffer) {						\
    xmlChar *tmp;							\
    buffer##_size *= 2;							\
    tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
    if (tmp == NULL) {							\
	htmlErrMemory(ctxt, "growing buffer\n");			\
	xmlFree(buffer);						\
	return(NULL);							\
    }									\
    buffer = tmp;							\
}
1730
1731	/**
1732	* htmlEntityLookup:
1733	* @name: the entity name
1734	*
1735	* Lookup the given entity in EntitiesTable
1736	*
1737	* TODO: the linear scan is really ugly, an hash table is really needed.
1738	*
1739	* Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1740	*/
1741	const htmlEntityDesc *
1742	htmlEntityLookup(const xmlChar *name) {
1743	unsigned int i;
1744
1745	for (i = 0;i < (sizeof(html40EntitiesTable)/
1746	sizeof(html40EntitiesTable[0]));i++) {
1747	if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1748	return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1749	}
1750	}
1751	return(NULL);
1752	}
1753
1754	/**
1755	* htmlEntityValueLookup:
1756	* @value: the entity's unicode value
1757	*
1758	* Lookup the given entity in EntitiesTable
1759	*
1760	* TODO: the linear scan is really ugly, an hash table is really needed.
1761	*
1762	* Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1763	*/
1764	const htmlEntityDesc *
1765	htmlEntityValueLookup(unsigned int value) {
1766	unsigned int i;
1767
1768	for (i = 0;i < (sizeof(html40EntitiesTable)/
1769	sizeof(html40EntitiesTable[0]));i++) {
1770	if (html40EntitiesTable[i].value >= value) {
1771	if (html40EntitiesTable[i].value > value)
1772	break;
1773	return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1774	}
1775	}
1776	return(NULL);
1777	}
1778
1779	/**
1780	* UTF8ToHtml:
1781	* @out: a pointer to an array of bytes to store the result
1782	* @outlen: the length of @out
1783	* @in: a pointer to an array of UTF-8 chars
1784	* @inlen: the length of @in
1785	*
1786	* Take a block of UTF-8 chars in and try to convert it to an ASCII
1787	* plus HTML entities block of chars out.
1788	*
1789	* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1790	* The value of @inlen after return is the number of octets consumed
1791	* as the return value is positive, else unpredictable.
1792	* The value of @outlen after return is the number of octets consumed.
1793	*/
1794	int
1795	UTF8ToHtml(unsigned char* out, int *outlen,
1796	const unsigned char* in, int *inlen) {
1797	const unsigned char* processed = in;
1798	const unsigned char* outend;
1799	const unsigned char* outstart = out;
1800	const unsigned char* instart = in;
1801	const unsigned char* inend;
1802	unsigned int c, d;
1803	int trailing;
1804
1805	if ((out == NULL) \|\| (outlen == NULL) \|\| (inlen == NULL)) return(-1);
1806	if (in == NULL) {
1807	/*
1808	* initialization nothing to do
1809	*/
1810	*outlen = 0;
1811	*inlen = 0;
1812	return(0);
1813	}
1814	inend = in + (*inlen);
1815	outend = out + (*outlen);
1816	while (in < inend) {
1817	d = *in++;
1818	if (d < 0x80) { c= d; trailing= 0; }
1819	else if (d < 0xC0) {
1820	/* trailing byte in leading position */
1821	*outlen = out - outstart;
1822	*inlen = processed - instart;
1823	return(-2);
1824	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1825	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1826	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1827	else {
1828	/* no chance for this in Ascii */
1829	*outlen = out - outstart;
1830	*inlen = processed - instart;
1831	return(-2);
1832	}
1833
1834	if (inend - in < trailing) {
1835	break;
1836	}
1837
1838	for ( ; trailing; trailing--) {
1839	if ((in >= inend) \|\| (((d= *in++) & 0xC0) != 0x80))
1840	break;
1841	c <<= 6;
1842	c \|= d & 0x3F;
1843	}
1844
1845	/* assertion: c is a single UTF-4 value */
1846	if (c < 0x80) {
1847	if (out + 1 >= outend)
1848	break;
1849	*out++ = c;
1850	} else {
1851	int len;
1852	const htmlEntityDesc * ent;
1853	const char *cp;
1854	char nbuf[16];
1855
1856	/*
1857	* Try to lookup a predefined HTML entity for it
1858	*/
1859
1860	ent = htmlEntityValueLookup(c);
1861	if (ent == NULL) {
1862	snprintf(nbuf, sizeof(nbuf), "#%u", c);
1863	cp = nbuf;
1864	}
1865	else
1866	cp = ent->name;
1867	len = strlen(cp);
1868	if (out + 2 + len >= outend)
1869	break;
1870	*out++ = '&';
1871	memcpy(out, cp, len);
1872	out += len;
1873	*out++ = ';';
1874	}
1875	processed = in;
1876	}
1877	*outlen = out - outstart;
1878	*inlen = processed - instart;
1879	return(0);
1880	}
1881
1882	/**
1883	* htmlEncodeEntities:
1884	* @out: a pointer to an array of bytes to store the result
1885	* @outlen: the length of @out
1886	* @in: a pointer to an array of UTF-8 chars
1887	* @inlen: the length of @in
1888	* @quoteChar: the quote character to escape (' or ") or zero.
1889	*
1890	* Take a block of UTF-8 chars in and try to convert it to an ASCII
1891	* plus HTML entities block of chars out.
1892	*
1893	* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1894	* The value of @inlen after return is the number of octets consumed
1895	* as the return value is positive, else unpredictable.
1896	* The value of @outlen after return is the number of octets consumed.
1897	*/
1898	int
1899	htmlEncodeEntities(unsigned char* out, int *outlen,
1900	const unsigned char* in, int *inlen, int quoteChar) {
1901	const unsigned char* processed = in;
1902	const unsigned char* outend;
1903	const unsigned char* outstart = out;
1904	const unsigned char* instart = in;
1905	const unsigned char* inend;
1906	unsigned int c, d;
1907	int trailing;
1908
1909	if ((out == NULL) \|\| (outlen == NULL) \|\| (inlen == NULL) \|\| (in == NULL))
1910	return(-1);
1911	outend = out + (*outlen);
1912	inend = in + (*inlen);
1913	while (in < inend) {
1914	d = *in++;
1915	if (d < 0x80) { c= d; trailing= 0; }
1916	else if (d < 0xC0) {
1917	/* trailing byte in leading position */
1918	*outlen = out - outstart;
1919	*inlen = processed - instart;
1920	return(-2);
1921	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1922	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1923	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1924	else {
1925	/* no chance for this in Ascii */
1926	*outlen = out - outstart;
1927	*inlen = processed - instart;
1928	return(-2);
1929	}
1930
1931	if (inend - in < trailing)
1932	break;
1933
1934	while (trailing--) {
1935	if (((d= *in++) & 0xC0) != 0x80) {
1936	*outlen = out - outstart;
1937	*inlen = processed - instart;
1938	return(-2);
1939	}
1940	c <<= 6;
1941	c \|= d & 0x3F;
1942	}
1943
1944	/* assertion: c is a single UTF-4 value */
1945	if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1946	(c != '&') && (c != '<') && (c != '>')) {
1947	if (out >= outend)
1948	break;
1949	*out++ = c;
1950	} else {
1951	const htmlEntityDesc * ent;
1952	const char *cp;
1953	char nbuf[16];
1954	int len;
1955
1956	/*
1957	* Try to lookup a predefined HTML entity for it
1958	*/
1959	ent = htmlEntityValueLookup(c);
1960	if (ent == NULL) {
1961	snprintf(nbuf, sizeof(nbuf), "#%u", c);
1962	cp = nbuf;
1963	}
1964	else
1965	cp = ent->name;
1966	len = strlen(cp);
1967	if (out + 2 + len > outend)
1968	break;
1969	*out++ = '&';
1970	memcpy(out, cp, len);
1971	out += len;
1972	*out++ = ';';
1973	}
1974	processed = in;
1975	}
1976	*outlen = out - outstart;
1977	*inlen = processed - instart;
1978	return(0);
1979	}
1980
1981	/************************************************************************
1982	* *
1983	* Commodity functions to handle streams *
1984	* *
1985	************************************************************************/
1986
1987	/**
1988	* htmlNewInputStream:
1989	* @ctxt: an HTML parser context
1990	*
1991	* Create a new input stream structure
1992	* Returns the new input stream or NULL
1993	*/
1994	static htmlParserInputPtr
1995	htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1996	htmlParserInputPtr input;
1997
1998	input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1999	if (input == NULL) {
2000	htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
2001	return(NULL);
2002	}
2003	memset(input, 0, sizeof(htmlParserInput));
2004	input->filename = NULL;
2005	input->directory = NULL;
2006	input->base = NULL;
2007	input->cur = NULL;
2008	input->buf = NULL;
2009	input->line = 1;
2010	input->col = 1;
2011	input->buf = NULL;
2012	input->free = NULL;
2013	input->version = NULL;
2014	input->consumed = 0;
2015	input->length = 0;
2016	return(input);
2017	}
2018
2019
2020	/************************************************************************
2021	* *
2022	* Commodity functions, cleanup needed ? *
2023	* *
2024	************************************************************************/
/*
 * All tags allowing PCDATA in the HTML 4.01 loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array, but that would risk a
 *       binary incompatibility.
 * The entries are grouped below by initial letter.
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
2039
2040	/**
2041	* areBlanks:
2042	* @ctxt: an HTML parser context
2043	* @str: a xmlChar *
2044	* @len: the size of @str
2045	*
2046	* Is this a sequence of blank chars that one can ignore ?
2047	*
2048	* Returns 1 if ignorable 0 otherwise.
2049	*/
2050
2051	static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2052	unsigned int i;
2053	int j;
2054	xmlNodePtr lastChild;
2055	xmlDtdPtr dtd;
2056
2057	for (j = 0;j < len;j++)
2058	if (!(IS_BLANK_CH(str[j]))) return(0);
2059
2060	if (CUR == 0) return(1);
2061	if (CUR != '<') return(0);
2062	if (ctxt->name == NULL)
2063	return(1);
2064	if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2065	return(1);
2066	if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2067	return(1);
2068
2069	/* Only strip CDATA children of the body tag for strict HTML DTDs */
2070	if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2071	dtd = xmlGetIntSubset(ctxt->myDoc);
2072	if (dtd != NULL && dtd->ExternalID != NULL) {
2073	if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") \|\|
2074	!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2075	return(1);
2076	}
2077	}
2078
2079	if (ctxt->node == NULL) return(0);
2080	lastChild = xmlGetLastChild(ctxt->node);
2081	while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2082	lastChild = lastChild->prev;
2083	if (lastChild == NULL) {
2084	if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2085	(ctxt->node->content != NULL)) return(0);
2086	/* keep ws in constructs like ...<b> </b>...
2087	for all tags "b" allowing PCDATA */
2088	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2089	if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2090	return(0);
2091	}
2092	}
2093	} else if (xmlNodeIsText(lastChild)) {
2094	return(0);
2095	} else {
2096	/* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2097	for all tags "p" allowing PCDATA */
2098	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2099	if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2100	return(0);
2101	}
2102	}
2103	}
2104	return(1);
2105	}
2106
2107	/**
2108	* htmlNewDocNoDtD:
2109	* @URI: URI for the dtd, or NULL
2110	* @ExternalID: the external ID of the DTD, or NULL
2111	*
2112	* Creates a new HTML document without a DTD node if @URI and @ExternalID
2113	* are NULL
2114	*
2115	* Returns a new document, do not initialize the DTD if not provided
2116	*/
2117	htmlDocPtr
2118	htmlNewDocNoDtD(const xmlChar URI, const xmlChar ExternalID) {
2119	xmlDocPtr cur;
2120
2121	/*
2122	* Allocate a new document and fill the fields.
2123	*/
2124	cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2125	if (cur == NULL) {
2126	htmlErrMemory(NULL, "HTML document creation failed\n");
2127	return(NULL);
2128	}
2129	memset(cur, 0, sizeof(xmlDoc));
2130
2131	cur->type = XML_HTML_DOCUMENT_NODE;
2132	cur->version = NULL;
2133	cur->intSubset = NULL;
2134	cur->doc = cur;
2135	cur->name = NULL;
2136	cur->children = NULL;
2137	cur->extSubset = NULL;
2138	cur->oldNs = NULL;
2139	cur->encoding = NULL;
2140	cur->standalone = 1;
2141	cur->compression = 0;
2142	cur->ids = NULL;
2143	cur->refs = NULL;
2144	cur->_private = NULL;
2145	cur->charset = XML_CHAR_ENCODING_UTF8;
2146	if ((ExternalID != NULL) \|\|
2147	(URI != NULL))
2148	xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2149	return(cur);
2150	}
2151
2152	/**
2153	* htmlNewDoc:
2154	* @URI: URI for the dtd, or NULL
2155	* @ExternalID: the external ID of the DTD, or NULL
2156	*
2157	* Creates a new HTML document
2158	*
2159	* Returns a new document
2160	*/
2161	htmlDocPtr
2162	htmlNewDoc(const xmlChar URI, const xmlChar ExternalID) {
2163	if ((URI == NULL) && (ExternalID == NULL))
2164	return(htmlNewDocNoDtD(
2165	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2166	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2167
2168	return(htmlNewDocNoDtD(URI, ExternalID));
2169	}
2170
2171
2172	/************************************************************************
2173	* *
2174	* The parser itself *
2175	* Relates to http://www.w3.org/TR/html40 *
2176	* *
2177	************************************************************************/
2178
2179	/************************************************************************
2180	* *
2181	* The parser itself *
2182	* *
2183	************************************************************************/
2184
2185	static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2186
2187	/**
2188	* htmlParseHTMLName:
2189	* @ctxt: an HTML parser context
2190	*
2191	* parse an HTML tag or attribute name, note that we convert it to lowercase
2192	* since HTML names are not case-sensitive.
2193	*
2194	* Returns the Tag Name parsed or NULL
2195	*/
2196
2197	static const xmlChar *
2198	htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2199	int i = 0;
2200	xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2201
2202	if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2203	(CUR != ':')) return(NULL);
2204
2205	while ((i < HTML_PARSER_BUFFER_SIZE) &&
2206	((IS_ASCII_LETTER(CUR)) \|\| (IS_ASCII_DIGIT(CUR)) \|\|
2207	(CUR == ':') \|\| (CUR == '-') \|\| (CUR == '_'))) {
2208	if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2209	else loc[i] = CUR;
2210	i++;
2211
2212	NEXT;
2213	}
2214
2215	return(xmlDictLookup(ctxt->dict, loc, i));
2216	}
2217
2218
2219	/**
2220	* htmlParseHTMLName_nonInvasive:
2221	* @ctxt: an HTML parser context
2222	*
2223	* parse an HTML tag or attribute name, note that we convert it to lowercase
2224	* since HTML names are not case-sensitive, this doesn't consume the data
2225	* from the stream, it's a look-ahead
2226	*
2227	* Returns the Tag Name parsed or NULL
2228	*/
2229
2230	static const xmlChar *
2231	htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2232	int i = 0;
2233	xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2234
2235	if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2236	(NXT(1) != ':')) return(NULL);
2237
2238	while ((i < HTML_PARSER_BUFFER_SIZE) &&
2239	((IS_ASCII_LETTER(NXT(1+i))) \|\| (IS_ASCII_DIGIT(NXT(1+i))) \|\|
2240	(NXT(1+i) == ':') \|\| (NXT(1+i) == '-') \|\| (NXT(1+i) == '_'))) {
2241	if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2242	else loc[i] = NXT(1+i);
2243	i++;
2244	}
2245
2246	return(xmlDictLookup(ctxt->dict, loc, i));
2247	}
2248
2249
2250	/**
2251	* htmlParseName:
2252	* @ctxt: an HTML parser context
2253	*
2254	* parse an HTML name, this routine is case sensitive.
2255	*
2256	* Returns the Name parsed or NULL
2257	*/
2258
2259	static const xmlChar *
2260	htmlParseName(htmlParserCtxtPtr ctxt) {
2261	const xmlChar *in;
2262	const xmlChar *ret;
2263	int count = 0;
2264
2265	GROW;
2266
2267	/*
2268	* Accelerator for simple ASCII names
2269	*/
2270	in = ctxt->input->cur;
2271	if (((in >= 0x61) && (in <= 0x7A)) \|\|
2272	((in >= 0x41) && (in <= 0x5A)) \|\|
2273	(in == '_') \|\| (in == ':')) {
2274	in++;
2275	while (((in >= 0x61) && (in <= 0x7A)) \|\|
2276	((in >= 0x41) && (in <= 0x5A)) \|\|
2277	((in >= 0x30) && (in <= 0x39)) \|\|
2278	(in == '_') \|\| (in == '-') \|\|
2279	(in == ':') \|\| (in == '.'))
2280	in++;
2281	if ((in > 0) && (in < 0x80)) {
2282	count = in - ctxt->input->cur;
2283	ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2284	ctxt->input->cur = in;
2285	ctxt->nbChars += count;
2286	ctxt->input->col += count;
2287	return(ret);
2288	}
2289	}
2290	return(htmlParseNameComplex(ctxt));
2291	}
2292
2293	static const xmlChar *
2294	htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2295	int len = 0, l;
2296	int c;
2297	int count = 0;
2298
2299	/*
2300	* Handler for more complex cases
2301	*/
2302	GROW;
2303	c = CUR_CHAR(l);
2304	if ((c == ' ') \|\| (c == '>') \|\| (c == '/') \|\| /* accelerators */
2305	(!IS_LETTER(c) && (c != '_') &&
2306	(c != ':'))) {
2307	return(NULL);
2308	}
2309
2310	while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2311	((IS_LETTER(c)) \|\| (IS_DIGIT(c)) \|\|
2312	(c == '.') \|\| (c == '-') \|\|
2313	(c == '_') \|\| (c == ':') \|\|
2314	(IS_COMBINING(c)) \|\|
2315	(IS_EXTENDER(c)))) {
2316	if (count++ > 100) {
2317	count = 0;
2318	GROW;
2319	}
2320	len += l;
2321	NEXTL(l);
2322	c = CUR_CHAR(l);
2323	}
2324	return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
2325	}
2326
2327
2328	/**
2329	* htmlParseHTMLAttribute:
2330	* @ctxt: an HTML parser context
2331	* @stop: a char stop value
2332	*
2333	* parse an HTML attribute value till the stop (quote), if
2334	* stop is 0 then it stops at the first space
2335	*
2336	* Returns the attribute parsed or NULL
2337	*/
2338
2339	static xmlChar *
2340	htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2341	xmlChar *buffer = NULL;
2342	int buffer_size = 0;
2343	xmlChar *out = NULL;
2344	const xmlChar *name = NULL;
2345	const xmlChar *cur = NULL;
2346	const htmlEntityDesc * ent;
2347
2348	/*
2349	* allocate a translation buffer.
2350	*/
2351	buffer_size = HTML_PARSER_BUFFER_SIZE;
2352	buffer = (xmlChar ) xmlMallocAtomic(buffer_size sizeof(xmlChar));
2353	if (buffer == NULL) {
2354	htmlErrMemory(ctxt, "buffer allocation failed\n");
2355	return(NULL);
2356	}
2357	out = buffer;
2358
2359	/*
2360	* Ok loop until we reach one of the ending chars
2361	*/
2362	while ((CUR != 0) && (CUR != stop)) {
2363	if ((stop == 0) && (CUR == '>')) break;
2364	if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2365	if (CUR == '&') {
2366	if (NXT(1) == '#') {
2367	unsigned int c;
2368	int bits;
2369
2370	c = htmlParseCharRef(ctxt);
2371	if (c < 0x80)
2372	{ *out++ = c; bits= -6; }
2373	else if (c < 0x800)
2374	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
2375	else if (c < 0x10000)
2376	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
2377	else
2378	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
2379
2380	for ( ; bits >= 0; bits-= 6) {
2381	*out++ = ((c >> bits) & 0x3F) \| 0x80;
2382	}
2383
2384	if (out - buffer > buffer_size - 100) {
2385	int indx = out - buffer;
2386
2387	growBuffer(buffer);
2388	out = &buffer[indx];
2389	}
2390	} else {
2391	ent = htmlParseEntityRef(ctxt, &name);
2392	if (name == NULL) {
2393	*out++ = '&';
2394	if (out - buffer > buffer_size - 100) {
2395	int indx = out - buffer;
2396
2397	growBuffer(buffer);
2398	out = &buffer[indx];
2399	}
2400	} else if (ent == NULL) {
2401	*out++ = '&';
2402	cur = name;
2403	while (*cur != 0) {
2404	if (out - buffer > buffer_size - 100) {
2405	int indx = out - buffer;
2406
2407	growBuffer(buffer);
2408	out = &buffer[indx];
2409	}
2410	out++ = cur++;
2411	}
2412	} else {
2413	unsigned int c;
2414	int bits;
2415
2416	if (out - buffer > buffer_size - 100) {
2417	int indx = out - buffer;
2418
2419	growBuffer(buffer);
2420	out = &buffer[indx];
2421	}
2422	c = ent->value;
2423	if (c < 0x80)
2424	{ *out++ = c; bits= -6; }
2425	else if (c < 0x800)
2426	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
2427	else if (c < 0x10000)
2428	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
2429	else
2430	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
2431
2432	for ( ; bits >= 0; bits-= 6) {
2433	*out++ = ((c >> bits) & 0x3F) \| 0x80;
2434	}
2435	}
2436	}
2437	} else {
2438	unsigned int c;
2439	int bits, l;
2440
2441	if (out - buffer > buffer_size - 100) {
2442	int indx = out - buffer;
2443
2444	growBuffer(buffer);
2445	out = &buffer[indx];
2446	}
2447	c = CUR_CHAR(l);
2448	if (c < 0x80)
2449	{ *out++ = c; bits= -6; }
2450	else if (c < 0x800)
2451	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
2452	else if (c < 0x10000)
2453	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
2454	else
2455	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
2456
2457	for ( ; bits >= 0; bits-= 6) {
2458	*out++ = ((c >> bits) & 0x3F) \| 0x80;
2459	}
2460	NEXT;
2461	}
2462	}
2463	*out++ = 0;
2464	return(buffer);
2465	}
2466
2467	/**
2468	* htmlParseEntityRef:
2469	* @ctxt: an HTML parser context
2470	* @str: location to store the entity name
2471	*
2472	* parse an HTML ENTITY references
2473	*
2474	* [68] EntityRef ::= '&' Name ';'
2475	*
2476	* Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2477	* if non-NULL *str will have to be freed by the caller.
2478	*/
2479	const htmlEntityDesc *
2480	htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2481	const xmlChar *name;
2482	const htmlEntityDesc * ent = NULL;
2483
2484	if (str != NULL) *str = NULL;
2485	if ((ctxt == NULL) \|\| (ctxt->input == NULL)) return(NULL);
2486
2487	if (CUR == '&') {
2488	NEXT;
2489	name = htmlParseName(ctxt);
2490	if (name == NULL) {
2491	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2492	"htmlParseEntityRef: no name\n", NULL, NULL);
2493	} else {
2494	GROW;
2495	if (CUR == ';') {
2496	if (str != NULL)
2497	*str = name;
2498
2499	/*
2500	* Lookup the entity in the table.
2501	*/
2502	ent = htmlEntityLookup(name);
2503	if (ent != NULL) /* OK that's ugly !!! */
2504	NEXT;
2505	} else {
2506	htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2507	"htmlParseEntityRef: expecting ';'\n",
2508	NULL, NULL);
2509	if (str != NULL)
2510	*str = name;
2511	}
2512	}
2513	}
2514	return(ent);
2515	}
2516
2517	/**
2518	* htmlParseAttValue:
2519	* @ctxt: an HTML parser context
2520	*
2521	* parse a value for an attribute
2522	* Note: the parser won't do substitution of entities here, this
2523	* will be handled later in xmlStringGetNodeList, unless it was
2524	* asked for ctxt->replaceEntities != 0
2525	*
2526	* Returns the AttValue parsed or NULL.
2527	*/
2528
2529	static xmlChar *
2530	htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2531	xmlChar *ret = NULL;
2532
2533	if (CUR == '"') {
2534	NEXT;
2535	ret = htmlParseHTMLAttribute(ctxt, '"');
2536	if (CUR != '"') {
2537	htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2538	"AttValue: \" expected\n", NULL, NULL);
2539	} else
2540	NEXT;
2541	} else if (CUR == '\'') {
2542	NEXT;
2543	ret = htmlParseHTMLAttribute(ctxt, '\'');
2544	if (CUR != '\'') {
2545	htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2546	"AttValue: ' expected\n", NULL, NULL);
2547	} else
2548	NEXT;
2549	} else {
2550	/*
2551	* That's an HTMLism, the attribute value may not be quoted
2552	*/
2553	ret = htmlParseHTMLAttribute(ctxt, 0);
2554	if (ret == NULL) {
2555	htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2556	"AttValue: no value found\n", NULL, NULL);
2557	}
2558	}
2559	return(ret);
2560	}
2561
2562	/**
2563	* htmlParseSystemLiteral:
2564	* @ctxt: an HTML parser context
2565	*
2566	* parse an HTML Literal
2567	*
2568	* [11] SystemLiteral ::= ('"' [^"]* '"') \| ("'" [^']* "'")
2569	*
2570	* Returns the SystemLiteral parsed or NULL
2571	*/
2572
2573	static xmlChar *
2574	htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2575	const xmlChar *q;
2576	xmlChar *ret = NULL;
2577
2578	if (CUR == '"') {
2579	NEXT;
2580	q = CUR_PTR;
2581	while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
2582	NEXT;
2583	if (!IS_CHAR_CH(CUR)) {
2584	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2585	"Unfinished SystemLiteral\n", NULL, NULL);
2586	} else {
2587	ret = xmlStrndup(q, CUR_PTR - q);
2588	NEXT;
2589	}
2590	} else if (CUR == '\'') {
2591	NEXT;
2592	q = CUR_PTR;
2593	while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
2594	NEXT;
2595	if (!IS_CHAR_CH(CUR)) {
2596	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2597	"Unfinished SystemLiteral\n", NULL, NULL);
2598	} else {
2599	ret = xmlStrndup(q, CUR_PTR - q);
2600	NEXT;
2601	}
2602	} else {
2603	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2604	" or ' expected\n", NULL, NULL);
2605	}
2606
2607	return(ret);
2608	}
2609
2610	/**
2611	* htmlParsePubidLiteral:
2612	* @ctxt: an HTML parser context
2613	*
2614	* parse an HTML public literal
2615	*
2616	* [12] PubidLiteral ::= '"' PubidChar* '"' \| "'" (PubidChar - "'")* "'"
2617	*
2618	* Returns the PubidLiteral parsed or NULL.
2619	*/
2620
2621	static xmlChar *
2622	htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2623	const xmlChar *q;
2624	xmlChar *ret = NULL;
2625	/*
2626	* Name ::= (Letter \| '_') (NameChar)*
2627	*/
2628	if (CUR == '"') {
2629	NEXT;
2630	q = CUR_PTR;
2631	while (IS_PUBIDCHAR_CH(CUR)) NEXT;
2632	if (CUR != '"') {
2633	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2634	"Unfinished PubidLiteral\n", NULL, NULL);
2635	} else {
2636	ret = xmlStrndup(q, CUR_PTR - q);
2637	NEXT;
2638	}
2639	} else if (CUR == '\'') {
2640	NEXT;
2641	q = CUR_PTR;
2642	while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
2643	NEXT;
2644	if (CUR != '\'') {
2645	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2646	"Unfinished PubidLiteral\n", NULL, NULL);
2647	} else {
2648	ret = xmlStrndup(q, CUR_PTR - q);
2649	NEXT;
2650	}
2651	} else {
2652	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2653	"PubidLiteral \" or ' expected\n", NULL, NULL);
2654	}
2655
2656	return(ret);
2657	}
2658
2659	/**
2660	* htmlParseScript:
2661	* @ctxt: an HTML parser context
2662	*
2663	* parse the content of an HTML SCRIPT or STYLE element
2664	* http://www.w3.org/TR/html4/sgml/dtd.html#Script
2665	* http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2666	* http://www.w3.org/TR/html4/types.html#type-script
2667	* http://www.w3.org/TR/html4/types.html#h-6.15
2668	* http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2669	*
2670	* Script data ( %Script; in the DTD) can be the content of the SCRIPT
2671	* element and the value of intrinsic event attributes. User agents must
2672	* not evaluate script data as HTML markup but instead must pass it on as
2673	* data to a script engine.
2674	* NOTES:
2675	* - The content is passed like CDATA
2676	* - the attributes for style and scripting "onXXX" are also described
2677	* as CDATA but SGML allows entities references in attributes so their
2678	* processing is identical as other attributes
2679	*/
2680	static void
2681	htmlParseScript(htmlParserCtxtPtr ctxt) {
2682	xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2683	int nbchar = 0;
2684	int cur,l;
2685
2686	SHRINK;
2687	cur = CUR_CHAR(l);
2688	while (IS_CHAR_CH(cur)) {
2689	if ((cur == '<') && (NXT(1) == '/')) {
2690	/*
2691	* One should break here, the specification is clear:
2692	* Authors should therefore escape "</" within the content.
2693	* Escape mechanisms are specific to each scripting or
2694	* style sheet language.
2695	*
2696	* In recovery mode, only break if end tag match the
2697	* current tag, effectively ignoring all tags inside the
2698	* script/style block and treating the entire block as
2699	* CDATA.
2700	*/
2701	if (ctxt->recovery) {
2702	if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
2703	xmlStrlen(ctxt->name)) == 0)
2704	{
2705	break; /* while */
2706	} else {
2707	htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
2708	"Element %s embeds close tag\n",
2709	ctxt->name, NULL);
2710	}
2711	} else {
2712	if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) \|\|
2713	((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2714	{
2715	break; /* while */
2716	}
2717	}
2718	}
2719	COPY_BUF(l,buf,nbchar,cur);
2720	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2721	if (ctxt->sax->cdataBlock!= NULL) {
2722	/*
2723	* Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2724	*/
2725	ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2726	} else if (ctxt->sax->characters != NULL) {
2727	ctxt->sax->characters(ctxt->userData, buf, nbchar);
2728	}
2729	nbchar = 0;
2730	}
2731	GROW;
2732	NEXTL(l);
2733	cur = CUR_CHAR(l);
2734	}
2735
2736	if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
2737	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2738	"Invalid char in CDATA 0x%X\n", cur);
2739	NEXT;
2740	}
2741
2742	if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2743	if (ctxt->sax->cdataBlock!= NULL) {
2744	/*
2745	* Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2746	*/
2747	ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2748	} else if (ctxt->sax->characters != NULL) {
2749	ctxt->sax->characters(ctxt->userData, buf, nbchar);
2750	}
2751	}
2752	}
2753
2754
2755	/**
2756	* htmlParseCharData:
2757	* @ctxt: an HTML parser context
2758	*
2759	* parse a CharData section.
2760	* if we are within a CDATA section ']]>' marks an end of section.
2761	*
2762	* [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2763	*/
2764
2765	static void
2766	htmlParseCharData(htmlParserCtxtPtr ctxt) {
2767	xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2768	int nbchar = 0;
2769	int cur, l;
2770
2771	SHRINK;
2772	cur = CUR_CHAR(l);
2773	while (((cur != '<') \|\| (ctxt->token == '<')) &&
2774	((cur != '&') \|\| (ctxt->token == '&')) &&
2775	(cur != 0)) {
2776	if (!(IS_CHAR(cur))) {
2777	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2778	"Invalid char in CDATA 0x%X\n", cur);
2779	} else {
2780	COPY_BUF(l,buf,nbchar,cur);
2781	}
2782	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2783	/*
2784	* Ok the segment is to be consumed as chars.
2785	*/
2786	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2787	if (areBlanks(ctxt, buf, nbchar)) {
2788	if (ctxt->sax->ignorableWhitespace != NULL)
2789	ctxt->sax->ignorableWhitespace(ctxt->userData,
2790	buf, nbchar);
2791	} else {
2792	htmlCheckParagraph(ctxt);
2793	if (ctxt->sax->characters != NULL)
2794	ctxt->sax->characters(ctxt->userData, buf, nbchar);
2795	}
2796	}
2797	nbchar = 0;
2798	}
2799	NEXTL(l);
2800	cur = CUR_CHAR(l);
2801	if (cur == 0) {
2802	SHRINK;
2803	GROW;
2804	cur = CUR_CHAR(l);
2805	}
2806	}
2807	if (nbchar != 0) {
2808	buf[nbchar] = 0;
2809
2810	/*
2811	* Ok the segment is to be consumed as chars.
2812	*/
2813	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2814	if (areBlanks(ctxt, buf, nbchar)) {
2815	if (ctxt->sax->ignorableWhitespace != NULL)
2816	ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2817	} else {
2818	htmlCheckParagraph(ctxt);
2819	if (ctxt->sax->characters != NULL)
2820	ctxt->sax->characters(ctxt->userData, buf, nbchar);
2821	}
2822	}
2823	} else {
2824	/*
2825	* Loop detection
2826	*/
2827	if (cur == 0)
2828	ctxt->instate = XML_PARSER_EOF;
2829	}
2830	}
2831
2832	/**
2833	* htmlParseExternalID:
2834	* @ctxt: an HTML parser context
2835	* @publicID: a xmlChar** receiving PubidLiteral
2836	*
2837	* Parse an External ID or a Public ID
2838	*
2839	* [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2840	* \| 'PUBLIC' S PubidLiteral S SystemLiteral
2841	*
2842	* [83] PublicID ::= 'PUBLIC' S PubidLiteral
2843	*
2844	* Returns the function returns SystemLiteral and in the second
2845	* case publicID receives PubidLiteral, is strict is off
2846	* it is possible to return NULL and have publicID set.
2847	*/
2848
2849	static xmlChar *
2850	htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
2851	xmlChar *URI = NULL;
2852
2853	if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2854	(UPP(2) == 'S') && (UPP(3) == 'T') &&
2855	(UPP(4) == 'E') && (UPP(5) == 'M')) {
2856	SKIP(6);
2857	if (!IS_BLANK_CH(CUR)) {
2858	htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2859	"Space required after 'SYSTEM'\n", NULL, NULL);
2860	}
2861	SKIP_BLANKS;
2862	URI = htmlParseSystemLiteral(ctxt);
2863	if (URI == NULL) {
2864	htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
2865	"htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
2866	}
2867	} else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2868	(UPP(2) == 'B') && (UPP(3) == 'L') &&
2869	(UPP(4) == 'I') && (UPP(5) == 'C')) {
2870	SKIP(6);
2871	if (!IS_BLANK_CH(CUR)) {
2872	htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2873	"Space required after 'PUBLIC'\n", NULL, NULL);
2874	}
2875	SKIP_BLANKS;
2876	*publicID = htmlParsePubidLiteral(ctxt);
2877	if (*publicID == NULL) {
2878	htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
2879	"htmlParseExternalID: PUBLIC, no Public Identifier\n",
2880	NULL, NULL);
2881	}
2882	SKIP_BLANKS;
2883	if ((CUR == '"') \|\| (CUR == '\'')) {
2884	URI = htmlParseSystemLiteral(ctxt);
2885	}
2886	}
2887	return(URI);
2888	}
2889
2890	/**
2891	* xmlParsePI:
2892	* @ctxt: an XML parser context
2893	*
2894	* parse an XML Processing Instruction.
2895	*
2896	* [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
2897	*/
2898	static void
2899	htmlParsePI(htmlParserCtxtPtr ctxt) {
2900	xmlChar *buf = NULL;
2901	int len = 0;
2902	int size = HTML_PARSER_BUFFER_SIZE;
2903	int cur, l;
2904	const xmlChar *target;
2905	xmlParserInputState state;
2906	int count = 0;
2907
2908	if ((RAW == '<') && (NXT(1) == '?')) {
2909	state = ctxt->instate;
2910	ctxt->instate = XML_PARSER_PI;
2911	/*
2912	* this is a Processing Instruction.
2913	*/
2914	SKIP(2);
2915	SHRINK;
2916
2917	/*
2918	* Parse the target name and check for special support like
2919	* namespace.
2920	*/
2921	target = htmlParseName(ctxt);
2922	if (target != NULL) {
2923	if (RAW == '>') {
2924	SKIP(1);
2925
2926	/*
2927	* SAX: PI detected.
2928	*/
2929	if ((ctxt->sax) && (!ctxt->disableSAX) &&
2930	(ctxt->sax->processingInstruction != NULL))
2931	ctxt->sax->processingInstruction(ctxt->userData,
2932	target, NULL);
2933	ctxt->instate = state;
2934	return;
2935	}
2936	buf = (xmlChar ) xmlMallocAtomic(size sizeof(xmlChar));
2937	if (buf == NULL) {
2938	htmlErrMemory(ctxt, NULL);
2939	ctxt->instate = state;
2940	return;
2941	}
2942	cur = CUR;
2943	if (!IS_BLANK(cur)) {
2944	htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2945	"ParsePI: PI %s space expected\n", target, NULL);
2946	}
2947	SKIP_BLANKS;
2948	cur = CUR_CHAR(l);
2949	while (IS_CHAR(cur) && (cur != '>')) {
2950	if (len + 5 >= size) {
2951	xmlChar *tmp;
2952
2953	size *= 2;
2954	tmp = (xmlChar ) xmlRealloc(buf, size sizeof(xmlChar));
2955	if (tmp == NULL) {
2956	htmlErrMemory(ctxt, NULL);
2957	xmlFree(buf);
2958	ctxt->instate = state;
2959	return;
2960	}
2961	buf = tmp;
2962	}
2963	count++;
2964	if (count > 50) {
2965	GROW;
2966	count = 0;
2967	}
2968	COPY_BUF(l,buf,len,cur);
2969	NEXTL(l);
2970	cur = CUR_CHAR(l);
2971	if (cur == 0) {
2972	SHRINK;
2973	GROW;
2974	cur = CUR_CHAR(l);
2975	}
2976	}
2977	buf[len] = 0;
2978	if (cur != '>') {
2979	htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
2980	"ParsePI: PI %s never end ...\n", target, NULL);
2981	} else {
2982	SKIP(1);
2983
2984	/*
2985	* SAX: PI detected.
2986	*/
2987	if ((ctxt->sax) && (!ctxt->disableSAX) &&
2988	(ctxt->sax->processingInstruction != NULL))
2989	ctxt->sax->processingInstruction(ctxt->userData,
2990	target, buf);
2991	}
2992	xmlFree(buf);
2993	} else {
2994	htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
2995	"PI is not started correctly", NULL, NULL);
2996	}
2997	ctxt->instate = state;
2998	}
2999	}
3000
3001	/**
3002	* htmlParseComment:
3003	* @ctxt: an HTML parser context
3004	*
3005	* Parse an XML (SGML) comment <!-- .... -->
3006	*
3007	* [15] Comment ::= '<!--' ((Char - '-') \| ('-' (Char - '-')))* '-->'
3008	*/
3009	static void
3010	htmlParseComment(htmlParserCtxtPtr ctxt) {
3011	xmlChar *buf = NULL;
3012	int len;
3013	int size = HTML_PARSER_BUFFER_SIZE;
3014	int q, ql;
3015	int r, rl;
3016	int cur, l;
3017	xmlParserInputState state;
3018
3019	/*
3020	* Check that there is a comment right here.
3021	*/
3022	if ((RAW != '<') \|\| (NXT(1) != '!') \|\|
3023	(NXT(2) != '-') \|\| (NXT(3) != '-')) return;
3024
3025	state = ctxt->instate;
3026	ctxt->instate = XML_PARSER_COMMENT;
3027	SHRINK;
3028	SKIP(4);
3029	buf = (xmlChar ) xmlMallocAtomic(size sizeof(xmlChar));
3030	if (buf == NULL) {
3031	htmlErrMemory(ctxt, "buffer allocation failed\n");
3032	ctxt->instate = state;
3033	return;
3034	}
3035	q = CUR_CHAR(ql);
3036	NEXTL(ql);
3037	r = CUR_CHAR(rl);
3038	NEXTL(rl);
3039	cur = CUR_CHAR(l);
3040	len = 0;
3041	while (IS_CHAR(cur) &&
3042	((cur != '>') \|\|
3043	(r != '-') \|\| (q != '-'))) {
3044	if (len + 5 >= size) {
3045	xmlChar *tmp;
3046
3047	size *= 2;
3048	tmp = (xmlChar ) xmlRealloc(buf, size sizeof(xmlChar));
3049	if (tmp == NULL) {
3050	xmlFree(buf);
3051	htmlErrMemory(ctxt, "growing buffer failed\n");
3052	ctxt->instate = state;
3053	return;
3054	}
3055	buf = tmp;
3056	}
3057	COPY_BUF(ql,buf,len,q);
3058	q = r;
3059	ql = rl;
3060	r = cur;
3061	rl = l;
3062	NEXTL(l);
3063	cur = CUR_CHAR(l);
3064	if (cur == 0) {
3065	SHRINK;
3066	GROW;
3067	cur = CUR_CHAR(l);
3068	}
3069	}
3070	buf[len] = 0;
3071	if (!IS_CHAR(cur)) {
3072	htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3073	"Comment not terminated \n<!--%.50s\n", buf, NULL);
3074	xmlFree(buf);
3075	} else {
3076	NEXT;
3077	if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3078	(!ctxt->disableSAX))
3079	ctxt->sax->comment(ctxt->userData, buf);
3080	xmlFree(buf);
3081	}
3082	ctxt->instate = state;
3083	}
3084
3085	/**
3086	* htmlParseCharRef:
3087	* @ctxt: an HTML parser context
3088	*
3089	* parse Reference declarations
3090	*
3091	* [66] CharRef ::= '&#' [0-9]+ ';' \|
3092	* '&#x' [0-9a-fA-F]+ ';'
3093	*
3094	* Returns the value parsed (as an int)
3095	*/
3096	int
3097	htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3098	int val = 0;
3099
3100	if ((ctxt == NULL) \|\| (ctxt->input == NULL)) {
3101	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3102	"htmlParseCharRef: context error\n",
3103	NULL, NULL);
3104	return(0);
3105	}
3106	if ((CUR == '&') && (NXT(1) == '#') &&
3107	((NXT(2) == 'x') \|\| NXT(2) == 'X')) {
3108	SKIP(3);
3109	while (CUR != ';') {
3110	if ((CUR >= '0') && (CUR <= '9'))
3111	val = val * 16 + (CUR - '0');
3112	else if ((CUR >= 'a') && (CUR <= 'f'))
3113	val = val * 16 + (CUR - 'a') + 10;
3114	else if ((CUR >= 'A') && (CUR <= 'F'))
3115	val = val * 16 + (CUR - 'A') + 10;
3116	else {
3117	htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3118	"htmlParseCharRef: invalid hexadecimal value\n",
3119	NULL, NULL);
3120	return(0);
3121	}
3122	NEXT;
3123	}
3124	if (CUR == ';')
3125	NEXT;
3126	} else if ((CUR == '&') && (NXT(1) == '#')) {
3127	SKIP(2);
3128	while (CUR != ';') {
3129	if ((CUR >= '0') && (CUR <= '9'))
3130	val = val * 10 + (CUR - '0');
3131	else {
3132	htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3133	"htmlParseCharRef: invalid decimal value\n",
3134	NULL, NULL);
3135	return(0);
3136	}
3137	NEXT;
3138	}
3139	if (CUR == ';')
3140	NEXT;
3141	} else {
3142	htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3143	"htmlParseCharRef: invalid value\n", NULL, NULL);
3144	}
3145	/*
3146	* Check the value IS_CHAR ...
3147	*/
3148	if (IS_CHAR(val)) {
3149	return(val);
3150	} else {
3151	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3152	"htmlParseCharRef: invalid xmlChar value %d\n",
3153	val);
3154	}
3155	return(0);
3156	}
3157
3158
3159	/**
3160	* htmlParseDocTypeDecl:
3161	* @ctxt: an HTML parser context
3162	*
3163	* parse a DOCTYPE declaration
3164	*
3165	* [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3166	* ('[' (markupdecl \| PEReference \| S)* ']' S?)? '>'
3167	*/
3168
3169	static void
3170	htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3171	const xmlChar *name;
3172	xmlChar *ExternalID = NULL;
3173	xmlChar *URI = NULL;
3174
3175	/*
3176	* We know that '<!DOCTYPE' has been detected.
3177	*/
3178	SKIP(9);
3179
3180	SKIP_BLANKS;
3181
3182	/*
3183	* Parse the DOCTYPE name.
3184	*/
3185	name = htmlParseName(ctxt);
3186	if (name == NULL) {
3187	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3188	"htmlParseDocTypeDecl : no DOCTYPE name !\n",
3189	NULL, NULL);
3190	}
3191	/*
3192	* Check that upper(name) == "HTML" !!!!!!!!!!!!!
3193	*/
3194
3195	SKIP_BLANKS;
3196
3197	/*
3198	* Check for SystemID and ExternalID
3199	*/
3200	URI = htmlParseExternalID(ctxt, &ExternalID);
3201	SKIP_BLANKS;
3202
3203	/*
3204	* We should be at the end of the DOCTYPE declaration.
3205	*/
3206	if (CUR != '>') {
3207	htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3208	"DOCTYPE improperly terminated\n", NULL, NULL);
3209	/* We shouldn't try to resynchronize ... */
3210	}
3211	NEXT;
3212
3213	/*
3214	* Create or update the document accordingly to the DOCTYPE
3215	*/
3216	if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3217	(!ctxt->disableSAX))
3218	ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3219
3220	/*
3221	* Cleanup, since we don't use all those identifiers
3222	*/
3223	if (URI != NULL) xmlFree(URI);
3224	if (ExternalID != NULL) xmlFree(ExternalID);
3225	}
3226
3227	/**
3228	* htmlParseAttribute:
3229	* @ctxt: an HTML parser context
3230	* @value: a xmlChar ** used to store the value of the attribute
3231	*
3232	* parse an attribute
3233	*
3234	* [41] Attribute ::= Name Eq AttValue
3235	*
3236	* [25] Eq ::= S? '=' S?
3237	*
3238	* With namespace:
3239	*
3240	* [NS 11] Attribute ::= QName Eq AttValue
3241	*
3242	* Also the case QName == xmlns:??? is handled independently as a namespace
3243	* definition.
3244	*
3245	* Returns the attribute name, and the value in *value.
3246	*/
3247
3248	static const xmlChar *
3249	htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3250	const xmlChar *name;
3251	xmlChar *val = NULL;
3252
3253	*value = NULL;
3254	name = htmlParseHTMLName(ctxt);
3255	if (name == NULL) {
3256	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3257	"error parsing attribute name\n", NULL, NULL);
3258	return(NULL);
3259	}
3260
3261	/*
3262	* read the value
3263	*/
3264	SKIP_BLANKS;
3265	if (CUR == '=') {
3266	NEXT;
3267	SKIP_BLANKS;
3268	val = htmlParseAttValue(ctxt);
3269	} else if (htmlIsBooleanAttr(name)) {
3270	/*
3271	* assume a minimized attribute
3272	*/
3273	val = xmlStrdup(name);
3274	}
3275
3276	*value = val;
3277	return(name);
3278	}
3279
3280	/**
3281	* htmlCheckEncoding:
3282	* @ctxt: an HTML parser context
3283	* @attvalue: the attribute value
3284	*
3285	* Checks an http-equiv attribute from a Meta tag to detect
3286	* the encoding
3287	* If a new encoding is detected the parser is switched to decode
3288	* it and pass UTF8
3289	*/
3290	static void
3291	htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3292	const xmlChar *encoding;
3293
3294	if ((ctxt == NULL) \|\| (attvalue == NULL))
3295	return;
3296
3297	/* do not change encoding */
3298	if (ctxt->input->encoding != NULL)
3299	return;
3300
3301	encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
3302	if (encoding != NULL) {
3303	encoding += 8;
3304	} else {
3305	encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
3306	if (encoding != NULL)
3307	encoding += 9;
3308	}
3309	if (encoding != NULL) {
3310	xmlCharEncoding enc;
3311	xmlCharEncodingHandlerPtr handler;
3312
3313	while ((encoding == ' ') \|\| (encoding == '\t')) encoding++;
3314
3315	if (ctxt->input->encoding != NULL)
3316	xmlFree((xmlChar *) ctxt->input->encoding);
3317	ctxt->input->encoding = xmlStrdup(encoding);
3318
3319	enc = xmlParseCharEncoding((const char *) encoding);
3320	/*
3321	* registered set of known encodings
3322	*/
3323	if (enc != XML_CHAR_ENCODING_ERROR) {
3324	if (((enc == XML_CHAR_ENCODING_UTF16LE) \|\|
3325	(enc == XML_CHAR_ENCODING_UTF16BE) \|\|
3326	(enc == XML_CHAR_ENCODING_UCS4LE) \|\|
3327	(enc == XML_CHAR_ENCODING_UCS4BE)) &&
3328	(ctxt->input->buf != NULL) &&
3329	(ctxt->input->buf->encoder == NULL)) {
3330	htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3331	"htmlCheckEncoding: wrong encoding meta\n",
3332	NULL, NULL);
3333	} else {
3334	xmlSwitchEncoding(ctxt, enc);
3335	}
3336	ctxt->charset = XML_CHAR_ENCODING_UTF8;
3337	} else {
3338	/*
3339	* fallback for unknown encodings
3340	*/
3341	handler = xmlFindCharEncodingHandler((const char *) encoding);
3342	if (handler != NULL) {
3343	xmlSwitchToEncoding(ctxt, handler);
3344	ctxt->charset = XML_CHAR_ENCODING_UTF8;
3345	} else {
3346	ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
3347	}
3348	}
3349
3350	if ((ctxt->input->buf != NULL) &&
3351	(ctxt->input->buf->encoder != NULL) &&
3352	(ctxt->input->buf->raw != NULL) &&
3353	(ctxt->input->buf->buffer != NULL)) {
3354	int nbchars;
3355	int processed;
3356
3357	/*
3358	* convert as much as possible to the parser reading buffer.
3359	*/
3360	processed = ctxt->input->cur - ctxt->input->base;
3361	xmlBufferShrink(ctxt->input->buf->buffer, processed);
3362	nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
3363	ctxt->input->buf->buffer,
3364	ctxt->input->buf->raw);
3365	if (nbchars < 0) {
3366	htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3367	"htmlCheckEncoding: encoder error\n",
3368	NULL, NULL);
3369	}
3370	ctxt->input->base =
3371	ctxt->input->cur = ctxt->input->buf->buffer->content;
3372	}
3373	}
3374	}
3375
3376	/**
3377	* htmlCheckMeta:
3378	* @ctxt: an HTML parser context
3379	* @atts: the attributes values
3380	*
3381	* Checks an attributes from a Meta tag
3382	*/
3383	static void
3384	htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3385	int i;
3386	const xmlChar att, value;
3387	int http = 0;
3388	const xmlChar *content = NULL;
3389
3390	if ((ctxt == NULL) \|\| (atts == NULL))
3391	return;
3392
3393	i = 0;
3394	att = atts[i++];
3395	while (att != NULL) {
3396	value = atts[i++];
3397	if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3398	&& (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3399	http = 1;
3400	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3401	content = value;
3402	att = atts[i++];
3403	}
3404	if ((http) && (content != NULL))
3405	htmlCheckEncoding(ctxt, content);
3406
3407	}
3408
3409	/**
3410	* htmlParseStartTag:
3411	* @ctxt: an HTML parser context
3412	*
3413	* parse a start of tag either for rule element or
3414	* EmptyElement. In both case we don't parse the tag closing chars.
3415	*
3416	* [40] STag ::= '<' Name (S Attribute)* S? '>'
3417	*
3418	* [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3419	*
3420	* With namespace:
3421	*
3422	* [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3423	*
3424	* [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3425	*
3426	* Returns 0 in case of success and -1 in case of error.
3427	*/
3428
3429	static int
3430	htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3431	const xmlChar *name;
3432	const xmlChar *attname;
3433	xmlChar *attvalue;
3434	const xmlChar **atts;
3435	int nbatts = 0;
3436	int maxatts;
3437	int meta = 0;
3438	int i;
3439
3440	if ((ctxt == NULL) \|\| (ctxt->input == NULL)) {
3441	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3442	"htmlParseStartTag: context error\n", NULL, NULL);
3443	return -1;
3444	}
3445	if (CUR != '<') return -1;
3446	NEXT;
3447
3448	atts = ctxt->atts;
3449	maxatts = ctxt->maxatts;
3450
3451	GROW;
3452	name = htmlParseHTMLName(ctxt);
3453	if (name == NULL) {
3454	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3455	"htmlParseStartTag: invalid element name\n",
3456	NULL, NULL);
3457	/* Dump the bogus tag like browsers do */
3458	while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
3459	NEXT;
3460	return -1;
3461	}
3462	if (xmlStrEqual(name, BAD_CAST"meta"))
3463	meta = 1;
3464
3465	/*
3466	* Check for auto-closure of HTML elements.
3467	*/
3468	htmlAutoClose(ctxt, name);
3469
3470	/*
3471	* Check for implied HTML elements.
3472	*/
3473	htmlCheckImplied(ctxt, name);
3474
3475	/*
3476	* Avoid html at any level > 0, head at any level != 1
3477	* or any attempt to recurse body
3478	*/
3479	if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3480	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3481	"htmlParseStartTag: misplaced <html> tag\n",
3482	name, NULL);
3483	return 0;
3484	}
3485	if ((ctxt->nameNr != 1) &&
3486	(xmlStrEqual(name, BAD_CAST"head"))) {
3487	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3488	"htmlParseStartTag: misplaced <head> tag\n",
3489	name, NULL);
3490	return 0;
3491	}
3492	if (xmlStrEqual(name, BAD_CAST"body")) {
3493	int indx;
3494	for (indx = 0;indx < ctxt->nameNr;indx++) {
3495	if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
3496	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3497	"htmlParseStartTag: misplaced <body> tag\n",
3498	name, NULL);
3499	while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
3500	NEXT;
3501	return 0;
3502	}
3503	}
3504	}
3505
3506	/*
3507	* Now parse the attributes, it ends up with the ending
3508	*
3509	* (S Attribute)* S?
3510	*/
3511	SKIP_BLANKS;
3512	while ((IS_CHAR_CH(CUR)) &&
3513	(CUR != '>') &&
3514	((CUR != '/') \|\| (NXT(1) != '>'))) {
3515	long cons = ctxt->nbChars;
3516
3517	GROW;
3518	attname = htmlParseAttribute(ctxt, &attvalue);
3519	if (attname != NULL) {
3520
3521	/*
3522	* Well formedness requires at most one declaration of an attribute
3523	*/
3524	for (i = 0; i < nbatts;i += 2) {
3525	if (xmlStrEqual(atts[i], attname)) {
3526	htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3527	"Attribute %s redefined\n", attname, NULL);
3528	if (attvalue != NULL)
3529	xmlFree(attvalue);
3530	goto failed;
3531	}
3532	}
3533
3534	/*
3535	* Add the pair to atts
3536	*/
3537	if (atts == NULL) {
3538	maxatts = 22; /* allow for 10 attrs by default */
3539	atts = (const xmlChar **)
3540	xmlMalloc(maxatts * sizeof(xmlChar *));
3541	if (atts == NULL) {
3542	htmlErrMemory(ctxt, NULL);
3543	if (attvalue != NULL)
3544	xmlFree(attvalue);
3545	goto failed;
3546	}
3547	ctxt->atts = atts;
3548	ctxt->maxatts = maxatts;
3549	} else if (nbatts + 4 > maxatts) {
3550	const xmlChar **n;
3551
3552	maxatts *= 2;
3553	n = (const xmlChar *) xmlRealloc((void ) atts,
3554	maxatts * sizeof(const xmlChar *));
3555	if (n == NULL) {
3556	htmlErrMemory(ctxt, NULL);
3557	if (attvalue != NULL)
3558	xmlFree(attvalue);
3559	goto failed;
3560	}
3561	atts = n;
3562	ctxt->atts = atts;
3563	ctxt->maxatts = maxatts;
3564	}
3565	atts[nbatts++] = attname;
3566	atts[nbatts++] = attvalue;
3567	atts[nbatts] = NULL;
3568	atts[nbatts + 1] = NULL;
3569	}
3570	else {
3571	if (attvalue != NULL)
3572	xmlFree(attvalue);
3573	/* Dump the bogus attribute string up to the next blank or
3574	* the end of the tag. */
3575	while ((IS_CHAR_CH(CUR)) &&
3576	!(IS_BLANK_CH(CUR)) && (CUR != '>') &&
3577	((CUR != '/') \|\| (NXT(1) != '>')))
3578	NEXT;
3579	}
3580
3581	failed:
3582	SKIP_BLANKS;
3583	if (cons == ctxt->nbChars) {
3584	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3585	"htmlParseStartTag: problem parsing attributes\n",
3586	NULL, NULL);
3587	break;
3588	}
3589	}
3590
3591	/*
3592	* Handle specific association to the META tag
3593	*/
3594	if (meta && (nbatts != 0))
3595	htmlCheckMeta(ctxt, atts);
3596
3597	/*
3598	* SAX: Start of Element !
3599	*/
3600	htmlnamePush(ctxt, name);
3601	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3602	if (nbatts != 0)
3603	ctxt->sax->startElement(ctxt->userData, name, atts);
3604	else
3605	ctxt->sax->startElement(ctxt->userData, name, NULL);
3606	}
3607
3608	if (atts != NULL) {
3609	for (i = 1;i < nbatts;i += 2) {
3610	if (atts[i] != NULL)
3611	xmlFree((xmlChar *) atts[i]);
3612	}
3613	}
3614
3615	return 0;
3616	}
3617
3618	/**
3619	* htmlParseEndTag:
3620	* @ctxt: an HTML parser context
3621	*
3622	* parse an end of tag
3623	*
3624	* [42] ETag ::= '</' Name S? '>'
3625	*
3626	* With namespace
3627	*
3628	* [NS 9] ETag ::= '</' QName S? '>'
3629	*
3630	* Returns 1 if the current level should be closed.
3631	*/
3632
3633	static int
3634	htmlParseEndTag(htmlParserCtxtPtr ctxt)
3635	{
3636	const xmlChar *name;
3637	const xmlChar *oldname;
3638	int i, ret;
3639
3640	if ((CUR != '<') \|\| (NXT(1) != '/')) {
3641	htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3642	"htmlParseEndTag: '</' not found\n", NULL, NULL);
3643	return (0);
3644	}
3645	SKIP(2);
3646
3647	name = htmlParseHTMLName(ctxt);
3648	if (name == NULL)
3649	return (0);
3650
3651	/*
3652	* We should definitely be at the ending "S? '>'" part
3653	*/
3654	SKIP_BLANKS;
3655	if ((!IS_CHAR_CH(CUR)) \|\| (CUR != '>')) {
3656	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3657	"End tag : expected '>'\n", NULL, NULL);
3658	if (ctxt->recovery) {
3659	/*
3660	* We're not at the ending > !!
3661	* Error, unless in recover mode where we search forwards
3662	* until we find a >
3663	*/
3664	while (CUR != '\0' && CUR != '>') NEXT;
3665	NEXT;
3666	}
3667	} else
3668	NEXT;
3669
3670	/*
3671	* If the name read is not one of the element in the parsing stack
3672	* then return, it's just an error.
3673	*/
3674	for (i = (ctxt->nameNr - 1); i >= 0; i--) {
3675	if (xmlStrEqual(name, ctxt->nameTab[i]))
3676	break;
3677	}
3678	if (i < 0) {
3679	htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3680	"Unexpected end tag : %s\n", name, NULL);
3681	return (0);
3682	}
3683
3684
3685	/*
3686	* Check for auto-closure of HTML elements.
3687	*/
3688
3689	htmlAutoCloseOnClose(ctxt, name);
3690
3691	/*
3692	* Well formedness constraints, opening and closing must match.
3693	* With the exception that the autoclose may have popped stuff out
3694	* of the stack.
3695	*/
3696	if (!xmlStrEqual(name, ctxt->name)) {
3697	if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
3698	htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3699	"Opening and ending tag mismatch: %s and %s\n",
3700	name, ctxt->name);
3701	}
3702	}
3703
3704	/*
3705	* SAX: End of Tag
3706	*/
3707	oldname = ctxt->name;
3708	if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
3709	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3710	ctxt->sax->endElement(ctxt->userData, name);
3711	htmlnamePop(ctxt);
3712	ret = 1;
3713	} else {
3714	ret = 0;
3715	}
3716
3717	return (ret);
3718	}
3719
3720
3721	/**
3722	* htmlParseReference:
3723	* @ctxt: an HTML parser context
3724	*
3725	* parse and handle entity references in content,
3726	* this will end-up in a call to character() since this is either a
3727	* CharRef, or a predefined entity.
3728	*/
3729	static void
3730	htmlParseReference(htmlParserCtxtPtr ctxt) {
3731	const htmlEntityDesc * ent;
3732	xmlChar out[6];
3733	const xmlChar *name;
3734	if (CUR != '&') return;
3735
3736	if (NXT(1) == '#') {
3737	unsigned int c;
3738	int bits, i = 0;
3739
3740	c = htmlParseCharRef(ctxt);
3741	if (c == 0)
3742	return;
3743
3744	if (c < 0x80) { out[i++]= c; bits= -6; }
3745	else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
3746	else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
3747	else { out[i++]=((c >> 18) & 0x07) \| 0xF0; bits= 12; }
3748
3749	for ( ; bits >= 0; bits-= 6) {
3750	out[i++]= ((c >> bits) & 0x3F) \| 0x80;
3751	}
3752	out[i] = 0;
3753
3754	htmlCheckParagraph(ctxt);
3755	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3756	ctxt->sax->characters(ctxt->userData, out, i);
3757	} else {
3758	ent = htmlParseEntityRef(ctxt, &name);
3759	if (name == NULL) {
3760	htmlCheckParagraph(ctxt);
3761	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3762	ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3763	return;
3764	}
3765	if ((ent == NULL) \|\| !(ent->value > 0)) {
3766	htmlCheckParagraph(ctxt);
3767	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3768	ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3769	ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3770	/* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3771	}
3772	} else {
3773	unsigned int c;
3774	int bits, i = 0;
3775
3776	c = ent->value;
3777	if (c < 0x80)
3778	{ out[i++]= c; bits= -6; }
3779	else if (c < 0x800)
3780	{ out[i++]=((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
3781	else if (c < 0x10000)
3782	{ out[i++]=((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
3783	else
3784	{ out[i++]=((c >> 18) & 0x07) \| 0xF0; bits= 12; }
3785
3786	for ( ; bits >= 0; bits-= 6) {
3787	out[i++]= ((c >> bits) & 0x3F) \| 0x80;
3788	}
3789	out[i] = 0;
3790
3791	htmlCheckParagraph(ctxt);
3792	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3793	ctxt->sax->characters(ctxt->userData, out, i);
3794	}
3795	}
3796	}
3797
3798	/**
3799	* htmlParseContent:
3800	* @ctxt: an HTML parser context
3801	*
3802	* Parse a content: comment, sub-element, reference or text.
3803	*/
3804
3805	static void
3806	htmlParseContent(htmlParserCtxtPtr ctxt) {
3807	xmlChar *currentNode;
3808	int depth;
3809	const xmlChar *name;
3810
3811	currentNode = xmlStrdup(ctxt->name);
3812	depth = ctxt->nameNr;
3813	while (1) {
3814	long cons = ctxt->nbChars;
3815
3816	GROW;
3817	/*
3818	* Our tag or one of it's parent or children is ending.
3819	*/
3820	if ((CUR == '<') && (NXT(1) == '/')) {
3821	if (htmlParseEndTag(ctxt) &&
3822	((currentNode != NULL) \|\| (ctxt->nameNr == 0))) {
3823	if (currentNode != NULL)
3824	xmlFree(currentNode);
3825	return;
3826	}
3827	continue; /* while */
3828	}
3829
3830	else if ((CUR == '<') &&
3831	((IS_ASCII_LETTER(NXT(1))) \|\|
3832	(NXT(1) == '_') \|\| (NXT(1) == ':'))) {
3833	name = htmlParseHTMLName_nonInvasive(ctxt);
3834	if (name == NULL) {
3835	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3836	"htmlParseStartTag: invalid element name\n",
3837	NULL, NULL);
3838	/* Dump the bogus tag like browsers do */
3839	while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
3840	NEXT;
3841
3842	if (currentNode != NULL)
3843	xmlFree(currentNode);
3844	return;
3845	}
3846
3847	if (ctxt->name != NULL) {
3848	if (htmlCheckAutoClose(name, ctxt->name) == 1) {
3849	htmlAutoClose(ctxt, name);
3850	continue;
3851	}
3852	}
3853	}
3854
3855	/*
3856	* Has this node been popped out during parsing of
3857	* the next element
3858	*/
3859	if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3860	(!xmlStrEqual(currentNode, ctxt->name)))
3861	{
3862	if (currentNode != NULL) xmlFree(currentNode);
3863	return;
3864	}
3865
3866	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) \|\|
3867	(xmlStrEqual(currentNode, BAD_CAST"style")))) {
3868	/*
3869	* Handle SCRIPT/STYLE separately
3870	*/
3871	htmlParseScript(ctxt);
3872	} else {
3873	/*
3874	* Sometimes DOCTYPE arrives in the middle of the document
3875	*/
3876	if ((CUR == '<') && (NXT(1) == '!') &&
3877	(UPP(2) == 'D') && (UPP(3) == 'O') &&
3878	(UPP(4) == 'C') && (UPP(5) == 'T') &&
3879	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
3880	(UPP(8) == 'E')) {
3881	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3882	"Misplaced DOCTYPE declaration\n",
3883	BAD_CAST "DOCTYPE" , NULL);
3884	htmlParseDocTypeDecl(ctxt);
3885	}
3886
3887	/*
3888	* First case : a comment
3889	*/
3890	if ((CUR == '<') && (NXT(1) == '!') &&
3891	(NXT(2) == '-') && (NXT(3) == '-')) {
3892	htmlParseComment(ctxt);
3893	}
3894
3895	/*
3896	* Second case : a Processing Instruction.
3897	*/
3898	else if ((CUR == '<') && (NXT(1) == '?')) {
3899	htmlParsePI(ctxt);
3900	}
3901
3902	/*
3903	* Third case : a sub-element.
3904	*/
3905	else if (CUR == '<') {
3906	htmlParseElement(ctxt);
3907	}
3908
3909	/*
3910	* Fourth case : a reference. If if has not been resolved,
3911	* parsing returns it's Name, create the node
3912	*/
3913	else if (CUR == '&') {
3914	htmlParseReference(ctxt);
3915	}
3916
3917	/*
3918	* Fifth case : end of the resource
3919	*/
3920	else if (CUR == 0) {
3921	htmlAutoCloseOnEnd(ctxt);
3922	break;
3923	}
3924
3925	/*
3926	* Last case, text. Note that References are handled directly.
3927	*/
3928	else {
3929	htmlParseCharData(ctxt);
3930	}
3931
3932	if (cons == ctxt->nbChars) {
3933	if (ctxt->node != NULL) {
3934	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3935	"detected an error in element content\n",
3936	NULL, NULL);
3937	}
3938	break;
3939	}
3940	}
3941	GROW;
3942	}
3943	if (currentNode != NULL) xmlFree(currentNode);
3944	}
3945
3946	/**
3947	* htmlParseContent:
3948	* @ctxt: an HTML parser context
3949	*
3950	* Parse a content: comment, sub-element, reference or text.
3951	*/
3952
3953	void
3954	__htmlParseContent(void *ctxt) {
3955	if (ctxt != NULL)
3956	htmlParseContent((htmlParserCtxtPtr) ctxt);
3957	}
3958
3959	/**
3960	* htmlParseElement:
3961	* @ctxt: an HTML parser context
3962	*
3963	* parse an HTML element, this is highly recursive
3964	*
3965	* [39] element ::= EmptyElemTag \| STag content ETag
3966	*
3967	* [41] Attribute ::= Name Eq AttValue
3968	*/
3969
3970	void
3971	htmlParseElement(htmlParserCtxtPtr ctxt) {
3972	const xmlChar *name;
3973	xmlChar *currentNode = NULL;
3974	const htmlElemDesc * info;
3975	htmlParserNodeInfo node_info;
3976	int failed;
3977	int depth;
3978	const xmlChar *oldptr;
3979
3980	if ((ctxt == NULL) \|\| (ctxt->input == NULL)) {
3981	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3982	"htmlParseElement: context error\n", NULL, NULL);
3983	return;
3984	}
3985	/* Capture start position */
3986	if (ctxt->record_info) {
3987	node_info.begin_pos = ctxt->input->consumed +
3988	(CUR_PTR - ctxt->input->base);
3989	node_info.begin_line = ctxt->input->line;
3990	}
3991
3992	failed = htmlParseStartTag(ctxt);
3993	name = ctxt->name;
3994	if (failed \|\| (name == NULL)) {
3995	if (CUR == '>')
3996	NEXT;
3997	return;
3998	}
3999
4000	/*
4001	* Lookup the info for that element.
4002	*/
4003	info = htmlTagLookup(name);
4004	if (info == NULL) {
4005	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4006	"Tag %s invalid\n", name, NULL);
4007	}
4008
4009	/*
4010	* Check for an Empty Element labeled the XML/SGML way
4011	*/
4012	if ((CUR == '/') && (NXT(1) == '>')) {
4013	SKIP(2);
4014	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4015	ctxt->sax->endElement(ctxt->userData, name);
4016	htmlnamePop(ctxt);
4017	return;
4018	}
4019
4020	if (CUR == '>') {
4021	NEXT;
4022	} else {
4023	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4024	"Couldn't find end of Start Tag %s\n", name, NULL);
4025
4026	/*
4027	* end of parsing of this node.
4028	*/
4029	if (xmlStrEqual(name, ctxt->name)) {
4030	nodePop(ctxt);
4031	htmlnamePop(ctxt);
4032	}
4033
4034	/*
4035	* Capture end position and add node
4036	*/
4037	if (ctxt->record_info) {
4038	node_info.end_pos = ctxt->input->consumed +
4039	(CUR_PTR - ctxt->input->base);
4040	node_info.end_line = ctxt->input->line;
4041	node_info.node = ctxt->node;
4042	xmlParserAddNodeInfo(ctxt, &node_info);
4043	}
4044	return;
4045	}
4046
4047	/*
4048	* Check for an Empty Element from DTD definition
4049	*/
4050	if ((info != NULL) && (info->empty)) {
4051	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4052	ctxt->sax->endElement(ctxt->userData, name);
4053	htmlnamePop(ctxt);
4054	return;
4055	}
4056
4057	/*
4058	* Parse the content of the element:
4059	*/
4060	currentNode = xmlStrdup(ctxt->name);
4061	depth = ctxt->nameNr;
4062	while (IS_CHAR_CH(CUR)) {
4063	oldptr = ctxt->input->cur;
4064	htmlParseContent(ctxt);
4065	if (oldptr==ctxt->input->cur) break;
4066	if (ctxt->nameNr < depth) break;
4067	}
4068
4069	/*
4070	* Capture end position and add node
4071	*/
4072	if ( currentNode != NULL && ctxt->record_info ) {
4073	node_info.end_pos = ctxt->input->consumed +
4074	(CUR_PTR - ctxt->input->base);
4075	node_info.end_line = ctxt->input->line;
4076	node_info.node = ctxt->node;
4077	xmlParserAddNodeInfo(ctxt, &node_info);
4078	}
4079	if (!IS_CHAR_CH(CUR)) {
4080	htmlAutoCloseOnEnd(ctxt);
4081	}
4082
4083	if (currentNode != NULL)
4084	xmlFree(currentNode);
4085	}
4086
4087	/**
4088	* htmlParseDocument:
4089	* @ctxt: an HTML parser context
4090	*
4091	* parse an HTML document (and build a tree if using the standard SAX
4092	* interface).
4093	*
4094	* Returns 0, -1 in case of error. the parser context is augmented
4095	* as a result of the parsing.
4096	*/
4097
4098	int
4099	htmlParseDocument(htmlParserCtxtPtr ctxt) {
4100	xmlDtdPtr dtd;
4101
4102	xmlInitParser();
4103
4104	htmlDefaultSAXHandlerInit();
4105
4106	if ((ctxt == NULL) \|\| (ctxt->input == NULL)) {
4107	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4108	"htmlParseDocument: context error\n", NULL, NULL);
4109	return(XML_ERR_INTERNAL_ERROR);
4110	}
4111	ctxt->html = 1;
4112	GROW;
4113	/*
4114	* SAX: beginning of the document processing.
4115	*/
4116	if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4117	ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4118
4119	/*
4120	* Wipe out everything which is before the first '<'
4121	*/
4122	SKIP_BLANKS;
4123	if (CUR == 0) {
4124	htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4125	"Document is empty\n", NULL, NULL);
4126	}
4127
4128	if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4129	ctxt->sax->startDocument(ctxt->userData);
4130
4131
4132	/*
4133	* Parse possible comments and PIs before any content
4134	*/
4135	while (((CUR == '<') && (NXT(1) == '!') &&
4136	(NXT(2) == '-') && (NXT(3) == '-')) \|\|
4137	((CUR == '<') && (NXT(1) == '?'))) {
4138	htmlParseComment(ctxt);
4139	htmlParsePI(ctxt);
4140	SKIP_BLANKS;
4141	}
4142
4143
4144	/*
4145	* Then possibly doc type declaration(s) and more Misc
4146	* (doctypedecl Misc*)?
4147	*/
4148	if ((CUR == '<') && (NXT(1) == '!') &&
4149	(UPP(2) == 'D') && (UPP(3) == 'O') &&
4150	(UPP(4) == 'C') && (UPP(5) == 'T') &&
4151	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
4152	(UPP(8) == 'E')) {
4153	htmlParseDocTypeDecl(ctxt);
4154	}
4155	SKIP_BLANKS;
4156
4157	/*
4158	* Parse possible comments and PIs before any content
4159	*/
4160	while (((CUR == '<') && (NXT(1) == '!') &&
4161	(NXT(2) == '-') && (NXT(3) == '-')) \|\|
4162	((CUR == '<') && (NXT(1) == '?'))) {
4163	htmlParseComment(ctxt);
4164	htmlParsePI(ctxt);
4165	SKIP_BLANKS;
4166	}
4167
4168	/*
4169	* Time to start parsing the tree itself
4170	*/
4171	htmlParseContent(ctxt);
4172
4173	/*
4174	* autoclose
4175	*/
4176	if (CUR == 0)
4177	htmlAutoCloseOnEnd(ctxt);
4178
4179
4180	/*
4181	* SAX: end of the document processing.
4182	*/
4183	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4184	ctxt->sax->endDocument(ctxt->userData);
4185
4186	if (ctxt->myDoc != NULL) {
4187	dtd = xmlGetIntSubset(ctxt->myDoc);
4188	if (dtd == NULL)
4189	ctxt->myDoc->intSubset =
4190	xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
4191	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4192	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4193	}
4194	if (! ctxt->wellFormed) return(-1);
4195	return(0);
4196	}
4197
4198
4199	/************************************************************************
4200	* *
4201	* Parser contexts handling *
4202	* *
4203	************************************************************************/
4204
4205	/**
4206	* htmlInitParserCtxt:
4207	* @ctxt: an HTML parser context
4208	*
4209	* Initialize a parser context
4210	*
4211	* Returns 0 in case of success and -1 in case of error
4212	*/
4213
4214	static int
4215	htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4216	{
4217	htmlSAXHandler *sax;
4218
4219	if (ctxt == NULL) return(-1);
4220	memset(ctxt, 0, sizeof(htmlParserCtxt));
4221
4222	ctxt->dict = xmlDictCreate();
4223	if (ctxt->dict == NULL) {
4224	htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4225	return(-1);
4226	}
4227	sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4228	if (sax == NULL) {
4229	htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4230	return(-1);
4231	}
4232	else
4233	memset(sax, 0, sizeof(htmlSAXHandler));
4234
4235	/* Allocate the Input stack */
4236	ctxt->inputTab = (htmlParserInputPtr *)
4237	xmlMalloc(5 * sizeof(htmlParserInputPtr));
4238	if (ctxt->inputTab == NULL) {
4239	htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4240	ctxt->inputNr = 0;
4241	ctxt->inputMax = 0;
4242	ctxt->input = NULL;
4243	return(-1);
4244	}
4245	ctxt->inputNr = 0;
4246	ctxt->inputMax = 5;
4247	ctxt->input = NULL;
4248	ctxt->version = NULL;
4249	ctxt->encoding = NULL;
4250	ctxt->standalone = -1;
4251	ctxt->instate = XML_PARSER_START;
4252
4253	/* Allocate the Node stack */
4254	ctxt->nodeTab = (htmlNodePtr ) xmlMalloc(10 sizeof(htmlNodePtr));
4255	if (ctxt->nodeTab == NULL) {
4256	htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4257	ctxt->nodeNr = 0;
4258	ctxt->nodeMax = 0;
4259	ctxt->node = NULL;
4260	ctxt->inputNr = 0;
4261	ctxt->inputMax = 0;
4262	ctxt->input = NULL;
4263	return(-1);
4264	}
4265	ctxt->nodeNr = 0;
4266	ctxt->nodeMax = 10;
4267	ctxt->node = NULL;
4268
4269	/* Allocate the Name stack */
4270	ctxt->nameTab = (const xmlChar *) xmlMalloc(10 sizeof(xmlChar *));
4271	if (ctxt->nameTab == NULL) {
4272	htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4273	ctxt->nameNr = 0;
4274	ctxt->nameMax = 10;
4275	ctxt->name = NULL;
4276	ctxt->nodeNr = 0;
4277	ctxt->nodeMax = 0;
4278	ctxt->node = NULL;
4279	ctxt->inputNr = 0;
4280	ctxt->inputMax = 0;
4281	ctxt->input = NULL;
4282	return(-1);
4283	}
4284	ctxt->nameNr = 0;
4285	ctxt->nameMax = 10;
4286	ctxt->name = NULL;
4287
4288	if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
4289	else {
4290	ctxt->sax = sax;
4291	memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
4292	}
4293	ctxt->userData = ctxt;
4294	ctxt->myDoc = NULL;
4295	ctxt->wellFormed = 1;
4296	ctxt->replaceEntities = 0;
4297	ctxt->linenumbers = xmlLineNumbersDefaultValue;
4298	ctxt->html = 1;
4299	ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
4300	ctxt->vctxt.userData = ctxt;
4301	ctxt->vctxt.error = xmlParserValidityError;
4302	ctxt->vctxt.warning = xmlParserValidityWarning;
4303	ctxt->record_info = 0;
4304	ctxt->validate = 0;
4305	ctxt->nbChars = 0;
4306	ctxt->checkIndex = 0;
4307	ctxt->catalogs = NULL;
4308	xmlInitNodeInfoSeq(&ctxt->node_seq);
4309	return(0);
4310	}
4311
4312	/**
4313	* htmlFreeParserCtxt:
4314	* @ctxt: an HTML parser context
4315	*
4316	* Free all the memory used by a parser context. However the parsed
4317	* document in ctxt->myDoc is not freed.
4318	*/
4319
4320	void
4321	htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4322	{
4323	xmlFreeParserCtxt(ctxt);
4324	}
4325
4326	/**
4327	* htmlNewParserCtxt:
4328	*
4329	* Allocate and initialize a new parser context.
4330	*
4331	* Returns the htmlParserCtxtPtr or NULL in case of allocation error
4332	*/
4333
4334	htmlParserCtxtPtr
4335	htmlNewParserCtxt(void)
4336	{
4337	xmlParserCtxtPtr ctxt;
4338
4339	ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4340	if (ctxt == NULL) {
4341	htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
4342	return(NULL);
4343	}
4344	memset(ctxt, 0, sizeof(xmlParserCtxt));
4345	if (htmlInitParserCtxt(ctxt) < 0) {
4346	htmlFreeParserCtxt(ctxt);
4347	return(NULL);
4348	}
4349	return(ctxt);
4350	}
4351
4352	/**
4353	* htmlCreateMemoryParserCtxt:
4354	* @buffer: a pointer to a char array
4355	* @size: the size of the array
4356	*
4357	* Create a parser context for an HTML in-memory document.
4358	*
4359	* Returns the new parser context or NULL
4360	*/
4361	htmlParserCtxtPtr
4362	htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4363	xmlParserCtxtPtr ctxt;
4364	xmlParserInputPtr input;
4365	xmlParserInputBufferPtr buf;
4366
4367	if (buffer == NULL)
4368	return(NULL);
4369	if (size <= 0)
4370	return(NULL);
4371
4372	ctxt = htmlNewParserCtxt();
4373	if (ctxt == NULL)
4374	return(NULL);
4375
4376	buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
4377	if (buf == NULL) return(NULL);
4378
4379	input = xmlNewInputStream(ctxt);
4380	if (input == NULL) {
4381	xmlFreeParserCtxt(ctxt);
4382	return(NULL);
4383	}
4384
4385	input->filename = NULL;
4386	input->buf = buf;
4387	input->base = input->buf->buffer->content;
4388	input->cur = input->buf->buffer->content;
4389	input->end = &input->buf->buffer->content[input->buf->buffer->use];
4390
4391	inputPush(ctxt, input);
4392	return(ctxt);
4393	}
4394
4395	/**
4396	* htmlCreateDocParserCtxt:
4397	* @cur: a pointer to an array of xmlChar
4398	* @encoding: a free form C string describing the HTML document encoding, or NULL
4399	*
4400	* Create a parser context for an HTML document.
4401	*
4402	* TODO: check the need to add encoding handling there
4403	*
4404	* Returns the new parser context or NULL
4405	*/
4406	static htmlParserCtxtPtr
4407	htmlCreateDocParserCtxt(const xmlChar cur, const char encoding) {
4408	int len;
4409	htmlParserCtxtPtr ctxt;
4410
4411	if (cur == NULL)
4412	return(NULL);
4413	len = xmlStrlen(cur);
4414	ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
4415	if (ctxt == NULL)
4416	return(NULL);
4417
4418	if (encoding != NULL) {
4419	xmlCharEncoding enc;
4420	xmlCharEncodingHandlerPtr handler;
4421
4422	if (ctxt->input->encoding != NULL)
4423	xmlFree((xmlChar *) ctxt->input->encoding);
4424	ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
4425
4426	enc = xmlParseCharEncoding(encoding);
4427	/*
4428	* registered set of known encodings
4429	*/
4430	if (enc != XML_CHAR_ENCODING_ERROR) {
4431	xmlSwitchEncoding(ctxt, enc);
4432	if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
4433	htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4434	"Unsupported encoding %s\n",
4435	(const xmlChar *) encoding, NULL);
4436	}
4437	} else {
4438	/*
4439	* fallback for unknown encodings
4440	*/
4441	handler = xmlFindCharEncodingHandler((const char *) encoding);
4442	if (handler != NULL) {
4443	xmlSwitchToEncoding(ctxt, handler);
4444	} else {
4445	htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4446	"Unsupported encoding %s\n",
4447	(const xmlChar *) encoding, NULL);
4448	}
4449	}
4450	}
4451	return(ctxt);
4452	}
4453
4454	#ifdef LIBXML_PUSH_ENABLED
4455	/************************************************************************
4456	* *
4457	* Progressive parsing interfaces *
4458	* *
4459	************************************************************************/
4460
4461	/**
4462	* htmlParseLookupSequence:
4463	* @ctxt: an HTML parser context
4464	* @first: the first char to lookup
4465	* @next: the next char to lookup or zero
4466	* @third: the next char to lookup or zero
4467	* @comment: flag to force checking inside comments
4468	*
4469	* Try to find if a sequence (first, next, third) or just (first next) or
4470	* (first) is available in the input stream.
4471	* This function has a side effect of (possibly) incrementing ctxt->checkIndex
4472	* to avoid rescanning sequences of bytes, it DOES change the state of the
4473	* parser, do not use liberally.
4474	* This is basically similar to xmlParseLookupSequence()
4475	*
4476	* Returns the index to the current parsing point if the full sequence
4477	* is available, -1 otherwise.
4478	*/
4479	static int
4480	htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
4481	xmlChar next, xmlChar third, int iscomment) {
4482	int base, len;
4483	htmlParserInputPtr in;
4484	const xmlChar *buf;
4485	int incomment = 0;
4486
4487	in = ctxt->input;
4488	if (in == NULL) return(-1);
4489	base = in->cur - in->base;
4490	if (base < 0) return(-1);
4491	if (ctxt->checkIndex > base)
4492	base = ctxt->checkIndex;
4493	if (in->buf == NULL) {
4494	buf = in->base;
4495	len = in->length;
4496	} else {
4497	buf = in->buf->buffer->content;
4498	len = in->buf->buffer->use;
4499	}
4500	/* take into account the sequence length */
4501	if (third) len -= 2;
4502	else if (next) len --;
4503	for (;base < len;base++) {
4504	if (!incomment && (base + 4 < len) && !iscomment) {
4505	if ((buf[base] == '<') && (buf[base + 1] == '!') &&
4506	(buf[base + 2] == '-') && (buf[base + 3] == '-')) {
4507	incomment = 1;
4508	/* do not increment past <! - some people use <!--> */
4509	base += 2;
4510	}
4511	}
4512	if (incomment) {
4513	if (base + 3 > len)
4514	return(-1);
4515	if ((buf[base] == '-') && (buf[base + 1] == '-') &&
4516	(buf[base + 2] == '>')) {
4517	incomment = 0;
4518	base += 2;
4519	}
4520	continue;
4521	}
4522	if (buf[base] == first) {
4523	if (third != 0) {
4524	if ((buf[base + 1] != next) \|\|
4525	(buf[base + 2] != third)) continue;
4526	} else if (next != 0) {
4527	if (buf[base + 1] != next) continue;
4528	}
4529	ctxt->checkIndex = 0;
4530	#ifdef DEBUG_PUSH
4531	if (next == 0)
4532	xmlGenericError(xmlGenericErrorContext,
4533	"HPP: lookup '%c' found at %d\n",
4534	first, base);
4535	else if (third == 0)
4536	xmlGenericError(xmlGenericErrorContext,
4537	"HPP: lookup '%c%c' found at %d\n",
4538	first, next, base);
4539	else
4540	xmlGenericError(xmlGenericErrorContext,
4541	"HPP: lookup '%c%c%c' found at %d\n",
4542	first, next, third, base);
4543	#endif
4544	return(base - (in->cur - in->base));
4545	}
4546	}
4547	ctxt->checkIndex = base;
4548	#ifdef DEBUG_PUSH
4549	if (next == 0)
4550	xmlGenericError(xmlGenericErrorContext,
4551	"HPP: lookup '%c' failed\n", first);
4552	else if (third == 0)
4553	xmlGenericError(xmlGenericErrorContext,
4554	"HPP: lookup '%c%c' failed\n", first, next);
4555	else
4556	xmlGenericError(xmlGenericErrorContext,
4557	"HPP: lookup '%c%c%c' failed\n", first, next, third);
4558	#endif
4559	return(-1);
4560	}
4561
4562	/**
4563	* htmlParseTryOrFinish:
4564	* @ctxt: an HTML parser context
4565	* @terminate: last chunk indicator
4566	*
4567	* Try to progress on parsing
4568	*
4569	* Returns zero if no parsing was possible
4570	*/
4571	static int
4572	htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
4573	int ret = 0;
4574	htmlParserInputPtr in;
4575	int avail = 0;
4576	xmlChar cur, next;
4577
4578	#ifdef DEBUG_PUSH
4579	switch (ctxt->instate) {
4580	case XML_PARSER_EOF:
4581	xmlGenericError(xmlGenericErrorContext,
4582	"HPP: try EOF\n"); break;
4583	case XML_PARSER_START:
4584	xmlGenericError(xmlGenericErrorContext,
4585	"HPP: try START\n"); break;
4586	case XML_PARSER_MISC:
4587	xmlGenericError(xmlGenericErrorContext,
4588	"HPP: try MISC\n");break;
4589	case XML_PARSER_COMMENT:
4590	xmlGenericError(xmlGenericErrorContext,
4591	"HPP: try COMMENT\n");break;
4592	case XML_PARSER_PROLOG:
4593	xmlGenericError(xmlGenericErrorContext,
4594	"HPP: try PROLOG\n");break;
4595	case XML_PARSER_START_TAG:
4596	xmlGenericError(xmlGenericErrorContext,
4597	"HPP: try START_TAG\n");break;
4598	case XML_PARSER_CONTENT:
4599	xmlGenericError(xmlGenericErrorContext,
4600	"HPP: try CONTENT\n");break;
4601	case XML_PARSER_CDATA_SECTION:
4602	xmlGenericError(xmlGenericErrorContext,
4603	"HPP: try CDATA_SECTION\n");break;
4604	case XML_PARSER_END_TAG:
4605	xmlGenericError(xmlGenericErrorContext,
4606	"HPP: try END_TAG\n");break;
4607	case XML_PARSER_ENTITY_DECL:
4608	xmlGenericError(xmlGenericErrorContext,
4609	"HPP: try ENTITY_DECL\n");break;
4610	case XML_PARSER_ENTITY_VALUE:
4611	xmlGenericError(xmlGenericErrorContext,
4612	"HPP: try ENTITY_VALUE\n");break;
4613	case XML_PARSER_ATTRIBUTE_VALUE:
4614	xmlGenericError(xmlGenericErrorContext,
4615	"HPP: try ATTRIBUTE_VALUE\n");break;
4616	case XML_PARSER_DTD:
4617	xmlGenericError(xmlGenericErrorContext,
4618	"HPP: try DTD\n");break;
4619	case XML_PARSER_EPILOG:
4620	xmlGenericError(xmlGenericErrorContext,
4621	"HPP: try EPILOG\n");break;
4622	case XML_PARSER_PI:
4623	xmlGenericError(xmlGenericErrorContext,
4624	"HPP: try PI\n");break;
4625	case XML_PARSER_SYSTEM_LITERAL:
4626	xmlGenericError(xmlGenericErrorContext,
4627	"HPP: try SYSTEM_LITERAL\n");break;
4628	}
4629	#endif
4630
4631	while (1) {
4632
4633	in = ctxt->input;
4634	if (in == NULL) break;
4635	if (in->buf == NULL)
4636	avail = in->length - (in->cur - in->base);
4637	else
4638	avail = in->buf->buffer->use - (in->cur - in->base);
4639	if ((avail == 0) && (terminate)) {
4640	htmlAutoCloseOnEnd(ctxt);
4641	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4642	/*
4643	* SAX: end of the document processing.
4644	*/
4645	ctxt->instate = XML_PARSER_EOF;
4646	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4647	ctxt->sax->endDocument(ctxt->userData);
4648	}
4649	}
4650	if (avail < 1)
4651	goto done;
4652	cur = in->cur[0];
4653	if (cur == 0) {
4654	SKIP(1);
4655	continue;
4656	}
4657
4658	switch (ctxt->instate) {
4659	case XML_PARSER_EOF:
4660	/*
4661	* Document parsing is done !
4662	*/
4663	goto done;
4664	case XML_PARSER_START:
4665	/*
4666	* Very first chars read from the document flow.
4667	*/
4668	cur = in->cur[0];
4669	if (IS_BLANK_CH(cur)) {
4670	SKIP_BLANKS;
4671	if (in->buf == NULL)
4672	avail = in->length - (in->cur - in->base);
4673	else
4674	avail = in->buf->buffer->use - (in->cur - in->base);
4675	}
4676	if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4677	ctxt->sax->setDocumentLocator(ctxt->userData,
4678	&xmlDefaultSAXLocator);
4679	if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4680	(!ctxt->disableSAX))
4681	ctxt->sax->startDocument(ctxt->userData);
4682
4683	cur = in->cur[0];
4684	next = in->cur[1];
4685	if ((cur == '<') && (next == '!') &&
4686	(UPP(2) == 'D') && (UPP(3) == 'O') &&
4687	(UPP(4) == 'C') && (UPP(5) == 'T') &&
4688	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
4689	(UPP(8) == 'E')) {
4690	if ((!terminate) &&
4691	(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4692	goto done;
4693	#ifdef DEBUG_PUSH
4694	xmlGenericError(xmlGenericErrorContext,
4695	"HPP: Parsing internal subset\n");
4696	#endif
4697	htmlParseDocTypeDecl(ctxt);
4698	ctxt->instate = XML_PARSER_PROLOG;
4699	#ifdef DEBUG_PUSH
4700	xmlGenericError(xmlGenericErrorContext,
4701	"HPP: entering PROLOG\n");
4702	#endif
4703	} else {
4704	ctxt->instate = XML_PARSER_MISC;
4705	#ifdef DEBUG_PUSH
4706	xmlGenericError(xmlGenericErrorContext,
4707	"HPP: entering MISC\n");
4708	#endif
4709	}
4710	break;
4711	case XML_PARSER_MISC:
4712	SKIP_BLANKS;
4713	if (in->buf == NULL)
4714	avail = in->length - (in->cur - in->base);
4715	else
4716	avail = in->buf->buffer->use - (in->cur - in->base);
4717	if (avail < 2)
4718	goto done;
4719	cur = in->cur[0];
4720	next = in->cur[1];
4721	if ((cur == '<') && (next == '!') &&
4722	(in->cur[2] == '-') && (in->cur[3] == '-')) {
4723	if ((!terminate) &&
4724	(htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
4725	goto done;
4726	#ifdef DEBUG_PUSH
4727	xmlGenericError(xmlGenericErrorContext,
4728	"HPP: Parsing Comment\n");
4729	#endif
4730	htmlParseComment(ctxt);
4731	ctxt->instate = XML_PARSER_MISC;
4732	} else if ((cur == '<') && (next == '?')) {
4733	if ((!terminate) &&
4734	(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4735	goto done;
4736	#ifdef DEBUG_PUSH
4737	xmlGenericError(xmlGenericErrorContext,
4738	"HPP: Parsing PI\n");
4739	#endif
4740	htmlParsePI(ctxt);
4741	ctxt->instate = XML_PARSER_MISC;
4742	} else if ((cur == '<') && (next == '!') &&
4743	(UPP(2) == 'D') && (UPP(3) == 'O') &&
4744	(UPP(4) == 'C') && (UPP(5) == 'T') &&
4745	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
4746	(UPP(8) == 'E')) {
4747	if ((!terminate) &&
4748	(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4749	goto done;
4750	#ifdef DEBUG_PUSH
4751	xmlGenericError(xmlGenericErrorContext,
4752	"HPP: Parsing internal subset\n");
4753	#endif
4754	htmlParseDocTypeDecl(ctxt);
4755	ctxt->instate = XML_PARSER_PROLOG;
4756	#ifdef DEBUG_PUSH
4757	xmlGenericError(xmlGenericErrorContext,
4758	"HPP: entering PROLOG\n");
4759	#endif
4760	} else if ((cur == '<') && (next == '!') &&
4761	(avail < 9)) {
4762	goto done;
4763	} else {
4764	ctxt->instate = XML_PARSER_START_TAG;
4765	#ifdef DEBUG_PUSH
4766	xmlGenericError(xmlGenericErrorContext,
4767	"HPP: entering START_TAG\n");
4768	#endif
4769	}
4770	break;
4771	case XML_PARSER_PROLOG:
4772	SKIP_BLANKS;
4773	if (in->buf == NULL)
4774	avail = in->length - (in->cur - in->base);
4775	else
4776	avail = in->buf->buffer->use - (in->cur - in->base);
4777	if (avail < 2)
4778	goto done;
4779	cur = in->cur[0];
4780	next = in->cur[1];
4781	if ((cur == '<') && (next == '!') &&
4782	(in->cur[2] == '-') && (in->cur[3] == '-')) {
4783	if ((!terminate) &&
4784	(htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
4785	goto done;
4786	#ifdef DEBUG_PUSH
4787	xmlGenericError(xmlGenericErrorContext,
4788	"HPP: Parsing Comment\n");
4789	#endif
4790	htmlParseComment(ctxt);
4791	ctxt->instate = XML_PARSER_PROLOG;
4792	} else if ((cur == '<') && (next == '?')) {
4793	if ((!terminate) &&
4794	(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4795	goto done;
4796	#ifdef DEBUG_PUSH
4797	xmlGenericError(xmlGenericErrorContext,
4798	"HPP: Parsing PI\n");
4799	#endif
4800	htmlParsePI(ctxt);
4801	ctxt->instate = XML_PARSER_PROLOG;
4802	} else if ((cur == '<') && (next == '!') &&
4803	(avail < 4)) {
4804	goto done;
4805	} else {
4806	ctxt->instate = XML_PARSER_START_TAG;
4807	#ifdef DEBUG_PUSH
4808	xmlGenericError(xmlGenericErrorContext,
4809	"HPP: entering START_TAG\n");
4810	#endif
4811	}
4812	break;
4813	case XML_PARSER_EPILOG:
4814	if (in->buf == NULL)
4815	avail = in->length - (in->cur - in->base);
4816	else
4817	avail = in->buf->buffer->use - (in->cur - in->base);
4818	if (avail < 1)
4819	goto done;
4820	cur = in->cur[0];
4821	if (IS_BLANK_CH(cur)) {
4822	htmlParseCharData(ctxt);
4823	goto done;
4824	}
4825	if (avail < 2)
4826	goto done;
4827	next = in->cur[1];
4828	if ((cur == '<') && (next == '!') &&
4829	(in->cur[2] == '-') && (in->cur[3] == '-')) {
4830	if ((!terminate) &&
4831	(htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
4832	goto done;
4833	#ifdef DEBUG_PUSH
4834	xmlGenericError(xmlGenericErrorContext,
4835	"HPP: Parsing Comment\n");
4836	#endif
4837	htmlParseComment(ctxt);
4838	ctxt->instate = XML_PARSER_EPILOG;
4839	} else if ((cur == '<') && (next == '?')) {
4840	if ((!terminate) &&
4841	(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4842	goto done;
4843	#ifdef DEBUG_PUSH
4844	xmlGenericError(xmlGenericErrorContext,
4845	"HPP: Parsing PI\n");
4846	#endif
4847	htmlParsePI(ctxt);
4848	ctxt->instate = XML_PARSER_EPILOG;
4849	} else if ((cur == '<') && (next == '!') &&
4850	(avail < 4)) {
4851	goto done;
4852	} else {
4853	ctxt->errNo = XML_ERR_DOCUMENT_END;
4854	ctxt->wellFormed = 0;
4855	ctxt->instate = XML_PARSER_EOF;
4856	#ifdef DEBUG_PUSH
4857	xmlGenericError(xmlGenericErrorContext,
4858	"HPP: entering EOF\n");
4859	#endif
4860	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4861	ctxt->sax->endDocument(ctxt->userData);
4862	goto done;
4863	}
4864	break;
4865	case XML_PARSER_START_TAG: {
4866	const xmlChar *name;
4867	int failed;
4868	const htmlElemDesc * info;
4869
4870	if (avail < 2)
4871	goto done;
4872	cur = in->cur[0];
4873	if (cur != '<') {
4874	ctxt->instate = XML_PARSER_CONTENT;
4875	#ifdef DEBUG_PUSH
4876	xmlGenericError(xmlGenericErrorContext,
4877	"HPP: entering CONTENT\n");
4878	#endif
4879	break;
4880	}
4881	if (in->cur[1] == '/') {
4882	ctxt->instate = XML_PARSER_END_TAG;
4883	ctxt->checkIndex = 0;
4884	#ifdef DEBUG_PUSH
4885	xmlGenericError(xmlGenericErrorContext,
4886	"HPP: entering END_TAG\n");
4887	#endif
4888	break;
4889	}
4890	if ((!terminate) &&
4891	(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4892	goto done;
4893
4894	failed = htmlParseStartTag(ctxt);
4895	name = ctxt->name;
4896	if (failed \|\|
4897	(name == NULL)) {
4898	if (CUR == '>')
4899	NEXT;
4900	break;
4901	}
4902
4903	/*
4904	* Lookup the info for that element.
4905	*/
4906	info = htmlTagLookup(name);
4907	if (info == NULL) {
4908	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4909	"Tag %s invalid\n", name, NULL);
4910	}
4911
4912	/*
4913	* Check for an Empty Element labeled the XML/SGML way
4914	*/
4915	if ((CUR == '/') && (NXT(1) == '>')) {
4916	SKIP(2);
4917	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4918	ctxt->sax->endElement(ctxt->userData, name);
4919	htmlnamePop(ctxt);
4920	ctxt->instate = XML_PARSER_CONTENT;
4921	#ifdef DEBUG_PUSH
4922	xmlGenericError(xmlGenericErrorContext,
4923	"HPP: entering CONTENT\n");
4924	#endif
4925	break;
4926	}
4927
4928	if (CUR == '>') {
4929	NEXT;
4930	} else {
4931	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4932	"Couldn't find end of Start Tag %s\n",
4933	name, NULL);
4934
4935	/*
4936	* end of parsing of this node.
4937	*/
4938	if (xmlStrEqual(name, ctxt->name)) {
4939	nodePop(ctxt);
4940	htmlnamePop(ctxt);
4941	}
4942
4943	ctxt->instate = XML_PARSER_CONTENT;
4944	#ifdef DEBUG_PUSH
4945	xmlGenericError(xmlGenericErrorContext,
4946	"HPP: entering CONTENT\n");
4947	#endif
4948	break;
4949	}
4950
4951	/*
4952	* Check for an Empty Element from DTD definition
4953	*/
4954	if ((info != NULL) && (info->empty)) {
4955	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4956	ctxt->sax->endElement(ctxt->userData, name);
4957	htmlnamePop(ctxt);
4958	}
4959	ctxt->instate = XML_PARSER_CONTENT;
4960	#ifdef DEBUG_PUSH
4961	xmlGenericError(xmlGenericErrorContext,
4962	"HPP: entering CONTENT\n");
4963	#endif
4964	break;
4965	}
4966	case XML_PARSER_CONTENT: {
4967	long cons;
4968	/*
4969	* Handle preparsed entities and charRef
4970	*/
4971	if (ctxt->token != 0) {
4972	xmlChar chr[2] = { 0 , 0 } ;
4973
4974	chr[0] = (xmlChar) ctxt->token;
4975	htmlCheckParagraph(ctxt);
4976	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4977	ctxt->sax->characters(ctxt->userData, chr, 1);
4978	ctxt->token = 0;
4979	ctxt->checkIndex = 0;
4980	}
4981	if ((avail == 1) && (terminate)) {
4982	cur = in->cur[0];
4983	if ((cur != '<') && (cur != '&')) {
4984	if (ctxt->sax != NULL) {
4985	if (IS_BLANK_CH(cur)) {
4986	if (ctxt->sax->ignorableWhitespace != NULL)
4987	ctxt->sax->ignorableWhitespace(
4988	ctxt->userData, &cur, 1);
4989	} else {
4990	htmlCheckParagraph(ctxt);
4991	if (ctxt->sax->characters != NULL)
4992	ctxt->sax->characters(
4993	ctxt->userData, &cur, 1);
4994	}
4995	}
4996	ctxt->token = 0;
4997	ctxt->checkIndex = 0;
4998	in->cur++;
4999	break;
5000	}
5001	}
5002	if (avail < 2)
5003	goto done;
5004	cur = in->cur[0];
5005	next = in->cur[1];
5006	cons = ctxt->nbChars;
5007	if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) \|\|
5008	(xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5009	/*
5010	* Handle SCRIPT/STYLE separately
5011	*/
5012	if (!terminate) {
5013	int idx;
5014	xmlChar val;
5015
5016	idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
5017	if (idx < 0)
5018	goto done;
5019	val = in->cur[idx + 2];
5020	if (val == 0) /* bad cut of input */
5021	goto done;
5022	}
5023	htmlParseScript(ctxt);
5024	if ((cur == '<') && (next == '/')) {
5025	ctxt->instate = XML_PARSER_END_TAG;
5026	ctxt->checkIndex = 0;
5027	#ifdef DEBUG_PUSH
5028	xmlGenericError(xmlGenericErrorContext,
5029	"HPP: entering END_TAG\n");
5030	#endif
5031	break;
5032	}
5033	} else {
5034	/*
5035	* Sometimes DOCTYPE arrives in the middle of the document
5036	*/
5037	if ((cur == '<') && (next == '!') &&
5038	(UPP(2) == 'D') && (UPP(3) == 'O') &&
5039	(UPP(4) == 'C') && (UPP(5) == 'T') &&
5040	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
5041	(UPP(8) == 'E')) {
5042	if ((!terminate) &&
5043	(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5044	goto done;
5045	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5046	"Misplaced DOCTYPE declaration\n",
5047	BAD_CAST "DOCTYPE" , NULL);
5048	htmlParseDocTypeDecl(ctxt);
5049	} else if ((cur == '<') && (next == '!') &&
5050	(in->cur[2] == '-') && (in->cur[3] == '-')) {
5051	if ((!terminate) &&
5052	(htmlParseLookupSequence(
5053	ctxt, '-', '-', '>', 1) < 0))
5054	goto done;
5055	#ifdef DEBUG_PUSH
5056	xmlGenericError(xmlGenericErrorContext,
5057	"HPP: Parsing Comment\n");
5058	#endif
5059	htmlParseComment(ctxt);
5060	ctxt->instate = XML_PARSER_CONTENT;
5061	} else if ((cur == '<') && (next == '?')) {
5062	if ((!terminate) &&
5063	(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5064	goto done;
5065	#ifdef DEBUG_PUSH
5066	xmlGenericError(xmlGenericErrorContext,
5067	"HPP: Parsing PI\n");
5068	#endif
5069	htmlParsePI(ctxt);
5070	ctxt->instate = XML_PARSER_CONTENT;
5071	} else if ((cur == '<') && (next == '!') && (avail < 4)) {
5072	goto done;
5073	} else if ((cur == '<') && (next == '/')) {
5074	ctxt->instate = XML_PARSER_END_TAG;
5075	ctxt->checkIndex = 0;
5076	#ifdef DEBUG_PUSH
5077	xmlGenericError(xmlGenericErrorContext,
5078	"HPP: entering END_TAG\n");
5079	#endif
5080	break;
5081	} else if (cur == '<') {
5082	ctxt->instate = XML_PARSER_START_TAG;
5083	ctxt->checkIndex = 0;
5084	#ifdef DEBUG_PUSH
5085	xmlGenericError(xmlGenericErrorContext,
5086	"HPP: entering START_TAG\n");
5087	#endif
5088	break;
5089	} else if (cur == '&') {
5090	if ((!terminate) &&
5091	(htmlParseLookupSequence(ctxt, ';', 0, 0, 0) < 0))
5092	goto done;
5093	#ifdef DEBUG_PUSH
5094	xmlGenericError(xmlGenericErrorContext,
5095	"HPP: Parsing Reference\n");
5096	#endif
5097	/* TODO: check generation of subtrees if noent !!! */
5098	htmlParseReference(ctxt);
5099	} else {
5100	/*
5101	* check that the text sequence is complete
5102	* before handing out the data to the parser
5103	* to avoid problems with erroneous end of
5104	* data detection.
5105	*/
5106	if ((!terminate) &&
5107	(htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
5108	goto done;
5109	ctxt->checkIndex = 0;
5110	#ifdef DEBUG_PUSH
5111	xmlGenericError(xmlGenericErrorContext,
5112	"HPP: Parsing char data\n");
5113	#endif
5114	htmlParseCharData(ctxt);
5115	}
5116	}
5117	if (cons == ctxt->nbChars) {
5118	if (ctxt->node != NULL) {
5119	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5120	"detected an error in element content\n",
5121	NULL, NULL);
5122	}
5123	NEXT;
5124	break;
5125	}
5126
5127	break;
5128	}
5129	case XML_PARSER_END_TAG:
5130	if (avail < 2)
5131	goto done;
5132	if ((!terminate) &&
5133	(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5134	goto done;
5135	htmlParseEndTag(ctxt);
5136	if (ctxt->nameNr == 0) {
5137	ctxt->instate = XML_PARSER_EPILOG;
5138	} else {
5139	ctxt->instate = XML_PARSER_CONTENT;
5140	}
5141	ctxt->checkIndex = 0;
5142	#ifdef DEBUG_PUSH
5143	xmlGenericError(xmlGenericErrorContext,
5144	"HPP: entering CONTENT\n");
5145	#endif
5146	break;
5147	case XML_PARSER_CDATA_SECTION:
5148	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5149	"HPP: internal error, state == CDATA\n",
5150	NULL, NULL);
5151	ctxt->instate = XML_PARSER_CONTENT;
5152	ctxt->checkIndex = 0;
5153	#ifdef DEBUG_PUSH
5154	xmlGenericError(xmlGenericErrorContext,
5155	"HPP: entering CONTENT\n");
5156	#endif
5157	break;
5158	case XML_PARSER_DTD:
5159	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5160	"HPP: internal error, state == DTD\n",
5161	NULL, NULL);
5162	ctxt->instate = XML_PARSER_CONTENT;
5163	ctxt->checkIndex = 0;
5164	#ifdef DEBUG_PUSH
5165	xmlGenericError(xmlGenericErrorContext,
5166	"HPP: entering CONTENT\n");
5167	#endif
5168	break;
5169	case XML_PARSER_COMMENT:
5170	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5171	"HPP: internal error, state == COMMENT\n",
5172	NULL, NULL);
5173	ctxt->instate = XML_PARSER_CONTENT;
5174	ctxt->checkIndex = 0;
5175	#ifdef DEBUG_PUSH
5176	xmlGenericError(xmlGenericErrorContext,
5177	"HPP: entering CONTENT\n");
5178	#endif
5179	break;
5180	case XML_PARSER_PI:
5181	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5182	"HPP: internal error, state == PI\n",
5183	NULL, NULL);
5184	ctxt->instate = XML_PARSER_CONTENT;
5185	ctxt->checkIndex = 0;
5186	#ifdef DEBUG_PUSH
5187	xmlGenericError(xmlGenericErrorContext,
5188	"HPP: entering CONTENT\n");
5189	#endif
5190	break;
5191	case XML_PARSER_ENTITY_DECL:
5192	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5193	"HPP: internal error, state == ENTITY_DECL\n",
5194	NULL, NULL);
5195	ctxt->instate = XML_PARSER_CONTENT;
5196	ctxt->checkIndex = 0;
5197	#ifdef DEBUG_PUSH
5198	xmlGenericError(xmlGenericErrorContext,
5199	"HPP: entering CONTENT\n");
5200	#endif
5201	break;
5202	case XML_PARSER_ENTITY_VALUE:
5203	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5204	"HPP: internal error, state == ENTITY_VALUE\n",
5205	NULL, NULL);
5206	ctxt->instate = XML_PARSER_CONTENT;
5207	ctxt->checkIndex = 0;
5208	#ifdef DEBUG_PUSH
5209	xmlGenericError(xmlGenericErrorContext,
5210	"HPP: entering DTD\n");
5211	#endif
5212	break;
5213	case XML_PARSER_ATTRIBUTE_VALUE:
5214	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5215	"HPP: internal error, state == ATTRIBUTE_VALUE\n",
5216	NULL, NULL);
5217	ctxt->instate = XML_PARSER_START_TAG;
5218	ctxt->checkIndex = 0;
5219	#ifdef DEBUG_PUSH
5220	xmlGenericError(xmlGenericErrorContext,
5221	"HPP: entering START_TAG\n");
5222	#endif
5223	break;
5224	case XML_PARSER_SYSTEM_LITERAL:
5225	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5226	"HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
5227	NULL, NULL);
5228	ctxt->instate = XML_PARSER_CONTENT;
5229	ctxt->checkIndex = 0;
5230	#ifdef DEBUG_PUSH
5231	xmlGenericError(xmlGenericErrorContext,
5232	"HPP: entering CONTENT\n");
5233	#endif
5234	break;
5235	case XML_PARSER_IGNORE:
5236	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5237	"HPP: internal error, state == XML_PARSER_IGNORE\n",
5238	NULL, NULL);
5239	ctxt->instate = XML_PARSER_CONTENT;
5240	ctxt->checkIndex = 0;
5241	#ifdef DEBUG_PUSH
5242	xmlGenericError(xmlGenericErrorContext,
5243	"HPP: entering CONTENT\n");
5244	#endif
5245	break;
5246	case XML_PARSER_PUBLIC_LITERAL:
5247	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5248	"HPP: internal error, state == XML_PARSER_LITERAL\n",
5249	NULL, NULL);
5250	ctxt->instate = XML_PARSER_CONTENT;
5251	ctxt->checkIndex = 0;
5252	#ifdef DEBUG_PUSH
5253	xmlGenericError(xmlGenericErrorContext,
5254	"HPP: entering CONTENT\n");
5255	#endif
5256	break;
5257
5258	}
5259	}
5260	done:
5261	if ((avail == 0) && (terminate)) {
5262	htmlAutoCloseOnEnd(ctxt);
5263	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5264	/*
5265	* SAX: end of the document processing.
5266	*/
5267	ctxt->instate = XML_PARSER_EOF;
5268	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5269	ctxt->sax->endDocument(ctxt->userData);
5270	}
5271	}
5272	if ((ctxt->myDoc != NULL) &&
5273	((terminate) \|\| (ctxt->instate == XML_PARSER_EOF) \|\|
5274	(ctxt->instate == XML_PARSER_EPILOG))) {
5275	xmlDtdPtr dtd;
5276	dtd = xmlGetIntSubset(ctxt->myDoc);
5277	if (dtd == NULL)
5278	ctxt->myDoc->intSubset =
5279	xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
5280	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5281	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5282	}
5283	#ifdef DEBUG_PUSH
5284	xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
5285	#endif
5286	return(ret);
5287	}
5288
5289	/**
5290	* htmlParseChunk:
5291	* @ctxt: an HTML parser context
5292	* @chunk: an char array
5293	* @size: the size in byte of the chunk
5294	* @terminate: last chunk indicator
5295	*
5296	* Parse a Chunk of memory
5297	*
5298	* Returns zero if no error, the xmlParserErrors otherwise.
5299	*/
5300	int
5301	htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5302	int terminate) {
5303	if ((ctxt == NULL) \|\| (ctxt->input == NULL)) {
5304	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5305	"htmlParseChunk: context error\n", NULL, NULL);
5306	return(XML_ERR_INTERNAL_ERROR);
5307	}
5308	if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5309	(ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
5310	int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5311	int cur = ctxt->input->cur - ctxt->input->base;
5312	int res;
5313
5314	res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5315	if (res < 0) {
5316	ctxt->errNo = XML_PARSER_EOF;
5317	ctxt->disableSAX = 1;
5318	return (XML_PARSER_EOF);
5319	}
5320	ctxt->input->base = ctxt->input->buf->buffer->content + base;
5321	ctxt->input->cur = ctxt->input->base + cur;
5322	ctxt->input->end =
5323	&ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
5324	#ifdef DEBUG_PUSH
5325	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5326	#endif
5327
5328	#if 0
5329	if ((terminate) \|\| (ctxt->input->buf->buffer->use > 80))
5330	htmlParseTryOrFinish(ctxt, terminate);
5331	#endif
5332	} else if (ctxt->instate != XML_PARSER_EOF) {
5333	if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
5334	xmlParserInputBufferPtr in = ctxt->input->buf;
5335	if ((in->encoder != NULL) && (in->buffer != NULL) &&
5336	(in->raw != NULL)) {
5337	int nbchars;
5338
5339	nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
5340	if (nbchars < 0) {
5341	htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
5342	"encoder error\n", NULL, NULL);
5343	return(XML_ERR_INVALID_ENCODING);
5344	}
5345	}
5346	}
5347	}
5348	htmlParseTryOrFinish(ctxt, terminate);
5349	if (terminate) {
5350	if ((ctxt->instate != XML_PARSER_EOF) &&
5351	(ctxt->instate != XML_PARSER_EPILOG) &&
5352	(ctxt->instate != XML_PARSER_MISC)) {
5353	ctxt->errNo = XML_ERR_DOCUMENT_END;
5354	ctxt->wellFormed = 0;
5355	}
5356	if (ctxt->instate != XML_PARSER_EOF) {
5357	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5358	ctxt->sax->endDocument(ctxt->userData);
5359	}
5360	ctxt->instate = XML_PARSER_EOF;
5361	}
5362	return((xmlParserErrors) ctxt->errNo);
5363	}
5364
5365	/************************************************************************
5366	* *
5367	* User entry points *
5368	* *
5369	************************************************************************/
5370
5371	/**
5372	* htmlCreatePushParserCtxt:
5373	* @sax: a SAX handler
5374	* @user_data: The user data returned on SAX callbacks
5375	* @chunk: a pointer to an array of chars
5376	* @size: number of chars in the array
5377	* @filename: an optional file name or URI
5378	* @enc: an optional encoding
5379	*
5380	* Create a parser context for using the HTML parser in push mode
5381	* The value of @filename is used for fetching external entities
5382	* and error/warning reports.
5383	*
5384	* Returns the new parser context or NULL
5385	*/
5386	htmlParserCtxtPtr
5387	htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
5388	const char chunk, int size, const char filename,
5389	xmlCharEncoding enc) {
5390	htmlParserCtxtPtr ctxt;
5391	htmlParserInputPtr inputStream;
5392	xmlParserInputBufferPtr buf;
5393
5394	xmlInitParser();
5395
5396	buf = xmlAllocParserInputBuffer(enc);
5397	if (buf == NULL) return(NULL);
5398
5399	ctxt = htmlNewParserCtxt();
5400	if (ctxt == NULL) {
5401	xmlFreeParserInputBuffer(buf);
5402	return(NULL);
5403	}
5404	if(enc==XML_CHAR_ENCODING_UTF8 \|\| buf->encoder)
5405	ctxt->charset=XML_CHAR_ENCODING_UTF8;
5406	if (sax != NULL) {
5407	if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
5408	xmlFree(ctxt->sax);
5409	ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
5410	if (ctxt->sax == NULL) {
5411	xmlFree(buf);
5412	xmlFree(ctxt);
5413	return(NULL);
5414	}
5415	memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
5416	if (user_data != NULL)
5417	ctxt->userData = user_data;
5418	}
5419	if (filename == NULL) {
5420	ctxt->directory = NULL;
5421	} else {
5422	ctxt->directory = xmlParserGetDirectory(filename);
5423	}
5424
5425	inputStream = htmlNewInputStream(ctxt);
5426	if (inputStream == NULL) {
5427	xmlFreeParserCtxt(ctxt);
5428	xmlFree(buf);
5429	return(NULL);
5430	}
5431
5432	if (filename == NULL)
5433	inputStream->filename = NULL;
5434	else
5435	inputStream->filename = (char *)
5436	xmlCanonicPath((const xmlChar *) filename);
5437	inputStream->buf = buf;
5438	inputStream->base = inputStream->buf->buffer->content;
5439	inputStream->cur = inputStream->buf->buffer->content;
5440	inputStream->end =
5441	&inputStream->buf->buffer->content[inputStream->buf->buffer->use];
5442
5443	inputPush(ctxt, inputStream);
5444
5445	if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5446	(ctxt->input->buf != NULL)) {
5447	int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5448	int cur = ctxt->input->cur - ctxt->input->base;
5449
5450	xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5451
5452	ctxt->input->base = ctxt->input->buf->buffer->content + base;
5453	ctxt->input->cur = ctxt->input->base + cur;
5454	ctxt->input->end =
5455	&ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
5456	#ifdef DEBUG_PUSH
5457	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5458	#endif
5459	}
5460	ctxt->progressive = 1;
5461
5462	return(ctxt);
5463	}
5464	#endif /* LIBXML_PUSH_ENABLED */
5465
5466	/**
5467	* htmlSAXParseDoc:
5468	* @cur: a pointer to an array of xmlChar
5469	* @encoding: a free form C string describing the HTML document encoding, or NULL
5470	* @sax: the SAX handler block
5471	* @userData: if using SAX, this pointer will be provided on callbacks.
5472	*
5473	* Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
5474	* to handle parse events. If sax is NULL, fallback to the default DOM
5475	* behavior and return a tree.
5476	*
5477	* Returns the resulting document tree unless SAX is NULL or the document is
5478	* not well formed.
5479	*/
5480
5481	htmlDocPtr
5482	htmlSAXParseDoc(xmlChar cur, const char encoding, htmlSAXHandlerPtr sax, void *userData) {
5483	htmlDocPtr ret;
5484	htmlParserCtxtPtr ctxt;
5485
5486	xmlInitParser();
5487
5488	if (cur == NULL) return(NULL);
5489
5490
5491	ctxt = htmlCreateDocParserCtxt(cur, encoding);
5492	if (ctxt == NULL) return(NULL);
5493	if (sax != NULL) {
5494	if (ctxt->sax != NULL) xmlFree (ctxt->sax);
5495	ctxt->sax = sax;
5496	ctxt->userData = userData;
5497	}
5498
5499	htmlParseDocument(ctxt);
5500	ret = ctxt->myDoc;
5501	if (sax != NULL) {
5502	ctxt->sax = NULL;
5503	ctxt->userData = NULL;
5504	}
5505	htmlFreeParserCtxt(ctxt);
5506
5507	return(ret);
5508	}
5509
5510	/**
5511	* htmlParseDoc:
5512	* @cur: a pointer to an array of xmlChar
5513	* @encoding: a free form C string describing the HTML document encoding, or NULL
5514	*
5515	* parse an HTML in-memory document and build a tree.
5516	*
5517	* Returns the resulting document tree
5518	*/
5519
5520	htmlDocPtr
5521	htmlParseDoc(xmlChar cur, const char encoding) {
5522	return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
5523	}
5524
5525
5526	/**
5527	* htmlCreateFileParserCtxt:
5528	* @filename: the filename
5529	* @encoding: a free form C string describing the HTML document encoding, or NULL
5530	*
5531	* Create a parser context for a file content.
5532	* Automatic support for ZLIB/Compress compressed document is provided
5533	* by default if found at compile-time.
5534	*
5535	* Returns the new parser context or NULL
5536	*/
5537	htmlParserCtxtPtr
5538	htmlCreateFileParserCtxt(const char filename, const char encoding)
5539	{
5540	htmlParserCtxtPtr ctxt;
5541	htmlParserInputPtr inputStream;
5542	char *canonicFilename;
5543	/* htmlCharEncoding enc; */
5544	xmlChar content, content_line = (xmlChar *) "charset=";
5545
5546	if (filename == NULL)
5547	return(NULL);
5548
5549	ctxt = htmlNewParserCtxt();
5550	if (ctxt == NULL) {
5551	return(NULL);
5552	}
5553	canonicFilename = (char ) xmlCanonicPath((const xmlChar ) filename);
5554	if (canonicFilename == NULL) {
5555	#ifdef LIBXML_SAX1_ENABLED
5556	if (xmlDefaultSAXHandler.error != NULL) {
5557	xmlDefaultSAXHandler.error(NULL, "out of memory\n");
5558	}
5559	#endif
5560	xmlFreeParserCtxt(ctxt);
5561	return(NULL);
5562	}
5563
5564	inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
5565	xmlFree(canonicFilename);
5566	if (inputStream == NULL) {
5567	xmlFreeParserCtxt(ctxt);
5568	return(NULL);
5569	}
5570
5571	inputPush(ctxt, inputStream);
5572
5573	/* set encoding */
5574	if (encoding) {
5575	content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
5576	if (content) {
5577	strcpy ((char )content, (char )content_line);
5578	strcat ((char )content, (char )encoding);
5579	htmlCheckEncoding (ctxt, content);
5580	xmlFree (content);
5581	}
5582	}
5583
5584	return(ctxt);
5585	}
5586
5587	/**
5588	* htmlSAXParseFile:
5589	* @filename: the filename
5590	* @encoding: a free form C string describing the HTML document encoding, or NULL
5591	* @sax: the SAX handler block
5592	* @userData: if using SAX, this pointer will be provided on callbacks.
5593	*
5594	* parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5595	* compressed document is provided by default if found at compile-time.
5596	* It use the given SAX function block to handle the parsing callback.
5597	* If sax is NULL, fallback to the default DOM tree building routines.
5598	*
5599	* Returns the resulting document tree unless SAX is NULL or the document is
5600	* not well formed.
5601	*/
5602
5603	htmlDocPtr
5604	htmlSAXParseFile(const char filename, const char encoding, htmlSAXHandlerPtr sax,
5605	void *userData) {
5606	htmlDocPtr ret;
5607	htmlParserCtxtPtr ctxt;
5608	htmlSAXHandlerPtr oldsax = NULL;
5609
5610	xmlInitParser();
5611
5612	ctxt = htmlCreateFileParserCtxt(filename, encoding);
5613	if (ctxt == NULL) return(NULL);
5614	if (sax != NULL) {
5615	oldsax = ctxt->sax;
5616	ctxt->sax = sax;
5617	ctxt->userData = userData;
5618	}
5619
5620	htmlParseDocument(ctxt);
5621
5622	ret = ctxt->myDoc;
5623	if (sax != NULL) {
5624	ctxt->sax = oldsax;
5625	ctxt->userData = NULL;
5626	}
5627	htmlFreeParserCtxt(ctxt);
5628
5629	return(ret);
5630	}
5631
5632	/**
5633	* htmlParseFile:
5634	* @filename: the filename
5635	* @encoding: a free form C string describing the HTML document encoding, or NULL
5636	*
5637	* parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5638	* compressed document is provided by default if found at compile-time.
5639	*
5640	* Returns the resulting document tree
5641	*/
5642
5643	htmlDocPtr
5644	htmlParseFile(const char filename, const char encoding) {
5645	return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5646	}
5647
5648	/**
5649	* htmlHandleOmittedElem:
5650	* @val: int 0 or 1
5651	*
5652	* Set and return the previous value for handling HTML omitted tags.
5653	*
5654	* Returns the last value for 0 for no handling, 1 for auto insertion.
5655	*/
5656
5657	int
5658	htmlHandleOmittedElem(int val) {
5659	int old = htmlOmittedDefaultValue;
5660
5661	htmlOmittedDefaultValue = val;
5662	return(old);
5663	}
5664
5665	/**
5666	* htmlElementAllowedHere:
5667	* @parent: HTML parent element
5668	* @elt: HTML element
5669	*
5670	* Checks whether an HTML element may be a direct child of a parent element.
5671	* Note - doesn't check for deprecated elements
5672	*
5673	* Returns 1 if allowed; 0 otherwise.
5674	*/
5675	int
5676	htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
5677	const char** p ;
5678
5679	if ( ! elt \|\| ! parent \|\| ! parent->subelts )
5680	return 0 ;
5681
5682	for ( p = parent->subelts; *p; ++p )
5683	if ( !xmlStrcmp((const xmlChar )p, elt) )
5684	return 1 ;
5685
5686	return 0 ;
5687	}
5688	/**
5689	* htmlElementStatusHere:
5690	* @parent: HTML parent element
5691	* @elt: HTML element
5692	*
5693	* Checks whether an HTML element may be a direct child of a parent element.
5694	* and if so whether it is valid or deprecated.
5695	*
5696	* Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5697	*/
5698	htmlStatus
5699	htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
5700	if ( ! parent \|\| ! elt )
5701	return HTML_INVALID ;
5702	if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
5703	return HTML_INVALID ;
5704
5705	return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
5706	}
5707	/**
5708	* htmlAttrAllowed:
5709	* @elt: HTML element
5710	* @attr: HTML attribute
5711	* @legacy: whether to allow deprecated attributes
5712	*
5713	* Checks whether an attribute is valid for an element
5714	* Has full knowledge of Required and Deprecated attributes
5715	*
5716	* Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5717	*/
5718	htmlStatus
5719	htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
5720	const char** p ;
5721
5722	if ( !elt \|\| ! attr )
5723	return HTML_INVALID ;
5724
5725	if ( elt->attrs_req )
5726	for ( p = elt->attrs_req; *p; ++p)
5727	if ( !xmlStrcmp((const xmlChar)p, attr) )
5728	return HTML_REQUIRED ;
5729
5730	if ( elt->attrs_opt )
5731	for ( p = elt->attrs_opt; *p; ++p)
5732	if ( !xmlStrcmp((const xmlChar)p, attr) )
5733	return HTML_VALID ;
5734
5735	if ( legacy && elt->attrs_depr )
5736	for ( p = elt->attrs_depr; *p; ++p)
5737	if ( !xmlStrcmp((const xmlChar)p, attr) )
5738	return HTML_DEPRECATED ;
5739
5740	return HTML_INVALID ;
5741	}
5742	/**
5743	* htmlNodeStatus:
5744	* @node: an htmlNodePtr in a tree
5745	* @legacy: whether to allow deprecated elements (YES is faster here
5746	* for Element nodes)
5747	*
5748	* Checks whether the tree node is valid. Experimental (the author
5749	* only uses the HTML enhancements in a SAX parser)
5750	*
5751	* Return: for Element nodes, a return from htmlElementAllowedHere (if
5752	* legacy allowed) or htmlElementStatusHere (otherwise).
5753	* for Attribute nodes, a return from htmlAttrAllowed
5754	* for other nodes, HTML_NA (no checks performed)
5755	*/
5756	htmlStatus
5757	htmlNodeStatus(const htmlNodePtr node, int legacy) {
5758	if ( ! node )
5759	return HTML_INVALID ;
5760
5761	switch ( node->type ) {
5762	case XML_ELEMENT_NODE:
5763	return legacy
5764	? ( htmlElementAllowedHere (
5765	htmlTagLookup(node->parent->name) , node->name
5766	) ? HTML_VALID : HTML_INVALID )
5767	: htmlElementStatusHere(
5768	htmlTagLookup(node->parent->name) ,
5769	htmlTagLookup(node->name) )
5770	;
5771	case XML_ATTRIBUTE_NODE:
5772	return htmlAttrAllowed(
5773	htmlTagLookup(node->parent->name) , node->name, legacy) ;
5774	default: return HTML_NA ;
5775	}
5776	}
5777	/************************************************************************
5778	* *
5779	* New set (2.6.0) of simpler and more flexible APIs *
5780	* *
5781	************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. A variable named dict must be visible where the macro
 * is expanded. The body is wrapped in do { } while (0) so that the macro
 * behaves as a single statement, including inside if/else; callers
 * terminate it with a semicolon.
 */
#define DICT_FREE(str)						\
	do {							\
	    if ((str) && ((!dict) ||				\
	        (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
	        xmlFree((char *)(str));				\
	} while (0)
5793
5794	/**
5795	* htmlCtxtReset:
5796	* @ctxt: an HTML parser context
5797	*
5798	* Reset a parser context
5799	*/
5800	void
5801	htmlCtxtReset(htmlParserCtxtPtr ctxt)
5802	{
5803	xmlParserInputPtr input;
5804	xmlDictPtr dict;
5805
5806	if (ctxt == NULL)
5807	return;
5808
5809	xmlInitParser();
5810	dict = ctxt->dict;
5811
5812	while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
5813	xmlFreeInputStream(input);
5814	}
5815	ctxt->inputNr = 0;
5816	ctxt->input = NULL;
5817
5818	ctxt->spaceNr = 0;
5819	if (ctxt->spaceTab != NULL) {
5820	ctxt->spaceTab[0] = -1;
5821	ctxt->space = &ctxt->spaceTab[0];
5822	} else {
5823	ctxt->space = NULL;
5824	}
5825
5826
5827	ctxt->nodeNr = 0;
5828	ctxt->node = NULL;
5829
5830	ctxt->nameNr = 0;
5831	ctxt->name = NULL;
5832
5833	DICT_FREE(ctxt->version);
5834	ctxt->version = NULL;
5835	DICT_FREE(ctxt->encoding);
5836	ctxt->encoding = NULL;
5837	DICT_FREE(ctxt->directory);
5838	ctxt->directory = NULL;
5839	DICT_FREE(ctxt->extSubURI);
5840	ctxt->extSubURI = NULL;
5841	DICT_FREE(ctxt->extSubSystem);
5842	ctxt->extSubSystem = NULL;
5843	if (ctxt->myDoc != NULL)
5844	xmlFreeDoc(ctxt->myDoc);
5845	ctxt->myDoc = NULL;
5846
5847	ctxt->standalone = -1;
5848	ctxt->hasExternalSubset = 0;
5849	ctxt->hasPErefs = 0;
5850	ctxt->html = 1;
5851	ctxt->external = 0;
5852	ctxt->instate = XML_PARSER_START;
5853	ctxt->token = 0;
5854
5855	ctxt->wellFormed = 1;
5856	ctxt->nsWellFormed = 1;
5857	ctxt->valid = 1;
5858	ctxt->vctxt.userData = ctxt;
5859	ctxt->vctxt.error = xmlParserValidityError;
5860	ctxt->vctxt.warning = xmlParserValidityWarning;
5861	ctxt->record_info = 0;
5862	ctxt->nbChars = 0;
5863	ctxt->checkIndex = 0;
5864	ctxt->inSubset = 0;
5865	ctxt->errNo = XML_ERR_OK;
5866	ctxt->depth = 0;
5867	ctxt->charset = XML_CHAR_ENCODING_NONE;
5868	ctxt->catalogs = NULL;
5869	xmlInitNodeInfoSeq(&ctxt->node_seq);
5870
5871	if (ctxt->attsDefault != NULL) {
5872	xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
5873	ctxt->attsDefault = NULL;
5874	}
5875	if (ctxt->attsSpecial != NULL) {
5876	xmlHashFree(ctxt->attsSpecial, NULL);
5877	ctxt->attsSpecial = NULL;
5878	}
5879	}
5880
5881	/**
5882	* htmlCtxtUseOptions:
5883	* @ctxt: an HTML parser context
5884	* @options: a combination of htmlParserOption(s)
5885	*
5886	* Applies the options to the parser context
5887	*
5888	* Returns 0 in case of success, the set of unknown or unimplemented options
5889	* in case of error.
5890	*/
5891	int
5892	htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
5893	{
5894	if (ctxt == NULL)
5895	return(-1);
5896
5897	if (options & HTML_PARSE_NOWARNING) {
5898	ctxt->sax->warning = NULL;
5899	ctxt->vctxt.warning = NULL;
5900	options -= XML_PARSE_NOWARNING;
5901	ctxt->options \|= XML_PARSE_NOWARNING;
5902	}
5903	if (options & HTML_PARSE_NOERROR) {
5904	ctxt->sax->error = NULL;
5905	ctxt->vctxt.error = NULL;
5906	ctxt->sax->fatalError = NULL;
5907	options -= XML_PARSE_NOERROR;
5908	ctxt->options \|= XML_PARSE_NOERROR;
5909	}
5910	if (options & HTML_PARSE_PEDANTIC) {
5911	ctxt->pedantic = 1;
5912	options -= XML_PARSE_PEDANTIC;
5913	ctxt->options \|= XML_PARSE_PEDANTIC;
5914	} else
5915	ctxt->pedantic = 0;
5916	if (options & XML_PARSE_NOBLANKS) {
5917	ctxt->keepBlanks = 0;
5918	ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
5919	options -= XML_PARSE_NOBLANKS;
5920	ctxt->options \|= XML_PARSE_NOBLANKS;
5921	} else
5922	ctxt->keepBlanks = 1;
5923	if (options & HTML_PARSE_RECOVER) {
5924	ctxt->recovery = 1;
5925	options -= HTML_PARSE_RECOVER;
5926	} else
5927	ctxt->recovery = 0;
5928	if (options & HTML_PARSE_COMPACT) {
5929	ctxt->options \|= HTML_PARSE_COMPACT;
5930	options -= HTML_PARSE_COMPACT;
5931	}
5932	ctxt->dictNames = 0;
5933	return (options);
5934	}
5935
5936	/**
5937	* htmlDoRead:
5938	* @ctxt: an HTML parser context
5939	* @URL: the base URL to use for the document
5940	* @encoding: the document encoding, or NULL
5941	* @options: a combination of htmlParserOption(s)
5942	* @reuse: keep the context for reuse
5943	*
5944	* Common front-end for the htmlRead functions
5945	*
5946	* Returns the resulting document tree or NULL
5947	*/
5948	static htmlDocPtr
5949	htmlDoRead(htmlParserCtxtPtr ctxt, const char URL, const char encoding,
5950	int options, int reuse)
5951	{
5952	htmlDocPtr ret;
5953
5954	htmlCtxtUseOptions(ctxt, options);
5955	ctxt->html = 1;
5956	if (encoding != NULL) {
5957	xmlCharEncodingHandlerPtr hdlr;
5958
5959	hdlr = xmlFindCharEncodingHandler(encoding);
5960	if (hdlr != NULL)
5961	xmlSwitchToEncoding(ctxt, hdlr);
5962	}
5963	if ((URL != NULL) && (ctxt->input != NULL) &&
5964	(ctxt->input->filename == NULL))
5965	ctxt->input->filename = (char ) xmlStrdup((const xmlChar ) URL);
5966	htmlParseDocument(ctxt);
5967	ret = ctxt->myDoc;
5968	ctxt->myDoc = NULL;
5969	if (!reuse) {
5970	if ((ctxt->dictNames) &&
5971	(ret != NULL) &&
5972	(ret->dict == ctxt->dict))
5973	ctxt->dict = NULL;
5974	xmlFreeParserCtxt(ctxt);
5975	}
5976	return (ret);
5977	}
5978
5979	/**
5980	* htmlReadDoc:
5981	* @cur: a pointer to a zero terminated string
5982	* @URL: the base URL to use for the document
5983	* @encoding: the document encoding, or NULL
5984	* @options: a combination of htmlParserOption(s)
5985	*
5986	* parse an XML in-memory document and build a tree.
5987	*
5988	* Returns the resulting document tree
5989	*/
5990	htmlDocPtr
5991	htmlReadDoc(const xmlChar * cur, const char URL, const char encoding, int options)
5992	{
5993	htmlParserCtxtPtr ctxt;
5994
5995	if (cur == NULL)
5996	return (NULL);
5997
5998	xmlInitParser();
5999	ctxt = htmlCreateDocParserCtxt(cur, NULL);
6000	if (ctxt == NULL)
6001	return (NULL);
6002	return (htmlDoRead(ctxt, URL, encoding, options, 0));
6003	}
6004
6005	/**
6006	* htmlReadFile:
6007	* @filename: a file or URL
6008	* @encoding: the document encoding, or NULL
6009	* @options: a combination of htmlParserOption(s)
6010	*
6011	* parse an XML file from the filesystem or the network.
6012	*
6013	* Returns the resulting document tree
6014	*/
6015	htmlDocPtr
6016	htmlReadFile(const char filename, const char encoding, int options)
6017	{
6018	htmlParserCtxtPtr ctxt;
6019
6020	xmlInitParser();
6021	ctxt = htmlCreateFileParserCtxt(filename, encoding);
6022	if (ctxt == NULL)
6023	return (NULL);
6024	return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6025	}
6026
6027	/**
6028	* htmlReadMemory:
6029	* @buffer: a pointer to a char array
6030	* @size: the size of the array
6031	* @URL: the base URL to use for the document
6032	* @encoding: the document encoding, or NULL
6033	* @options: a combination of htmlParserOption(s)
6034	*
6035	* parse an XML in-memory document and build a tree.
6036	*
6037	* Returns the resulting document tree
6038	*/
6039	htmlDocPtr
6040	htmlReadMemory(const char buffer, int size, const char URL, const char *encoding, int options)
6041	{
6042	htmlParserCtxtPtr ctxt;
6043
6044	xmlInitParser();
6045	ctxt = xmlCreateMemoryParserCtxt(buffer, size);
6046	if (ctxt == NULL)
6047	return (NULL);
6048	htmlDefaultSAXHandlerInit();
6049	if (ctxt->sax != NULL)
6050	memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
6051	return (htmlDoRead(ctxt, URL, encoding, options, 0));
6052	}
6053
6054	/**
6055	* htmlReadFd:
6056	* @fd: an open file descriptor
6057	* @URL: the base URL to use for the document
6058	* @encoding: the document encoding, or NULL
6059	* @options: a combination of htmlParserOption(s)
6060	*
6061	* parse an XML from a file descriptor and build a tree.
6062	*
6063	* Returns the resulting document tree
6064	*/
6065	htmlDocPtr
6066	htmlReadFd(int fd, const char URL, const char encoding, int options)
6067	{
6068	htmlParserCtxtPtr ctxt;
6069	xmlParserInputBufferPtr input;
6070	xmlParserInputPtr stream;
6071
6072	if (fd < 0)
6073	return (NULL);
6074
6075	xmlInitParser();
6076	input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6077	if (input == NULL)
6078	return (NULL);
6079	ctxt = xmlNewParserCtxt();
6080	if (ctxt == NULL) {
6081	xmlFreeParserInputBuffer(input);
6082	return (NULL);
6083	}
6084	stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6085	if (stream == NULL) {
6086	xmlFreeParserInputBuffer(input);
6087	xmlFreeParserCtxt(ctxt);
6088	return (NULL);
6089	}
6090	inputPush(ctxt, stream);
6091	return (htmlDoRead(ctxt, URL, encoding, options, 0));
6092	}
6093
6094	/**
6095	* htmlReadIO:
6096	* @ioread: an I/O read function
6097	* @ioclose: an I/O close function
6098	* @ioctx: an I/O handler
6099	* @URL: the base URL to use for the document
6100	* @encoding: the document encoding, or NULL
6101	* @options: a combination of htmlParserOption(s)
6102	*
6103	* parse an HTML document from I/O functions and source and build a tree.
6104	*
6105	* Returns the resulting document tree
6106	*/
6107	htmlDocPtr
6108	htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6109	void ioctx, const char URL, const char *encoding, int options)
6110	{
6111	htmlParserCtxtPtr ctxt;
6112	xmlParserInputBufferPtr input;
6113	xmlParserInputPtr stream;
6114
6115	if (ioread == NULL)
6116	return (NULL);
6117	xmlInitParser();
6118
6119	input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6120	XML_CHAR_ENCODING_NONE);
6121	if (input == NULL)
6122	return (NULL);
6123	ctxt = htmlNewParserCtxt();
6124	if (ctxt == NULL) {
6125	xmlFreeParserInputBuffer(input);
6126	return (NULL);
6127	}
6128	stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6129	if (stream == NULL) {
6130	xmlFreeParserInputBuffer(input);
6131	xmlFreeParserCtxt(ctxt);
6132	return (NULL);
6133	}
6134	inputPush(ctxt, stream);
6135	return (htmlDoRead(ctxt, URL, encoding, options, 0));
6136	}
6137
6138	/**
6139	* htmlCtxtReadDoc:
6140	* @ctxt: an HTML parser context
6141	* @cur: a pointer to a zero terminated string
6142	* @URL: the base URL to use for the document
6143	* @encoding: the document encoding, or NULL
6144	* @options: a combination of htmlParserOption(s)
6145	*
6146	* parse an XML in-memory document and build a tree.
6147	* This reuses the existing @ctxt parser context
6148	*
6149	* Returns the resulting document tree
6150	*/
6151	htmlDocPtr
6152	htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
6153	const char URL, const char encoding, int options)
6154	{
6155	xmlParserInputPtr stream;
6156
6157	if (cur == NULL)
6158	return (NULL);
6159	if (ctxt == NULL)
6160	return (NULL);
6161
6162	htmlCtxtReset(ctxt);
6163
6164	stream = xmlNewStringInputStream(ctxt, cur);
6165	if (stream == NULL) {
6166	return (NULL);
6167	}
6168	inputPush(ctxt, stream);
6169	return (htmlDoRead(ctxt, URL, encoding, options, 1));
6170	}
6171
6172	/**
6173	* htmlCtxtReadFile:
6174	* @ctxt: an HTML parser context
6175	* @filename: a file or URL
6176	* @encoding: the document encoding, or NULL
6177	* @options: a combination of htmlParserOption(s)
6178	*
6179	* parse an XML file from the filesystem or the network.
6180	* This reuses the existing @ctxt parser context
6181	*
6182	* Returns the resulting document tree
6183	*/
6184	htmlDocPtr
6185	htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6186	const char *encoding, int options)
6187	{
6188	xmlParserInputPtr stream;
6189
6190	if (filename == NULL)
6191	return (NULL);
6192	if (ctxt == NULL)
6193	return (NULL);
6194
6195	htmlCtxtReset(ctxt);
6196
6197	stream = xmlLoadExternalEntity(filename, NULL, ctxt);
6198	if (stream == NULL) {
6199	return (NULL);
6200	}
6201	inputPush(ctxt, stream);
6202	return (htmlDoRead(ctxt, NULL, encoding, options, 1));
6203	}
6204
6205	/**
6206	* htmlCtxtReadMemory:
6207	* @ctxt: an HTML parser context
6208	* @buffer: a pointer to a char array
6209	* @size: the size of the array
6210	* @URL: the base URL to use for the document
6211	* @encoding: the document encoding, or NULL
6212	* @options: a combination of htmlParserOption(s)
6213	*
6214	* parse an XML in-memory document and build a tree.
6215	* This reuses the existing @ctxt parser context
6216	*
6217	* Returns the resulting document tree
6218	*/
6219	htmlDocPtr
6220	htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
6221	const char URL, const char encoding, int options)
6222	{
6223	xmlParserInputBufferPtr input;
6224	xmlParserInputPtr stream;
6225
6226	if (ctxt == NULL)
6227	return (NULL);
6228	if (buffer == NULL)
6229	return (NULL);
6230
6231	htmlCtxtReset(ctxt);
6232
6233	input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
6234	if (input == NULL) {
6235	return(NULL);
6236	}
6237
6238	stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6239	if (stream == NULL) {
6240	xmlFreeParserInputBuffer(input);
6241	return(NULL);
6242	}
6243
6244	inputPush(ctxt, stream);
6245	return (htmlDoRead(ctxt, URL, encoding, options, 1));
6246	}
6247
6248	/**
6249	* htmlCtxtReadFd:
6250	* @ctxt: an HTML parser context
6251	* @fd: an open file descriptor
6252	* @URL: the base URL to use for the document
6253	* @encoding: the document encoding, or NULL
6254	* @options: a combination of htmlParserOption(s)
6255	*
6256	* parse an XML from a file descriptor and build a tree.
6257	* This reuses the existing @ctxt parser context
6258	*
6259	* Returns the resulting document tree
6260	*/
6261	htmlDocPtr
6262	htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
6263	const char URL, const char encoding, int options)
6264	{
6265	xmlParserInputBufferPtr input;
6266	xmlParserInputPtr stream;
6267
6268	if (fd < 0)
6269	return (NULL);
6270	if (ctxt == NULL)
6271	return (NULL);
6272
6273	htmlCtxtReset(ctxt);
6274
6275
6276	input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6277	if (input == NULL)
6278	return (NULL);
6279	stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6280	if (stream == NULL) {
6281	xmlFreeParserInputBuffer(input);
6282	return (NULL);
6283	}
6284	inputPush(ctxt, stream);
6285	return (htmlDoRead(ctxt, URL, encoding, options, 1));
6286	}
6287
6288	/**
6289	* htmlCtxtReadIO:
6290	* @ctxt: an HTML parser context
6291	* @ioread: an I/O read function
6292	* @ioclose: an I/O close function
6293	* @ioctx: an I/O handler
6294	* @URL: the base URL to use for the document
6295	* @encoding: the document encoding, or NULL
6296	* @options: a combination of htmlParserOption(s)
6297	*
6298	* parse an HTML document from I/O functions and source and build a tree.
6299	* This reuses the existing @ctxt parser context
6300	*
6301	* Returns the resulting document tree
6302	*/
6303	htmlDocPtr
6304	htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
6305	xmlInputCloseCallback ioclose, void *ioctx,
6306	const char *URL,
6307	const char *encoding, int options)
6308	{
6309	xmlParserInputBufferPtr input;
6310	xmlParserInputPtr stream;
6311
6312	if (ioread == NULL)
6313	return (NULL);
6314	if (ctxt == NULL)
6315	return (NULL);
6316
6317	htmlCtxtReset(ctxt);
6318
6319	input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6320	XML_CHAR_ENCODING_NONE);
6321	if (input == NULL)
6322	return (NULL);
6323	stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6324	if (stream == NULL) {
6325	xmlFreeParserInputBuffer(input);
6326	return (NULL);
6327	}
6328	inputPush(ctxt, stream);
6329	return (htmlDoRead(ctxt, URL, encoding, options, 1));
6330	}
6331
6332	#define bottom_HTMLparser
6333	#include "elfgcchack.h"
6334	#endif /* LIBXML_HTML_ENABLED */

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/libs/libxml2-2.6.31/HTMLparser.c@ 49482

Download in other formats: