HTMLparser.c@ 8234

Last change on this file since 8234 was 6076, checked in by vboxsync, 17 years ago
Merged dmik/s2 branch (r25959:26751) to the trunk.
Property svn:eol-style set to `native` Property svn:keywords set to `Date Revision Author Id`
File size: 183.3 KB

Line
1	/*
2	* HTMLparser.c : an HTML 4.0 non-verifying parser
3	*
4	* See Copyright for the status of this software.
5	*
6	* daniel@veillard.com
7	*/
8
9	#define IN_LIBXML
10	#include "libxml.h"
11	#ifdef LIBXML_HTML_ENABLED
12
13	#include <string.h>
14	#ifdef HAVE_CTYPE_H
15	#include <ctype.h>
16	#endif
17	#ifdef HAVE_STDLIB_H
18	#include <stdlib.h>
19	#endif
20	#ifdef HAVE_SYS_STAT_H
21	#include <sys/stat.h>
22	#endif
23	#ifdef HAVE_FCNTL_H
24	#include <fcntl.h>
25	#endif
26	#ifdef HAVE_UNISTD_H
27	#include <unistd.h>
28	#endif
29	#ifdef HAVE_ZLIB_H
30	#include <zlib.h>
31	#endif
32
33	#include <libxml/xmlmemory.h>
34	#include <libxml/tree.h>
35	#include <libxml/parser.h>
36	#include <libxml/parserInternals.h>
37	#include <libxml/xmlerror.h>
38	#include <libxml/HTMLparser.h>
39	#include <libxml/HTMLtree.h>
40	#include <libxml/entities.h>
41	#include <libxml/encoding.h>
42	#include <libxml/valid.h>
43	#include <libxml/xmlIO.h>
44	#include <libxml/globals.h>
45	#include <libxml/uri.h>
46
/*
 * Size constants for this file. Their consumers are outside this part of
 * the file; judging by the names they bound element-name length and the
 * two working-buffer sizes -- TODO confirm against the users.
 */
#define HTML_MAX_NAMELEN 1000
#define HTML_PARSER_BIG_BUFFER_SIZE 1000
#define HTML_PARSER_BUFFER_SIZE 100

/* Compile-time debugging switches, disabled by default. */
/* #define DEBUG */
/* #define DEBUG_PUSH */

/*
 * File-local flag, initialised to 1. Its readers are not visible in this
 * part of the file.
 */
static int htmlOmittedDefaultValue = 1;
55
56	xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
57	xmlChar end, xmlChar end2, xmlChar end3);
58	static void htmlParseComment(htmlParserCtxtPtr ctxt);
59
60	/************************************************************************
61	* *
62	* Some factorized error routines *
63	* *
64	************************************************************************/
65
66	/**
67	* htmlErrMemory:
68	* @ctxt: an HTML parser context
69	* @extra: extra informations
70	*
71	* Handle a redefinition of attribute error
72	*/
73	static void
74	htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
75	{
76	if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
77	(ctxt->instate == XML_PARSER_EOF))
78	return;
79	if (ctxt != NULL) {
80	ctxt->errNo = XML_ERR_NO_MEMORY;
81	ctxt->instate = XML_PARSER_EOF;
82	ctxt->disableSAX = 1;
83	}
84	if (extra)
85	__xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
86	XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
87	NULL, NULL, 0, 0,
88	"Memory allocation failed : %s\n", extra);
89	else
90	__xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
91	XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
92	NULL, NULL, 0, 0, "Memory allocation failed\n");
93	}
94
95	/**
96	* htmlParseErr:
97	* @ctxt: an HTML parser context
98	* @error: the error number
99	* @msg: the error message
100	* @str1: string infor
101	* @str2: string infor
102	*
103	* Handle a fatal parser error, i.e. violating Well-Formedness constraints
104	*/
105	static void
106	htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
107	const char msg, const xmlChar str1, const xmlChar *str2)
108	{
109	if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
110	(ctxt->instate == XML_PARSER_EOF))
111	return;
112	if (ctxt != NULL)
113	ctxt->errNo = error;
114	__xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
115	XML_ERR_ERROR, NULL, 0,
116	(const char ) str1, (const char ) str2,
117	NULL, 0, 0,
118	msg, str1, str2);
119	if (ctxt != NULL)
120	ctxt->wellFormed = 0;
121	}
122
123	/**
124	* htmlParseErrInt:
125	* @ctxt: an HTML parser context
126	* @error: the error number
127	* @msg: the error message
128	* @val: integer info
129	*
130	* Handle a fatal parser error, i.e. violating Well-Formedness constraints
131	*/
132	static void
133	htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
134	const char *msg, int val)
135	{
136	if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
137	(ctxt->instate == XML_PARSER_EOF))
138	return;
139	if (ctxt != NULL)
140	ctxt->errNo = error;
141	__xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
142	XML_ERR_ERROR, NULL, 0, NULL, NULL,
143	NULL, val, 0, msg, val);
144	if (ctxt != NULL)
145	ctxt->wellFormed = 0;
146	}
147
148	/************************************************************************
149	* *
150	* Parser stacks related functions and macros *
151	* *
152	************************************************************************/
153
154	/**
155	* htmlnamePush:
156	* @ctxt: an HTML parser context
157	* @value: the element name
158	*
159	* Pushes a new element name on top of the name stack
160	*
161	* Returns 0 in case of error, the index in the stack otherwise
162	*/
163	static int
164	htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
165	{
166	if (ctxt->nameNr >= ctxt->nameMax) {
167	ctxt->nameMax *= 2;
168	ctxt->nameTab = (const xmlChar * *)
169	xmlRealloc((xmlChar * *)ctxt->nameTab,
170	ctxt->nameMax *
171	sizeof(ctxt->nameTab[0]));
172	if (ctxt->nameTab == NULL) {
173	htmlErrMemory(ctxt, NULL);
174	return (0);
175	}
176	}
177	ctxt->nameTab[ctxt->nameNr] = value;
178	ctxt->name = value;
179	return (ctxt->nameNr++);
180	}
181	/**
182	* htmlnamePop:
183	* @ctxt: an HTML parser context
184	*
185	* Pops the top element name from the name stack
186	*
187	* Returns the name just removed
188	*/
189	static const xmlChar *
190	htmlnamePop(htmlParserCtxtPtr ctxt)
191	{
192	const xmlChar *ret;
193
194	if (ctxt->nameNr <= 0)
195	return (NULL);
196	ctxt->nameNr--;
197	if (ctxt->nameNr < 0)
198	return (NULL);
199	if (ctxt->nameNr > 0)
200	ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
201	else
202	ctxt->name = NULL;
203	ret = ctxt->nameTab[ctxt->nameNr];
204	ctxt->nameTab[ctxt->nameNr] = NULL;
205	return (ret);
206	}
207
208	/*
209	* Macros for accessing the content. Those should be used only by the parser,
210	* and not exported.
211	*
212	* Dirty macros, i.e. one need to make assumption on the context to use them
213	*
214	* CUR_PTR return the current pointer to the xmlChar to be parsed.
215	* CUR returns the current xmlChar value, i.e. a 8 bit value if compiled
216	* in ISO-Latin or UTF-8, and the current 16 bit value if compiled
217	* in UNICODE mode. This should be used internally by the parser
218	* only to compare to ASCII values otherwise it would break when
219	* running with UTF-8 encoding.
220	* NXT(n) returns the n'th next xmlChar. Same as CUR is should be used only
221	* to compare on ASCII based substring.
222	* UPP(n) returns the n'th next xmlChar converted to uppercase. Same as CUR
223	* it should be used only to compare on ASCII based substring.
224	* SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
225	* strings without newlines within the parser.
226	*
227	* Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
228	*
229	* CURRENT Returns the current char value, with the full decoding of
230	* UTF-8 if we are using this mode. It returns an int.
231	* NEXT Skip to the next character, this does the proper decoding
232	* in UTF-8 mode. It also pop-up unfinished entities on the fly.
233	* NEXTL(l) Skip the current unicode character of l xmlChars long.
234	* COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
235	*/
236
/*
 * All of the macros below expect a variable named `ctxt` (the parser
 * context) to be in scope at the point of use.
 */

/* Current input byte, converted to uppercase. */
#define UPPER (toupper(*ctxt->input->cur))

/* Advance by val bytes, updating the char counter and the column. */
#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val)

/* The val'th byte after the current position (no bounds check). */
#define NXT(val) ctxt->input->cur[(val)]

/* Same as NXT(val), converted to uppercase. */
#define UPP(val) (toupper(ctxt->input->cur[(val)]))

/* Pointer to the current input position. */
#define CUR_PTR ctxt->input->cur

/*
 * SHRINK and GROW expand to a bare `if` statement: use them only as a
 * full statement (`SHRINK;`), never directly in front of an `else`.
 */
#define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
                   (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
        xmlParserInputShrink(ctxt->input)

#define GROW if ((ctxt->progressive == 0) && \
                 (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \
        xmlParserInputGrow(ctxt->input, INPUT_CHUNK)

/* Current input byte as an int. */
#define CURRENT ((int) (*ctxt->input->cur))

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Inported from XML */

/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
#define CUR ((int) (*ctxt->input->cur))
#define NEXT xmlNextChar(ctxt)

/* Current input byte, or -1 while a token is pending in ctxt->token. */
#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))

/*
 * Step over the current character, which is l bytes long: update the
 * line/column position, clear any pending token and count one char.
 */
#define NEXTL(l) do { \
    if (*(ctxt->input->cur) == '\n') { \
        ctxt->input->line++; ctxt->input->col = 1; \
    } else ctxt->input->col++; \
    ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++; \
  } while (0)

/************
    \
    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
 ************/

/* l must be an lvalue: it receives the byte length of the char read. */
#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)

/*
 * Append char v (l bytes long) to buffer b at index i and advance i.
 * Expands to a bare if/else: use it only as a full statement.
 */
#define COPY_BUF(l,b,i,v) \
    if (l == 1) b[i++] = (xmlChar) v; \
    else i += xmlCopyChar(l,&b[i],v)
289
290	/**
291	* htmlCurrentChar:
292	* @ctxt: the HTML parser context
293	* @len: pointer to the length of the char read
294	*
295	* The current char value, if using UTF-8 this may actually span multiple
296	* bytes in the input buffer. Implement the end of line normalization:
297	* 2.11 End-of-Line Handling
298	* If the encoding is unspecified, in the case we find an ISO-Latin-1
299	* char, then the encoding converter is plugged in automatically.
300	*
301	* Returns the current char value and its length
302	*/
303
304	static int
305	htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
306	if (ctxt->instate == XML_PARSER_EOF)
307	return(0);
308
309	if (ctxt->token != 0) {
310	*len = 0;
311	return(ctxt->token);
312	}
313	if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
314	/*
315	* We are supposed to handle UTF8, check it's valid
316	* From rfc2044: encoding of the Unicode values on UTF-8:
317	*
318	* UCS-4 range (hex.) UTF-8 octet sequence (binary)
319	* 0000 0000-0000 007F 0xxxxxxx
320	* 0000 0080-0000 07FF 110xxxxx 10xxxxxx
321	* 0000 0800-0000 FFFF 1110xxxx 10xxxxxx 10xxxxxx
322	*
323	* Check for the 0x110000 limit too
324	*/
325	const unsigned char *cur = ctxt->input->cur;
326	unsigned char c;
327	unsigned int val;
328
329	c = *cur;
330	if (c & 0x80) {
331	if (cur[1] == 0)
332	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
333	if ((cur[1] & 0xc0) != 0x80)
334	goto encoding_error;
335	if ((c & 0xe0) == 0xe0) {
336
337	if (cur[2] == 0)
338	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
339	if ((cur[2] & 0xc0) != 0x80)
340	goto encoding_error;
341	if ((c & 0xf0) == 0xf0) {
342	if (cur[3] == 0)
343	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
344	if (((c & 0xf8) != 0xf0) \|\|
345	((cur[3] & 0xc0) != 0x80))
346	goto encoding_error;
347	/* 4-byte code */
348	*len = 4;
349	val = (cur[0] & 0x7) << 18;
350	val \|= (cur[1] & 0x3f) << 12;
351	val \|= (cur[2] & 0x3f) << 6;
352	val \|= cur[3] & 0x3f;
353	} else {
354	/* 3-byte code */
355	*len = 3;
356	val = (cur[0] & 0xf) << 12;
357	val \|= (cur[1] & 0x3f) << 6;
358	val \|= cur[2] & 0x3f;
359	}
360	} else {
361	/* 2-byte code */
362	*len = 2;
363	val = (cur[0] & 0x1f) << 6;
364	val \|= cur[1] & 0x3f;
365	}
366	if (!IS_CHAR(val)) {
367	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
368	"Char 0x%X out of allowed range\n", val);
369	}
370	return(val);
371	} else {
372	/* 1-byte code */
373	*len = 1;
374	return((int) *ctxt->input->cur);
375	}
376	}
377	/*
378	* Assume it's a fixed length encoding (1) with
379	* a compatible encoding for the ASCII set, since
380	* XML constructs only use < 128 chars
381	*/
382	*len = 1;
383	if ((int) *ctxt->input->cur < 0x80)
384	return((int) *ctxt->input->cur);
385
386	/*
387	* Humm this is bad, do an automatic flow conversion
388	*/
389	xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
390	ctxt->charset = XML_CHAR_ENCODING_UTF8;
391	return(xmlCurrentChar(ctxt, len));
392
393	encoding_error:
394	/*
395	* If we detect an UTF8 error that probably mean that the
396	* input encoding didn't get properly advertized in the
397	* declaration header. Report the error and switch the encoding
398	* to ISO-Latin-1 (if you don't like this policy, just declare the
399	* encoding !)
400	*/
401	{
402	char buffer[150];
403
404	if (ctxt->input->end - ctxt->input->cur >= 4) {
405	snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
406	ctxt->input->cur[0], ctxt->input->cur[1],
407	ctxt->input->cur[2], ctxt->input->cur[3]);
408	} else {
409	snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
410	}
411	htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
412	"Input is not proper UTF-8, indicate encoding !\n",
413	BAD_CAST buffer, NULL);
414	}
415
416	ctxt->charset = XML_CHAR_ENCODING_8859_1;
417	*len = 1;
418	return((int) *ctxt->input->cur);
419	}
420
421	/**
422	* htmlSkipBlankChars:
423	* @ctxt: the HTML parser context
424	*
425	* skip all blanks character found at that point in the input streams.
426	*
427	* Returns the number of space chars skipped
428	*/
429
430	static int
431	htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
432	int res = 0;
433
434	while (IS_BLANK_CH(*(ctxt->input->cur))) {
435	if ((*ctxt->input->cur == 0) &&
436	(xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
437	xmlPopInput(ctxt);
438	} else {
439	if (*(ctxt->input->cur) == '\n') {
440	ctxt->input->line++; ctxt->input->col = 1;
441	} else ctxt->input->col++;
442	ctxt->input->cur++;
443	ctxt->nbChars++;
444	if (*ctxt->input->cur == 0)
445	xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
446	}
447	res++;
448	}
449	return(res);
450	}
451
452
453
454	/************************************************************************
455	* *
456	* The list of HTML elements and their properties *
457	* *
458	************************************************************************/
459
460	/*
461	* Start Tag: 1 means the start tag can be ommited
462	* End Tag: 1 means the end tag can be ommited
463	* 2 means it's forbidden (empty elements)
464	* 3 means the tag is stylistic and should be closed easily
465	* Depr: this element is deprecated
466	* DTD: 1 means that this element is valid only in the Loose DTD
467	* 2 means that this element is valid only in the Frameset DTD
468	*
469	* Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
470	, subElements , impliedsubelt , Attributes, userdata
471	*/
472
/* Definitions and a couple of vars for HTML Elements */

/*
 * Each group macro expands to a comma-separated list of element names and
 * each NB_* macro is the number of names in the matching group.
 *
 * Groups must be joined with explicit commas: adjacent string literals are
 * concatenated by the compiler, so a missing comma silently merges two
 * names into one bogus entry (e.g. "menu" "pre" becomes "menupre").
 * PCDATA and MODIFIER are empty, so no comma may follow them.
 */
#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 16
#define INLINE PCDATA FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define PCDATA
#define NB_PCDATA 0
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define MODIFIER
#define NB_MODIFIER 0
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL


/* NULL-terminated name lists built from the groups above */
static const char* const html_flow[] = { FLOW, NULL } ;
static const char* const html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* const html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata
506
507
/* ... and for HTML Attributes */

/*
 * Attribute-name groups, same scheme as the element groups: each macro
 * expands to a comma-separated list of names and NB_* is the number of
 * names in the matching group.
 */
#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1

/* NULL-terminated attribute lists built from the groups above */
static const char* const html_attrs[] = { ATTRS, NULL } ;
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* const core_attrs[] = { COREATTRS, NULL } ;
static const char* const i18n_attrs[] = { I18N, NULL } ;
527
528
529	/* Other declarations that should go inline ... */
530	static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
531	"href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
532	"tabindex", "onfocus", "onblur", NULL } ;
533	static const char* const target_attr[] = { "target", NULL } ;
534	static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
535	static const char* const alt_attr[] = { "alt", NULL } ;
536	static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
537	static const char* const href_attrs[] = { "href", NULL } ;
538	static const char* const clear_attrs[] = { "clear", NULL } ;
539	static const char* const inline_p[] = { INLINE, "p", NULL } ;
540
541	static const char* const flow_param[] = { FLOW, "param", NULL } ;
542	static const char* const applet_attrs[] = { COREATTRS , "codebase",
543	"archive", "alt", "name", "height", "width", "align",
544	"hspace", "vspace", NULL } ;
545	static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
546	"tabindex", "accesskey", "onfocus", "onblur", NULL } ;
547	static const char* const basefont_attrs[] =
548	{ "id", "size", "color", "face", NULL } ;
549	static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
550	static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
551	static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
552	static const char* const body_depr[] = { "background", "bgcolor", "text",
553	"link", "vlink", "alink", NULL } ;
554	static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
555	"disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
556
557
558	static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
559	static const char* const col_elt[] = { "col", NULL } ;
560	static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
561	static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
562	static const char* const dl_contents[] = { "dt", "dd", NULL } ;
563	static const char* const compact_attr[] = { "compact", NULL } ;
564	static const char* const label_attr[] = { "label", NULL } ;
565	static const char* const fieldset_contents[] = { FLOW, "legend" } ;
566	static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
567	static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
568	static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
569	static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
570	static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
571	static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
572	static const char* const head_attrs[] = { I18N, "profile", NULL } ;
573	static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
574	static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
575	static const char* const version_attr[] = { "version", NULL } ;
576	static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
577	static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
578	static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
579	static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
580	static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
581	static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
582	static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
583	static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
584	static const char* const align_attr[] = { "align", NULL } ;
585	static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
586	static const char* const map_contents[] = { BLOCK, "area", NULL } ;
587	static const char* const name_attr[] = { "name", NULL } ;
588	static const char* const action_attr[] = { "action", NULL } ;
589	static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
590	static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
591	static const char* const content_attr[] = { "content", NULL } ;
592	static const char* const type_attr[] = { "type", NULL } ;
593	static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
594	static const char* const object_contents[] = { FLOW, "param", NULL } ;
595	static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
596	static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
597	static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
598	static const char* const option_elt[] = { "option", NULL } ;
599	static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
600	static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
601	static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
602	static const char* const width_attr[] = { "width", NULL } ;
603	static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
604	static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
605	static const char* const language_attr[] = { "language", NULL } ;
606	static const char* const select_content[] = { "optgroup", "option", NULL } ;
607	static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
608	static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
609	static const char* const table_attrs[] = { ATTRS "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
610	static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
611	static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
612	static const char* const tr_elt[] = { "tr", NULL } ;
613	static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
614	static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
615	static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
616	static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
617	static const char* const tr_contents[] = { "th", "td", NULL } ;
618	static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
619	static const char* const li_elt[] = { "li", NULL } ;
620	static const char* const ul_depr[] = { "type", "compact", NULL} ;
621	static const char* const dir_attr[] = { "dir", NULL} ;
622
623	#define DECL (const char**)
624
625	static const htmlElemDesc
626	html40ElementTable[] = {
627	{ "a", 0, 0, 0, 0, 0, 0, 1, "anchor ",
628	DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
629	},
630	{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
631	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
632	},
633	{ "acronym", 0, 0, 0, 0, 0, 0, 1, "",
634	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
635	},
636	{ "address", 0, 0, 0, 0, 0, 0, 0, "information on author ",
637	DECL inline_p , NULL , DECL html_attrs, NULL, NULL
638	},
639	{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
640	DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
641	},
642	{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
643	EMPTY , NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
644	},
645	{ "b", 0, 3, 0, 0, 0, 0, 1, "bold text style",
646	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
647	},
648	{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
649	EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
650	},
651	{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
652	EMPTY , NULL , NULL, DECL basefont_attrs, NULL
653	},
654	{ "bdo", 0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
655	DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
656	},
657	{ "big", 0, 3, 0, 0, 0, 0, 1, "large text style",
658	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
659	},
660	{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
661	DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
662	},
663	{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
664	DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
665	},
666	{ "br", 0, 2, 2, 1, 0, 0, 1, "forced line break ",
667	EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
668	},
669	{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
670	DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
671	},
672	{ "caption", 0, 0, 0, 0, 0, 0, 0, "table caption ",
673	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
674	},
675	{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
676	DECL html_flow , NULL , NULL, DECL html_attrs, NULL
677	},
678	{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
679	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
680	},
681	{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
682	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
683	},
684	{ "col", 0, 2, 2, 1, 0, 0, 0, "table column ",
685	EMPTY , NULL , DECL col_attrs , NULL, NULL
686	},
687	{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
688	DECL col_elt , "col" , DECL col_attrs , NULL, NULL
689	},
690	{ "dd", 0, 1, 0, 0, 0, 0, 0, "definition description ",
691	DECL html_flow , NULL , DECL html_attrs, NULL, NULL
692	},
693	{ "del", 0, 0, 0, 0, 0, 0, 2, "deleted text ",
694	DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
695	},
696	{ "dfn", 0, 0, 0, 0, 0, 0, 1, "instance definition",
697	DECL html_inline , NULL , DECL html_attrs, NULL, NULL
698	},
699	{ "dir", 0, 0, 0, 0, 1, 1, 0, "directory list",
700	DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
701	},
702	{ "div", 0, 0, 0, 0, 0, 0, 0, "generic language/style container",
703	DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
704	},
705	{ "dl", 0, 0, 0, 0, 0, 0, 0, "definition list ",
706	DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
707	},
708	{ "dt", 0, 1, 0, 0, 0, 0, 0, "definition term ",
709	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
710	},
711	{ "em", 0, 3, 0, 0, 0, 0, 1, "emphasis",
712	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
713	},
714	{ "embed", 0, 1, 2, 0, 1, 1, 1, "generic embedded object ",
715	EMPTY, NULL, DECL embed_attrs, NULL, NULL
716	},
717	{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
718	DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
719	},
720	{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
721	DECL html_inline, NULL, NULL, DECL font_attrs, NULL
722	},
723	{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
724	DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
725	},
726	{ "frame", 0, 2, 2, 1, 0, 2, 0, "subwindow " ,
727	EMPTY, NULL, NULL, DECL frame_attrs, NULL
728	},
729	{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
730	DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
731	},
732	{ "h1", 0, 0, 0, 0, 0, 0, 0, "heading ",
733	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
734	},
735	{ "h2", 0, 0, 0, 0, 0, 0, 0, "heading ",
736	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
737	},
738	{ "h3", 0, 0, 0, 0, 0, 0, 0, "heading ",
739	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
740	},
741	{ "h4", 0, 0, 0, 0, 0, 0, 0, "heading ",
742	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
743	},
744	{ "h5", 0, 0, 0, 0, 0, 0, 0, "heading ",
745	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
746	},
747	{ "h6", 0, 0, 0, 0, 0, 0, 0, "heading ",
748	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
749	},
750	{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
751	DECL head_contents, NULL, DECL head_attrs, NULL, NULL
752	},
753	{ "hr", 0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
754	EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
755	},
756	{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
757	DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
758	},
759	{ "i", 0, 3, 0, 0, 0, 0, 1, "italic text style",
760	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
761	},
762	{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
763	DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
764	},
765	{ "img", 0, 2, 2, 1, 0, 0, 1, "embedded image ",
766	EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
767	},
768	{ "input", 0, 2, 2, 1, 0, 0, 1, "form control ",
769	EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
770	},
771	{ "ins", 0, 0, 0, 0, 0, 0, 2, "inserted text",
772	DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
773	},
774	{ "isindex", 0, 2, 2, 1, 1, 1, 0, "single line prompt ",
775	EMPTY, NULL, NULL, DECL prompt_attrs, NULL
776	},
777	{ "kbd", 0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
778	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
779	},
780	{ "label", 0, 0, 0, 0, 0, 0, 1, "form field label text ",
781	DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
782	},
783	{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
784	DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
785	},
786	{ "li", 0, 1, 1, 0, 0, 0, 0, "list item ",
787	DECL html_flow, NULL, DECL html_attrs, NULL, NULL
788	},
789	{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
790	EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
791	},
792	{ "map", 0, 0, 0, 0, 0, 0, 2, "client-side image map ",
793	DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
794	},
795	{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
796	DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
797	},
798	{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
799	EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
800	},
801	{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
802	DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
803	},
804	{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
805	DECL html_flow, "div", DECL html_attrs, NULL, NULL
806	},
807	{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
808	DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
809	},
810	{ "ol", 0, 0, 0, 0, 0, 0, 0, "ordered list ",
811	DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
812	},
813	{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
814	DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
815	},
816	{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
817	DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
818	},
819	{ "p", 0, 1, 0, 0, 0, 0, 0, "paragraph ",
820	DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
821	},
822	{ "param", 0, 2, 2, 1, 0, 0, 0, "named property value ",
823	EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
824	},
825	{ "pre", 0, 0, 0, 0, 0, 0, 0, "preformatted text ",
826	DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
827	},
828	{ "q", 0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
829	DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
830	},
831	{ "s", 0, 3, 0, 0, 1, 1, 1, "strike-through text style",
832	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
833	},
834	{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
835	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
836	},
837	{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
838	DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
839	},
840	{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
841	DECL select_content, NULL, DECL select_attrs, NULL, NULL
842	},
843	{ "small", 0, 3, 0, 0, 0, 0, 1, "small text style",
844	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
845	},
846	{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
847	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
848	},
849	{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
850	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
851	},
852	{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
853	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
854	},
855	{ "style", 0, 0, 0, 0, 0, 0, 0, "style info ",
856	DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
857	},
858	{ "sub", 0, 3, 0, 0, 0, 0, 1, "subscript",
859	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
860	},
861	{ "sup", 0, 3, 0, 0, 0, 0, 1, "superscript ",
862	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
863	},
864	{ "table", 0, 0, 0, 0, 0, 0, 0, "",
865	DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
866	},
867	{ "tbody", 1, 0, 0, 0, 0, 0, 0, "table body ",
868	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
869	},
870	{ "td", 0, 0, 0, 0, 0, 0, 0, "table data cell",
871	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
872	},
873	{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
874	DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
875	},
876	{ "tfoot", 0, 1, 0, 0, 0, 0, 0, "table footer ",
877	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
878	},
879	{ "th", 0, 1, 0, 0, 0, 0, 0, "table header cell",
880	DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
881	},
882	{ "thead", 0, 1, 0, 0, 0, 0, 0, "table header ",
883	DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
884	},
885	{ "title", 0, 0, 0, 0, 0, 0, 0, "document title ",
886	DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
887	},
888	{ "tr", 0, 0, 0, 0, 0, 0, 0, "table row ",
889	DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
890	},
891	{ "tt", 0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
892	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
893	},
894	{ "u", 0, 3, 0, 0, 1, 1, 1, "underlined text style",
895	DECL html_inline, NULL, NULL, DECL html_attrs, NULL
896	},
897	{ "ul", 0, 0, 0, 0, 0, 0, 0, "unordered list ",
898	DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
899	},
900	{ "var", 0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
901	DECL html_inline, NULL, DECL html_attrs, NULL, NULL
902	}
903	};
904
/*
 * Start tags that imply the end of the current element.
 *
 * The table is a sequence of NULL-terminated groups.  The first string
 * of a group is a start tag name; the remaining strings of that group
 * are the names of open elements which this start tag closes
 * implicitly.  One additional NULL after the last group marks the end
 * of the whole table.
 */
static const char * const htmlStartClose[] = {
    "form",
        "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
        "dl", "ul", "ol", "menu", "dir", "address", "pre",
        "listing", "xmp", "head", NULL,
    "head",
        "p", NULL,
    "title",
        "p", NULL,
    "body",
        "head", "style", "link", "title", "p", NULL,
    "frameset",
        "head", "style", "link", "title", "p", NULL,
    "li",
        "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
        "pre", "listing", "xmp", "head", "li", NULL,
    "hr",
        "p", "head", NULL,
    "h1",
        "p", "head", NULL,
    "h2",
        "p", "head", NULL,
    "h3",
        "p", "head", NULL,
    "h4",
        "p", "head", NULL,
    "h5",
        "p", "head", NULL,
    "h6",
        "p", "head", NULL,
    "dir",
        "p", "head", NULL,
    "address",
        "p", "head", "ul", NULL,
    "pre",
        "p", "head", "ul", NULL,
    "listing",
        "p", "head", NULL,
    "xmp",
        "p", "head", NULL,
    "blockquote",
        "p", "head", NULL,
    "dl",
        "p", "dt", "menu", "dir", "address", "pre", "listing",
        "xmp", "head", NULL,
    "dt",
        "p", "menu", "dir", "address", "pre", "listing", "xmp",
        "head", "dd", NULL,
    "dd",
        "p", "menu", "dir", "address", "pre", "listing", "xmp",
        "head", "dt", NULL,
    "ul",
        "p", "head", "ol", "menu", "dir", "address", "pre",
        "listing", "xmp", NULL,
    "ol",
        "p", "head", "ul", NULL,
    "menu",
        "p", "head", "ul", NULL,
    "p",
        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", NULL,
    "div",
        "p", "head", NULL,
    "noscript",
        "p", "head", NULL,
    "center",
        "font", "b", "i", "p", "head", NULL,
    "a",
        "a", NULL,
    "caption",
        "p", NULL,
    "colgroup",
        "caption", "colgroup", "col", "p", NULL,
    "col",
        "caption", "col", "p", NULL,
    "table",
        "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
        "listing", "xmp", "a", NULL,
    "th",
        "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "td",
        "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
    "tr",
        "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
    "thead",
        "caption", "col", "colgroup", NULL,
    "tfoot",
        "th", "td", "tr", "caption", "col", "colgroup", "thead",
        "tbody", "p", NULL,
    "tbody",
        "th", "td", "tr", "caption", "col", "colgroup", "thead",
        "tfoot", "tbody", "p", NULL,
    "optgroup",
        "option", NULL,
    "option",
        "option", NULL,
    "fieldset",
        "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
        "pre", "listing", "xmp", "a", NULL,
    NULL    /* end of table */
};
965
/*
 * NULL-terminated list of the HTML elements which are not supposed to
 * hold CDATA content directly; a p element is implied when character
 * data shows up inside one of them.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = { "html", "head", NULL };
978
/*
 * The list of HTML attributes which are of content %Script;
 * (the HTML 4 intrinsic event attributes).
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 */
static const char *const htmlScriptAttributes[] = {
    "onclick",
    "ondblclick",
    "onmousedown",
    "onmouseup",
    "onmouseover",
    "onmousemove",
    "onmouseout",
    "onkeypress",
    "onkeydown",
    "onkeyup",
    "onload",
    "onunload",
    "onfocus",
    "onblur",
    "onsubmit",
    "onreset",    /* was misspelled "onrest", so it never matched */
    "onchange",
    "onselect"
};
1004
/*
 * Priorities used by the HTML parser to cope with broken pages.
 * Giving elements different priorities lets the parser decide what
 * to do with a stray end tag: an end tag may only close elements
 * whose priority is lower than or equal to its own.
 */

/* One (element name, end-tag priority) association. */
typedef struct {
    const char *name;
    int priority;
} elementPriority;

/*
 * Lookup table, terminated by an entry whose name is NULL.  The
 * priority stored in that terminating entry is the one used for
 * every element not listed explicitly.
 */
static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },  { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },  { "tbody", 180 },  { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },  { "body",  200 },
    { "html",  220 },
    { NULL,    100 }   /* Default priority */
};
1032
/*
 * Index over htmlStartClose: each used slot points at the first string
 * of one group; unused slots are NULL.  Filled by htmlInitAutoClose().
 */
static const char **htmlStartCloseIndex[100];

/* Non-zero once htmlStartCloseIndex has been filled in. */
static int htmlStartCloseIndexinitialized = 0;
1035
1036	/************************************************************************
1037	* *
1038	* functions to handle HTML specific data *
1039	* *
1040	************************************************************************/
1041
1042	/**
1043	* htmlInitAutoClose:
1044	*
1045	* Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1046	* This is not reentrant. Call xmlInitParser() once before processing in
1047	* case of use in multithreaded programs.
1048	*/
1049	void
1050	htmlInitAutoClose(void) {
1051	int indx, i = 0;
1052
1053	if (htmlStartCloseIndexinitialized) return;
1054
1055	for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1056	indx = 0;
1057	while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
1058	htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
1059	while (htmlStartClose[i] != NULL) i++;
1060	i++;
1061	}
1062	htmlStartCloseIndexinitialized = 1;
1063	}
1064
1065	/**
1066	* htmlTagLookup:
1067	* @tag: The tag name in lowercase
1068	*
1069	* Lookup the HTML tag in the ElementTable
1070	*
1071	* Returns the related htmlElemDescPtr or NULL if not found.
1072	*/
1073	const htmlElemDesc *
1074	htmlTagLookup(const xmlChar *tag) {
1075	unsigned int i;
1076
1077	for (i = 0; i < (sizeof(html40ElementTable) /
1078	sizeof(html40ElementTable[0]));i++) {
1079	if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
1080	return((htmlElemDescPtr) &html40ElementTable[i]);
1081	}
1082	return(NULL);
1083	}
1084
1085	/**
1086	* htmlGetEndPriority:
1087	* @name: The name of the element to look up the priority for.
1088	*
1089	* Return value: The "endtag" priority.
1090	**/
1091	static int
1092	htmlGetEndPriority (const xmlChar *name) {
1093	int i = 0;
1094
1095	while ((htmlEndPriority[i].name != NULL) &&
1096	(!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1097	i++;
1098
1099	return(htmlEndPriority[i].priority);
1100	}
1101
1102
1103	/**
1104	* htmlCheckAutoClose:
1105	* @newtag: The new tag name
1106	* @oldtag: The old tag name
1107	*
1108	* Checks whether the new tag is one of the registered valid tags for
1109	* closing old.
1110	* Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1111	*
1112	* Returns 0 if no, 1 if yes.
1113	*/
1114	static int
1115	htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1116	{
1117	int i, indx;
1118	const char **closed = NULL;
1119
1120	if (htmlStartCloseIndexinitialized == 0)
1121	htmlInitAutoClose();
1122
1123	/* inefficient, but not a big deal */
1124	for (indx = 0; indx < 100; indx++) {
1125	closed = htmlStartCloseIndex[indx];
1126	if (closed == NULL)
1127	return (0);
1128	if (xmlStrEqual(BAD_CAST * closed, newtag))
1129	break;
1130	}
1131
1132	i = closed - htmlStartClose;
1133	i++;
1134	while (htmlStartClose[i] != NULL) {
1135	if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
1136	return (1);
1137	}
1138	i++;
1139	}
1140	return (0);
1141	}
1142
1143	/**
1144	* htmlAutoCloseOnClose:
1145	* @ctxt: an HTML parser context
1146	* @newtag: The new tag name
1147	* @force: force the tag closure
1148	*
1149	* The HTML DTD allows an ending tag to implicitly close other tags.
1150	*/
1151	static void
1152	htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1153	{
1154	const htmlElemDesc *info;
1155	int i, priority;
1156
1157	priority = htmlGetEndPriority(newtag);
1158
1159	for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1160
1161	if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1162	break;
1163	/*
1164	* A missplaced endtag can only close elements with lower
1165	* or equal priority, so if we find an element with higher
1166	* priority before we find an element with
1167	* matching name, we just ignore this endtag
1168	*/
1169	if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1170	return;
1171	}
1172	if (i < 0)
1173	return;
1174
1175	while (!xmlStrEqual(newtag, ctxt->name)) {
1176	info = htmlTagLookup(ctxt->name);
1177	if ((info != NULL) && (info->endTag == 3)) {
1178	htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1179	"Opening and ending tag mismatch: %s and %s\n",
1180	newtag, ctxt->name);
1181	}
1182	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1183	ctxt->sax->endElement(ctxt->userData, ctxt->name);
1184	htmlnamePop(ctxt);
1185	}
1186	}
1187
1188	/**
1189	* htmlAutoCloseOnEnd:
1190	* @ctxt: an HTML parser context
1191	*
1192	* Close all remaining tags at the end of the stream
1193	*/
1194	static void
1195	htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1196	{
1197	int i;
1198
1199	if (ctxt->nameNr == 0)
1200	return;
1201	for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1202	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1203	ctxt->sax->endElement(ctxt->userData, ctxt->name);
1204	htmlnamePop(ctxt);
1205	}
1206	}
1207
1208	/**
1209	* htmlAutoClose:
1210	* @ctxt: an HTML parser context
1211	* @newtag: The new tag name or NULL
1212	*
1213	* The HTML DTD allows a tag to implicitly close other tags.
1214	* The list is kept in htmlStartClose array. This function is
1215	* called when a new tag has been detected and generates the
1216	* appropriates closes if possible/needed.
1217	* If newtag is NULL this mean we are at the end of the resource
1218	* and we should check
1219	*/
1220	static void
1221	htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1222	{
1223	while ((newtag != NULL) && (ctxt->name != NULL) &&
1224	(htmlCheckAutoClose(newtag, ctxt->name))) {
1225	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1226	ctxt->sax->endElement(ctxt->userData, ctxt->name);
1227	htmlnamePop(ctxt);
1228	}
1229	if (newtag == NULL) {
1230	htmlAutoCloseOnEnd(ctxt);
1231	return;
1232	}
1233	while ((newtag == NULL) && (ctxt->name != NULL) &&
1234	((xmlStrEqual(ctxt->name, BAD_CAST "head")) \|\|
1235	(xmlStrEqual(ctxt->name, BAD_CAST "body")) \|\|
1236	(xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
1237	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1238	ctxt->sax->endElement(ctxt->userData, ctxt->name);
1239	htmlnamePop(ctxt);
1240	}
1241	}
1242
1243	/**
1244	* htmlAutoCloseTag:
1245	* @doc: the HTML document
1246	* @name: The tag name
1247	* @elem: the HTML element
1248	*
1249	* The HTML DTD allows a tag to implicitly close other tags.
1250	* The list is kept in htmlStartClose array. This function checks
1251	* if the element or one of it's children would autoclose the
1252	* given tag.
1253	*
1254	* Returns 1 if autoclose, 0 otherwise
1255	*/
1256	int
1257	htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1258	htmlNodePtr child;
1259
1260	if (elem == NULL) return(1);
1261	if (xmlStrEqual(name, elem->name)) return(0);
1262	if (htmlCheckAutoClose(elem->name, name)) return(1);
1263	child = elem->children;
1264	while (child != NULL) {
1265	if (htmlAutoCloseTag(doc, name, child)) return(1);
1266	child = child->next;
1267	}
1268	return(0);
1269	}
1270
1271	/**
1272	* htmlIsAutoClosed:
1273	* @doc: the HTML document
1274	* @elem: the HTML element
1275	*
1276	* The HTML DTD allows a tag to implicitly close other tags.
1277	* The list is kept in htmlStartClose array. This function checks
1278	* if a tag is autoclosed by one of it's child
1279	*
1280	* Returns 1 if autoclosed, 0 otherwise
1281	*/
1282	int
1283	htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1284	htmlNodePtr child;
1285
1286	if (elem == NULL) return(1);
1287	child = elem->children;
1288	while (child != NULL) {
1289	if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1290	child = child->next;
1291	}
1292	return(0);
1293	}
1294
1295	/**
1296	* htmlCheckImplied:
1297	* @ctxt: an HTML parser context
1298	* @newtag: The new tag name
1299	*
1300	* The HTML DTD allows a tag to exists only implicitly
1301	* called when a new tag has been detected and generates the
1302	* appropriates implicit tags if missing
1303	*/
1304	static void
1305	htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1306	if (!htmlOmittedDefaultValue)
1307	return;
1308	if (xmlStrEqual(newtag, BAD_CAST"html"))
1309	return;
1310	if (ctxt->nameNr <= 0) {
1311	htmlnamePush(ctxt, BAD_CAST"html");
1312	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1313	ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1314	}
1315	if ((xmlStrEqual(newtag, BAD_CAST"body")) \|\| (xmlStrEqual(newtag, BAD_CAST"head")))
1316	return;
1317	if ((ctxt->nameNr <= 1) &&
1318	((xmlStrEqual(newtag, BAD_CAST"script")) \|\|
1319	(xmlStrEqual(newtag, BAD_CAST"style")) \|\|
1320	(xmlStrEqual(newtag, BAD_CAST"meta")) \|\|
1321	(xmlStrEqual(newtag, BAD_CAST"link")) \|\|
1322	(xmlStrEqual(newtag, BAD_CAST"title")) \|\|
1323	(xmlStrEqual(newtag, BAD_CAST"base")))) {
1324	/*
1325	* dropped OBJECT ... i you put it first BODY will be
1326	* assumed !
1327	*/
1328	htmlnamePush(ctxt, BAD_CAST"head");
1329	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1330	ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1331	} else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1332	(!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1333	(!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1334	int i;
1335	for (i = 0;i < ctxt->nameNr;i++) {
1336	if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1337	return;
1338	}
1339	if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1340	return;
1341	}
1342	}
1343
1344	htmlnamePush(ctxt, BAD_CAST"body");
1345	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1346	ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1347	}
1348	}
1349
1350	/**
1351	* htmlCheckParagraph
1352	* @ctxt: an HTML parser context
1353	*
1354	* Check whether a p element need to be implied before inserting
1355	* characters in the current element.
1356	*
1357	* Returns 1 if a paragraph has been inserted, 0 if not and -1
1358	* in case of error.
1359	*/
1360
1361	static int
1362	htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1363	const xmlChar *tag;
1364	int i;
1365
1366	if (ctxt == NULL)
1367	return(-1);
1368	tag = ctxt->name;
1369	if (tag == NULL) {
1370	htmlAutoClose(ctxt, BAD_CAST"p");
1371	htmlCheckImplied(ctxt, BAD_CAST"p");
1372	htmlnamePush(ctxt, BAD_CAST"p");
1373	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1374	ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1375	return(1);
1376	}
1377	if (!htmlOmittedDefaultValue)
1378	return(0);
1379	for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1380	if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1381	htmlAutoClose(ctxt, BAD_CAST"p");
1382	htmlCheckImplied(ctxt, BAD_CAST"p");
1383	htmlnamePush(ctxt, BAD_CAST"p");
1384	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1385	ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1386	return(1);
1387	}
1388	}
1389	return(0);
1390	}
1391
1392	/**
1393	* htmlIsScriptAttribute:
1394	* @name: an attribute name
1395	*
1396	* Check if an attribute is of content type Script
1397	*
1398	* Returns 1 is the attribute is a script 0 otherwise
1399	*/
1400	int
1401	htmlIsScriptAttribute(const xmlChar *name) {
1402	unsigned int i;
1403
1404	if (name == NULL)
1405	return(0);
1406	/*
1407	* all script attributes start with 'on'
1408	*/
1409	if ((name[0] != 'o') \|\| (name[1] != 'n'))
1410	return(0);
1411	for (i = 0;
1412	i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1413	i++) {
1414	if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1415	return(1);
1416	}
1417	return(0);
1418	}
1419
1420	/************************************************************************
1421	* *
1422	* The list of HTML predefined entities *
1423	* *
1424	************************************************************************/
1425
1426
1427	static const htmlEntityDesc html40EntitiesTable[] = {
1428	/*
1429	* the 4 absolute ones, plus apostrophe.
1430	*/
1431	{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1432	{ 38, "amp", "ampersand, U+0026 ISOnum" },
1433	{ 39, "apos", "single quote" },
1434	{ 60, "lt", "less-than sign, U+003C ISOnum" },
1435	{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1436
1437	/*
1438	* A bunch still in the 128-255 range
1439	* Replacing them depend really on the charset used.
1440	*/
1441	{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1442	{ 161, "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1443	{ 162, "cent", "cent sign, U+00A2 ISOnum" },
1444	{ 163, "pound","pound sign, U+00A3 ISOnum" },
1445	{ 164, "curren","currency sign, U+00A4 ISOnum" },
1446	{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
1447	{ 166, "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1448	{ 167, "sect", "section sign, U+00A7 ISOnum" },
1449	{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1450	{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
1451	{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1452	{ 171, "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1453	{ 172, "not", "not sign, U+00AC ISOnum" },
1454	{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1455	{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
1456	{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1457	{ 176, "deg", "degree sign, U+00B0 ISOnum" },
1458	{ 177, "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1459	{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1460	{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1461	{ 180, "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1462	{ 181, "micro","micro sign, U+00B5 ISOnum" },
1463	{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1464	{ 183, "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1465	{ 184, "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1466	{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1467	{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1468	{ 187, "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1469	{ 188, "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1470	{ 189, "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1471	{ 190, "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1472	{ 191, "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1473	{ 192, "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1474	{ 193, "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1475	{ 194, "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1476	{ 195, "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1477	{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1478	{ 197, "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1479	{ 198, "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1480	{ 199, "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1481	{ 200, "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1482	{ 201, "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1483	{ 202, "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1484	{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1485	{ 204, "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1486	{ 205, "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1487	{ 206, "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1488	{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1489	{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
1490	{ 209, "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1491	{ 210, "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1492	{ 211, "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1493	{ 212, "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1494	{ 213, "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1495	{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1496	{ 215, "times","multiplication sign, U+00D7 ISOnum" },
1497	{ 216, "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1498	{ 217, "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1499	{ 218, "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1500	{ 219, "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1501	{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1502	{ 221, "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1503	{ 222, "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1504	{ 223, "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1505	{ 224, "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1506	{ 225, "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1507	{ 226, "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1508	{ 227, "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1509	{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1510	{ 229, "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1511	{ 230, "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1512	{ 231, "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1513	{ 232, "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1514	{ 233, "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1515	{ 234, "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1516	{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1517	{ 236, "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1518	{ 237, "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1519	{ 238, "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1520	{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1521	{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
1522	{ 241, "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1523	{ 242, "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1524	{ 243, "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1525	{ 244, "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1526	{ 245, "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1527	{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1528	{ 247, "divide","division sign, U+00F7 ISOnum" },
1529	{ 248, "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1530	{ 249, "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1531	{ 250, "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1532	{ 251, "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1533	{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1534	{ 253, "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1535	{ 254, "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1536	{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1537
1538	{ 338, "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1539	{ 339, "oelig","latin small ligature oe, U+0153 ISOlat2" },
1540	{ 352, "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1541	{ 353, "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1542	{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1543
1544	/*
1545	* Anything below should really be kept as entities references
1546	*/
1547	{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1548
1549	{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1550	{ 732, "tilde","small tilde, U+02DC ISOdia" },
1551
1552	{ 913, "Alpha","greek capital letter alpha, U+0391" },
1553	{ 914, "Beta", "greek capital letter beta, U+0392" },
1554	{ 915, "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1555	{ 916, "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1556	{ 917, "Epsilon","greek capital letter epsilon, U+0395" },
1557	{ 918, "Zeta", "greek capital letter zeta, U+0396" },
1558	{ 919, "Eta", "greek capital letter eta, U+0397" },
1559	{ 920, "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1560	{ 921, "Iota", "greek capital letter iota, U+0399" },
1561	{ 922, "Kappa","greek capital letter kappa, U+039A" },
1562	{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
1563	{ 924, "Mu", "greek capital letter mu, U+039C" },
1564	{ 925, "Nu", "greek capital letter nu, U+039D" },
1565	{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1566	{ 927, "Omicron","greek capital letter omicron, U+039F" },
1567	{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1568	{ 929, "Rho", "greek capital letter rho, U+03A1" },
1569	{ 931, "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1570	{ 932, "Tau", "greek capital letter tau, U+03A4" },
1571	{ 933, "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1572	{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
1573	{ 935, "Chi", "greek capital letter chi, U+03A7" },
1574	{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
1575	{ 937, "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1576
1577	{ 945, "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1578	{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1579	{ 947, "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1580	{ 948, "delta","greek small letter delta, U+03B4 ISOgrk3" },
1581	{ 949, "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1582	{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1583	{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
1584	{ 952, "theta","greek small letter theta, U+03B8 ISOgrk3" },
1585	{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1586	{ 954, "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1587	{ 955, "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1588	{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
1589	{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
1590	{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
1591	{ 959, "omicron","greek small letter omicron, U+03BF NEW" },
1592	{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1593	{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
1594	{ 962, "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1595	{ 963, "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1596	{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
1597	{ 965, "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1598	{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
1599	{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
1600	{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
1601	{ 969, "omega","greek small letter omega, U+03C9 ISOgrk3" },
1602	{ 977, "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1603	{ 978, "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1604	{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },
1605
1606	{ 8194, "ensp", "en space, U+2002 ISOpub" },
1607	{ 8195, "emsp", "em space, U+2003 ISOpub" },
1608	{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1609	{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1610	{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
1611	{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
1612	{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
1613	{ 8211, "ndash","en dash, U+2013 ISOpub" },
1614	{ 8212, "mdash","em dash, U+2014 ISOpub" },
1615	{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1616	{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1617	{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1618	{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1619	{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1620	{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1621	{ 8224, "dagger","dagger, U+2020 ISOpub" },
1622	{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1623
1624	{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1625	{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1626
1627	{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1628
1629	{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1630	{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1631
1632	{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1633	{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1634
1635	{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1636	{ 8260, "frasl","fraction slash, U+2044 NEW" },
1637
1638	{ 8364, "euro", "euro sign, U+20AC NEW" },
1639
1640	{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1641	{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1642	{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1643	{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1644	{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1645	{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1646	{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1647	{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1648	{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1649	{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1650	{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1651	{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1652	{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1653	{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1654	{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1655	{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1656
1657	{ 8704, "forall","for all, U+2200 ISOtech" },
1658	{ 8706, "part", "partial differential, U+2202 ISOtech" },
1659	{ 8707, "exist","there exists, U+2203 ISOtech" },
1660	{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1661	{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1662	{ 8712, "isin", "element of, U+2208 ISOtech" },
1663	{ 8713, "notin","not an element of, U+2209 ISOtech" },
1664	{ 8715, "ni", "contains as member, U+220B ISOtech" },
1665	{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
1666	{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
1667	{ 8722, "minus","minus sign, U+2212 ISOtech" },
1668	{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1669	{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1670	{ 8733, "prop", "proportional to, U+221D ISOtech" },
1671	{ 8734, "infin","infinity, U+221E ISOtech" },
1672	{ 8736, "ang", "angle, U+2220 ISOamso" },
1673	{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
1674	{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1675	{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
1676	{ 8746, "cup", "union = cup, U+222A ISOtech" },
1677	{ 8747, "int", "integral, U+222B ISOtech" },
1678	{ 8756, "there4","therefore, U+2234 ISOtech" },
1679	{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
1680	{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1681	{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1682	{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1683	{ 8801, "equiv","identical to, U+2261 ISOtech" },
1684	{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1685	{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1686	{ 8834, "sub", "subset of, U+2282 ISOtech" },
1687	{ 8835, "sup", "superset of, U+2283 ISOtech" },
1688	{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1689	{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1690	{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1691	{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1692	{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1693	{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1694	{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1695	{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1696	{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1697	{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1698	{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1699	{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1700	{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1701	{ 9674, "loz", "lozenge, U+25CA ISOpub" },
1702
1703	{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1704	{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1705	{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1706	{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1707
1708	};
1709
1710	/************************************************************************
1711	* *
1712	* Commodity functions to handle entities *
1713	* *
1714	************************************************************************/
1715
/*
 * growBuffer:
 * @buffer: name of an xmlChar * variable holding a heap buffer
 *
 * Double the capacity of @buffer. The macro is not self-contained; the
 * expansion site must provide:
 *   - an integer variable named <buffer>_size holding the current capacity
 *     (it is doubled in place),
 *   - a variable named ctxt, passed to htmlErrMemory() on failure,
 *   - an enclosing function returning a pointer: when the reallocation
 *     fails the old buffer is freed and the macro executes return(NULL).
 * Pointers into the old buffer are stale after the expansion; callers
 * save an index and recompute them.
 */
#define growBuffer(buffer) {						\
    xmlChar *tmp;							\
    buffer##_size *= 2;							\
    tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
    if (tmp == NULL) {							\
	htmlErrMemory(ctxt, "growing buffer\n");			\
	xmlFree(buffer);						\
	return(NULL);							\
    }									\
    buffer = tmp;							\
}
1730
1731	/**
1732	* htmlEntityLookup:
1733	* @name: the entity name
1734	*
1735	* Lookup the given entity in EntitiesTable
1736	*
1737	* TODO: the linear scan is really ugly, an hash table is really needed.
1738	*
1739	* Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1740	*/
1741	const htmlEntityDesc *
1742	htmlEntityLookup(const xmlChar *name) {
1743	unsigned int i;
1744
1745	for (i = 0;i < (sizeof(html40EntitiesTable)/
1746	sizeof(html40EntitiesTable[0]));i++) {
1747	if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1748	return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1749	}
1750	}
1751	return(NULL);
1752	}
1753
1754	/**
1755	* htmlEntityValueLookup:
1756	* @value: the entity's unicode value
1757	*
1758	* Lookup the given entity in EntitiesTable
1759	*
1760	* TODO: the linear scan is really ugly, an hash table is really needed.
1761	*
1762	* Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1763	*/
1764	const htmlEntityDesc *
1765	htmlEntityValueLookup(unsigned int value) {
1766	unsigned int i;
1767
1768	for (i = 0;i < (sizeof(html40EntitiesTable)/
1769	sizeof(html40EntitiesTable[0]));i++) {
1770	if (html40EntitiesTable[i].value >= value) {
1771	if (html40EntitiesTable[i].value > value)
1772	break;
1773	return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1774	}
1775	}
1776	return(NULL);
1777	}
1778
1779	/**
1780	* UTF8ToHtml:
1781	* @out: a pointer to an array of bytes to store the result
1782	* @outlen: the length of @out
1783	* @in: a pointer to an array of UTF-8 chars
1784	* @inlen: the length of @in
1785	*
1786	* Take a block of UTF-8 chars in and try to convert it to an ASCII
1787	* plus HTML entities block of chars out.
1788	*
1789	* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1790	* The value of @inlen after return is the number of octets consumed
1791	* as the return value is positive, else unpredictable.
1792	* The value of @outlen after return is the number of octets consumed.
1793	*/
1794	int
1795	UTF8ToHtml(unsigned char* out, int *outlen,
1796	const unsigned char* in, int *inlen) {
1797	const unsigned char* processed = in;
1798	const unsigned char* outend;
1799	const unsigned char* outstart = out;
1800	const unsigned char* instart = in;
1801	const unsigned char* inend;
1802	unsigned int c, d;
1803	int trailing;
1804
1805	if ((out == NULL) \|\| (outlen == NULL) \|\| (inlen == NULL)) return(-1);
1806	if (in == NULL) {
1807	/*
1808	* initialization nothing to do
1809	*/
1810	*outlen = 0;
1811	*inlen = 0;
1812	return(0);
1813	}
1814	inend = in + (*inlen);
1815	outend = out + (*outlen);
1816	while (in < inend) {
1817	d = *in++;
1818	if (d < 0x80) { c= d; trailing= 0; }
1819	else if (d < 0xC0) {
1820	/* trailing byte in leading position */
1821	*outlen = out - outstart;
1822	*inlen = processed - instart;
1823	return(-2);
1824	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1825	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1826	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1827	else {
1828	/* no chance for this in Ascii */
1829	*outlen = out - outstart;
1830	*inlen = processed - instart;
1831	return(-2);
1832	}
1833
1834	if (inend - in < trailing) {
1835	break;
1836	}
1837
1838	for ( ; trailing; trailing--) {
1839	if ((in >= inend) \|\| (((d= *in++) & 0xC0) != 0x80))
1840	break;
1841	c <<= 6;
1842	c \|= d & 0x3F;
1843	}
1844
1845	/* assertion: c is a single UTF-4 value */
1846	if (c < 0x80) {
1847	if (out + 1 >= outend)
1848	break;
1849	*out++ = c;
1850	} else {
1851	int len;
1852	const htmlEntityDesc * ent;
1853	const char *cp;
1854	char nbuf[16];
1855
1856	/*
1857	* Try to lookup a predefined HTML entity for it
1858	*/
1859
1860	ent = htmlEntityValueLookup(c);
1861	if (ent == NULL) {
1862	snprintf(nbuf, sizeof(nbuf), "#%u", c);
1863	cp = nbuf;
1864	}
1865	else
1866	cp = ent->name;
1867	len = strlen(cp);
1868	if (out + 2 + len >= outend)
1869	break;
1870	*out++ = '&';
1871	memcpy(out, cp, len);
1872	out += len;
1873	*out++ = ';';
1874	}
1875	processed = in;
1876	}
1877	*outlen = out - outstart;
1878	*inlen = processed - instart;
1879	return(0);
1880	}
1881
1882	/**
1883	* htmlEncodeEntities:
1884	* @out: a pointer to an array of bytes to store the result
1885	* @outlen: the length of @out
1886	* @in: a pointer to an array of UTF-8 chars
1887	* @inlen: the length of @in
1888	* @quoteChar: the quote character to escape (' or ") or zero.
1889	*
1890	* Take a block of UTF-8 chars in and try to convert it to an ASCII
1891	* plus HTML entities block of chars out.
1892	*
1893	* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1894	* The value of @inlen after return is the number of octets consumed
1895	* as the return value is positive, else unpredictable.
1896	* The value of @outlen after return is the number of octets consumed.
1897	*/
1898	int
1899	htmlEncodeEntities(unsigned char* out, int *outlen,
1900	const unsigned char* in, int *inlen, int quoteChar) {
1901	const unsigned char* processed = in;
1902	const unsigned char* outend;
1903	const unsigned char* outstart = out;
1904	const unsigned char* instart = in;
1905	const unsigned char* inend;
1906	unsigned int c, d;
1907	int trailing;
1908
1909	if ((out == NULL) \|\| (outlen == NULL) \|\| (inlen == NULL) \|\| (in == NULL))
1910	return(-1);
1911	outend = out + (*outlen);
1912	inend = in + (*inlen);
1913	while (in < inend) {
1914	d = *in++;
1915	if (d < 0x80) { c= d; trailing= 0; }
1916	else if (d < 0xC0) {
1917	/* trailing byte in leading position */
1918	*outlen = out - outstart;
1919	*inlen = processed - instart;
1920	return(-2);
1921	} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
1922	else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
1923	else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
1924	else {
1925	/* no chance for this in Ascii */
1926	*outlen = out - outstart;
1927	*inlen = processed - instart;
1928	return(-2);
1929	}
1930
1931	if (inend - in < trailing)
1932	break;
1933
1934	while (trailing--) {
1935	if (((d= *in++) & 0xC0) != 0x80) {
1936	*outlen = out - outstart;
1937	*inlen = processed - instart;
1938	return(-2);
1939	}
1940	c <<= 6;
1941	c \|= d & 0x3F;
1942	}
1943
1944	/* assertion: c is a single UTF-4 value */
1945	if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
1946	(c != '&') && (c != '<') && (c != '>')) {
1947	if (out >= outend)
1948	break;
1949	*out++ = c;
1950	} else {
1951	const htmlEntityDesc * ent;
1952	const char *cp;
1953	char nbuf[16];
1954	int len;
1955
1956	/*
1957	* Try to lookup a predefined HTML entity for it
1958	*/
1959	ent = htmlEntityValueLookup(c);
1960	if (ent == NULL) {
1961	snprintf(nbuf, sizeof(nbuf), "#%u", c);
1962	cp = nbuf;
1963	}
1964	else
1965	cp = ent->name;
1966	len = strlen(cp);
1967	if (out + 2 + len > outend)
1968	break;
1969	*out++ = '&';
1970	memcpy(out, cp, len);
1971	out += len;
1972	*out++ = ';';
1973	}
1974	processed = in;
1975	}
1976	*outlen = out - outstart;
1977	*inlen = processed - instart;
1978	return(0);
1979	}
1980
1981	/************************************************************************
1982	* *
1983	* Commodity functions to handle streams *
1984	* *
1985	************************************************************************/
1986
1987	/**
1988	* htmlNewInputStream:
1989	* @ctxt: an HTML parser context
1990	*
1991	* Create a new input stream structure
1992	* Returns the new input stream or NULL
1993	*/
1994	static htmlParserInputPtr
1995	htmlNewInputStream(htmlParserCtxtPtr ctxt) {
1996	htmlParserInputPtr input;
1997
1998	input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
1999	if (input == NULL) {
2000	htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
2001	return(NULL);
2002	}
2003	memset(input, 0, sizeof(htmlParserInput));
2004	input->filename = NULL;
2005	input->directory = NULL;
2006	input->base = NULL;
2007	input->cur = NULL;
2008	input->buf = NULL;
2009	input->line = 1;
2010	input->col = 1;
2011	input->buf = NULL;
2012	input->free = NULL;
2013	input->version = NULL;
2014	input->consumed = 0;
2015	input->length = 0;
2016	return(input);
2017	}
2018
2019
2020	/************************************************************************
2021	* *
2022	* Commodity functions, cleanup needed ? *
2023	* *
2024	************************************************************************/
/*
 * Names of all tags allowing PCDATA in the HTML 4.01 loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 *       into the html40ElementTable array, but that would risk a
 *       binary incompatibility.
 * The array has no terminator; users size it with sizeof.
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
2039
2040	/**
2041	* areBlanks:
2042	* @ctxt: an HTML parser context
2043	* @str: a xmlChar *
2044	* @len: the size of @str
2045	*
2046	* Is this a sequence of blank chars that one can ignore ?
2047	*
2048	* Returns 1 if ignorable 0 otherwise.
2049	*/
2050
2051	static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2052	unsigned int i;
2053	int j;
2054	xmlNodePtr lastChild;
2055	xmlDtdPtr dtd;
2056
2057	for (j = 0;j < len;j++)
2058	if (!(IS_BLANK_CH(str[j]))) return(0);
2059
2060	if (CUR == 0) return(1);
2061	if (CUR != '<') return(0);
2062	if (ctxt->name == NULL)
2063	return(1);
2064	if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2065	return(1);
2066	if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2067	return(1);
2068
2069	/* Only strip CDATA children of the body tag for strict HTML DTDs */
2070	if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2071	dtd = xmlGetIntSubset(ctxt->myDoc);
2072	if (dtd != NULL && dtd->ExternalID != NULL) {
2073	if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") \|\|
2074	!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2075	return(1);
2076	}
2077	}
2078
2079	if (ctxt->node == NULL) return(0);
2080	lastChild = xmlGetLastChild(ctxt->node);
2081	while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2082	lastChild = lastChild->prev;
2083	if (lastChild == NULL) {
2084	if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2085	(ctxt->node->content != NULL)) return(0);
2086	/* keep ws in constructs like ...<b> </b>...
2087	for all tags "b" allowing PCDATA */
2088	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2089	if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2090	return(0);
2091	}
2092	}
2093	} else if (xmlNodeIsText(lastChild)) {
2094	return(0);
2095	} else {
2096	/* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2097	for all tags "p" allowing PCDATA */
2098	for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2099	if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2100	return(0);
2101	}
2102	}
2103	}
2104	return(1);
2105	}
2106
2107	/**
2108	* htmlNewDocNoDtD:
2109	* @URI: URI for the dtd, or NULL
2110	* @ExternalID: the external ID of the DTD, or NULL
2111	*
2112	* Creates a new HTML document without a DTD node if @URI and @ExternalID
2113	* are NULL
2114	*
2115	* Returns a new document, do not initialize the DTD if not provided
2116	*/
2117	htmlDocPtr
2118	htmlNewDocNoDtD(const xmlChar URI, const xmlChar ExternalID) {
2119	xmlDocPtr cur;
2120
2121	/*
2122	* Allocate a new document and fill the fields.
2123	*/
2124	cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2125	if (cur == NULL) {
2126	htmlErrMemory(NULL, "HTML document creation failed\n");
2127	return(NULL);
2128	}
2129	memset(cur, 0, sizeof(xmlDoc));
2130
2131	cur->type = XML_HTML_DOCUMENT_NODE;
2132	cur->version = NULL;
2133	cur->intSubset = NULL;
2134	cur->doc = cur;
2135	cur->name = NULL;
2136	cur->children = NULL;
2137	cur->extSubset = NULL;
2138	cur->oldNs = NULL;
2139	cur->encoding = NULL;
2140	cur->standalone = 1;
2141	cur->compression = 0;
2142	cur->ids = NULL;
2143	cur->refs = NULL;
2144	cur->_private = NULL;
2145	cur->charset = XML_CHAR_ENCODING_UTF8;
2146	if ((ExternalID != NULL) \|\|
2147	(URI != NULL))
2148	xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2149	return(cur);
2150	}
2151
2152	/**
2153	* htmlNewDoc:
2154	* @URI: URI for the dtd, or NULL
2155	* @ExternalID: the external ID of the DTD, or NULL
2156	*
2157	* Creates a new HTML document
2158	*
2159	* Returns a new document
2160	*/
2161	htmlDocPtr
2162	htmlNewDoc(const xmlChar URI, const xmlChar ExternalID) {
2163	if ((URI == NULL) && (ExternalID == NULL))
2164	return(htmlNewDocNoDtD(
2165	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2166	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2167
2168	return(htmlNewDocNoDtD(URI, ExternalID));
2169	}
2170
2171
2172	/************************************************************************
2173	* *
2174	* The parser itself *
2175	* Relates to http://www.w3.org/TR/html40 *
2176	* *
2177	************************************************************************/
2178
2179	/************************************************************************
2180	* *
2181	* The parser itself *
2182	* *
2183	************************************************************************/
2184
2185	static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2186
2187	/**
2188	* htmlParseHTMLName:
2189	* @ctxt: an HTML parser context
2190	*
2191	* parse an HTML tag or attribute name, note that we convert it to lowercase
2192	* since HTML names are not case-sensitive.
2193	*
2194	* Returns the Tag Name parsed or NULL
2195	*/
2196
2197	static const xmlChar *
2198	htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2199	int i = 0;
2200	xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2201
2202	if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2203	(CUR != ':')) return(NULL);
2204
2205	while ((i < HTML_PARSER_BUFFER_SIZE) &&
2206	((IS_ASCII_LETTER(CUR)) \|\| (IS_ASCII_DIGIT(CUR)) \|\|
2207	(CUR == ':') \|\| (CUR == '-') \|\| (CUR == '_'))) {
2208	if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2209	else loc[i] = CUR;
2210	i++;
2211
2212	NEXT;
2213	}
2214
2215	return(xmlDictLookup(ctxt->dict, loc, i));
2216	}
2217
2218
2219	/**
2220	* htmlParseHTMLName_nonInvasive:
2221	* @ctxt: an HTML parser context
2222	*
2223	* parse an HTML tag or attribute name, note that we convert it to lowercase
2224	* since HTML names are not case-sensitive, this doesn't consume the data
2225	* from the stream, it's a look-ahead
2226	*
2227	* Returns the Tag Name parsed or NULL
2228	*/
2229
2230	static const xmlChar *
2231	htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2232	int i = 0;
2233	xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2234
2235	if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2236	(NXT(1) != ':')) return(NULL);
2237
2238	while ((i < HTML_PARSER_BUFFER_SIZE) &&
2239	((IS_ASCII_LETTER(NXT(1+i))) \|\| (IS_ASCII_DIGIT(NXT(1+i))) \|\|
2240	(NXT(1+i) == ':') \|\| (NXT(1+i) == '-') \|\| (NXT(1+i) == '_'))) {
2241	if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2242	else loc[i] = NXT(1+i);
2243	i++;
2244	}
2245
2246	return(xmlDictLookup(ctxt->dict, loc, i));
2247	}
2248
2249
2250	/**
2251	* htmlParseName:
2252	* @ctxt: an HTML parser context
2253	*
2254	* parse an HTML name, this routine is case sensitive.
2255	*
2256	* Returns the Name parsed or NULL
2257	*/
2258
2259	static const xmlChar *
2260	htmlParseName(htmlParserCtxtPtr ctxt) {
2261	const xmlChar *in;
2262	const xmlChar *ret;
2263	int count = 0;
2264
2265	GROW;
2266
2267	/*
2268	* Accelerator for simple ASCII names
2269	*/
2270	in = ctxt->input->cur;
2271	if (((in >= 0x61) && (in <= 0x7A)) \|\|
2272	((in >= 0x41) && (in <= 0x5A)) \|\|
2273	(in == '_') \|\| (in == ':')) {
2274	in++;
2275	while (((in >= 0x61) && (in <= 0x7A)) \|\|
2276	((in >= 0x41) && (in <= 0x5A)) \|\|
2277	((in >= 0x30) && (in <= 0x39)) \|\|
2278	(in == '_') \|\| (in == '-') \|\|
2279	(in == ':') \|\| (in == '.'))
2280	in++;
2281	if ((in > 0) && (in < 0x80)) {
2282	count = in - ctxt->input->cur;
2283	ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2284	ctxt->input->cur = in;
2285	ctxt->nbChars += count;
2286	ctxt->input->col += count;
2287	return(ret);
2288	}
2289	}
2290	return(htmlParseNameComplex(ctxt));
2291	}
2292
2293	static const xmlChar *
2294	htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2295	int len = 0, l;
2296	int c;
2297	int count = 0;
2298
2299	/*
2300	* Handler for more complex cases
2301	*/
2302	GROW;
2303	c = CUR_CHAR(l);
2304	if ((c == ' ') \|\| (c == '>') \|\| (c == '/') \|\| /* accelerators */
2305	(!IS_LETTER(c) && (c != '_') &&
2306	(c != ':'))) {
2307	return(NULL);
2308	}
2309
2310	while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2311	((IS_LETTER(c)) \|\| (IS_DIGIT(c)) \|\|
2312	(c == '.') \|\| (c == '-') \|\|
2313	(c == '_') \|\| (c == ':') \|\|
2314	(IS_COMBINING(c)) \|\|
2315	(IS_EXTENDER(c)))) {
2316	if (count++ > 100) {
2317	count = 0;
2318	GROW;
2319	}
2320	len += l;
2321	NEXTL(l);
2322	c = CUR_CHAR(l);
2323	}
2324	return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
2325	}
2326
2327
2328	/**
2329	* htmlParseHTMLAttribute:
2330	* @ctxt: an HTML parser context
2331	* @stop: a char stop value
2332	*
2333	* parse an HTML attribute value till the stop (quote), if
2334	* stop is 0 then it stops at the first space
2335	*
2336	* Returns the attribute parsed or NULL
2337	*/
2338
2339	static xmlChar *
2340	htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2341	xmlChar *buffer = NULL;
2342	int buffer_size = 0;
2343	xmlChar *out = NULL;
2344	const xmlChar *name = NULL;
2345	const xmlChar *cur = NULL;
2346	const htmlEntityDesc * ent;
2347
2348	/*
2349	* allocate a translation buffer.
2350	*/
2351	buffer_size = HTML_PARSER_BUFFER_SIZE;
2352	buffer = (xmlChar ) xmlMallocAtomic(buffer_size sizeof(xmlChar));
2353	if (buffer == NULL) {
2354	htmlErrMemory(ctxt, "buffer allocation failed\n");
2355	return(NULL);
2356	}
2357	out = buffer;
2358
2359	/*
2360	* Ok loop until we reach one of the ending chars
2361	*/
2362	while ((CUR != 0) && (CUR != stop)) {
2363	if ((stop == 0) && (CUR == '>')) break;
2364	if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2365	if (CUR == '&') {
2366	if (NXT(1) == '#') {
2367	unsigned int c;
2368	int bits;
2369
2370	c = htmlParseCharRef(ctxt);
2371	if (c < 0x80)
2372	{ *out++ = c; bits= -6; }
2373	else if (c < 0x800)
2374	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
2375	else if (c < 0x10000)
2376	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
2377	else
2378	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
2379
2380	for ( ; bits >= 0; bits-= 6) {
2381	*out++ = ((c >> bits) & 0x3F) \| 0x80;
2382	}
2383
2384	if (out - buffer > buffer_size - 100) {
2385	int indx = out - buffer;
2386
2387	growBuffer(buffer);
2388	out = &buffer[indx];
2389	}
2390	} else {
2391	ent = htmlParseEntityRef(ctxt, &name);
2392	if (name == NULL) {
2393	*out++ = '&';
2394	if (out - buffer > buffer_size - 100) {
2395	int indx = out - buffer;
2396
2397	growBuffer(buffer);
2398	out = &buffer[indx];
2399	}
2400	} else if (ent == NULL) {
2401	*out++ = '&';
2402	cur = name;
2403	while (*cur != 0) {
2404	if (out - buffer > buffer_size - 100) {
2405	int indx = out - buffer;
2406
2407	growBuffer(buffer);
2408	out = &buffer[indx];
2409	}
2410	out++ = cur++;
2411	}
2412	} else {
2413	unsigned int c;
2414	int bits;
2415
2416	if (out - buffer > buffer_size - 100) {
2417	int indx = out - buffer;
2418
2419	growBuffer(buffer);
2420	out = &buffer[indx];
2421	}
2422	c = ent->value;
2423	if (c < 0x80)
2424	{ *out++ = c; bits= -6; }
2425	else if (c < 0x800)
2426	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
2427	else if (c < 0x10000)
2428	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
2429	else
2430	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
2431
2432	for ( ; bits >= 0; bits-= 6) {
2433	*out++ = ((c >> bits) & 0x3F) \| 0x80;
2434	}
2435	}
2436	}
2437	} else {
2438	unsigned int c;
2439	int bits, l;
2440
2441	if (out - buffer > buffer_size - 100) {
2442	int indx = out - buffer;
2443
2444	growBuffer(buffer);
2445	out = &buffer[indx];
2446	}
2447	c = CUR_CHAR(l);
2448	if (c < 0x80)
2449	{ *out++ = c; bits= -6; }
2450	else if (c < 0x800)
2451	{ *out++ =((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
2452	else if (c < 0x10000)
2453	{ *out++ =((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
2454	else
2455	{ *out++ =((c >> 18) & 0x07) \| 0xF0; bits= 12; }
2456
2457	for ( ; bits >= 0; bits-= 6) {
2458	*out++ = ((c >> bits) & 0x3F) \| 0x80;
2459	}
2460	NEXT;
2461	}
2462	}
2463	*out++ = 0;
2464	return(buffer);
2465	}
2466
2467	/**
2468	* htmlParseEntityRef:
2469	* @ctxt: an HTML parser context
2470	* @str: location to store the entity name
2471	*
2472	* parse an HTML ENTITY references
2473	*
2474	* [68] EntityRef ::= '&' Name ';'
2475	*
2476	* Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2477	* if non-NULL *str will have to be freed by the caller.
2478	*/
2479	const htmlEntityDesc *
2480	htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2481	const xmlChar *name;
2482	const htmlEntityDesc * ent = NULL;
2483
2484	if (str != NULL) *str = NULL;
2485	if ((ctxt == NULL) \|\| (ctxt->input == NULL)) return(NULL);
2486
2487	if (CUR == '&') {
2488	NEXT;
2489	name = htmlParseName(ctxt);
2490	if (name == NULL) {
2491	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2492	"htmlParseEntityRef: no name\n", NULL, NULL);
2493	} else {
2494	GROW;
2495	if (CUR == ';') {
2496	if (str != NULL)
2497	*str = name;
2498
2499	/*
2500	* Lookup the entity in the table.
2501	*/
2502	ent = htmlEntityLookup(name);
2503	if (ent != NULL) /* OK that's ugly !!! */
2504	NEXT;
2505	} else {
2506	htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2507	"htmlParseEntityRef: expecting ';'\n",
2508	NULL, NULL);
2509	if (str != NULL)
2510	*str = name;
2511	}
2512	}
2513	}
2514	return(ent);
2515	}
2516
2517	/**
2518	* htmlParseAttValue:
2519	* @ctxt: an HTML parser context
2520	*
2521	* parse a value for an attribute
2522	* Note: the parser won't do substitution of entities here, this
2523	* will be handled later in xmlStringGetNodeList, unless it was
2524	* asked for ctxt->replaceEntities != 0
2525	*
2526	* Returns the AttValue parsed or NULL.
2527	*/
2528
2529	static xmlChar *
2530	htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2531	xmlChar *ret = NULL;
2532
2533	if (CUR == '"') {
2534	NEXT;
2535	ret = htmlParseHTMLAttribute(ctxt, '"');
2536	if (CUR != '"') {
2537	htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2538	"AttValue: \" expected\n", NULL, NULL);
2539	} else
2540	NEXT;
2541	} else if (CUR == '\'') {
2542	NEXT;
2543	ret = htmlParseHTMLAttribute(ctxt, '\'');
2544	if (CUR != '\'') {
2545	htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2546	"AttValue: ' expected\n", NULL, NULL);
2547	} else
2548	NEXT;
2549	} else {
2550	/*
2551	* That's an HTMLism, the attribute value may not be quoted
2552	*/
2553	ret = htmlParseHTMLAttribute(ctxt, 0);
2554	if (ret == NULL) {
2555	htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2556	"AttValue: no value found\n", NULL, NULL);
2557	}
2558	}
2559	return(ret);
2560	}
2561
2562	/**
2563	* htmlParseSystemLiteral:
2564	* @ctxt: an HTML parser context
2565	*
2566	* parse an HTML Literal
2567	*
2568	* [11] SystemLiteral ::= ('"' [^"]* '"') \| ("'" [^']* "'")
2569	*
2570	* Returns the SystemLiteral parsed or NULL
2571	*/
2572
2573	static xmlChar *
2574	htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2575	const xmlChar *q;
2576	xmlChar *ret = NULL;
2577
2578	if (CUR == '"') {
2579	NEXT;
2580	q = CUR_PTR;
2581	while ((IS_CHAR_CH(CUR)) && (CUR != '"'))
2582	NEXT;
2583	if (!IS_CHAR_CH(CUR)) {
2584	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2585	"Unfinished SystemLiteral\n", NULL, NULL);
2586	} else {
2587	ret = xmlStrndup(q, CUR_PTR - q);
2588	NEXT;
2589	}
2590	} else if (CUR == '\'') {
2591	NEXT;
2592	q = CUR_PTR;
2593	while ((IS_CHAR_CH(CUR)) && (CUR != '\''))
2594	NEXT;
2595	if (!IS_CHAR_CH(CUR)) {
2596	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2597	"Unfinished SystemLiteral\n", NULL, NULL);
2598	} else {
2599	ret = xmlStrndup(q, CUR_PTR - q);
2600	NEXT;
2601	}
2602	} else {
2603	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2604	" or ' expected\n", NULL, NULL);
2605	}
2606
2607	return(ret);
2608	}
2609
2610	/**
2611	* htmlParsePubidLiteral:
2612	* @ctxt: an HTML parser context
2613	*
2614	* parse an HTML public literal
2615	*
2616	* [12] PubidLiteral ::= '"' PubidChar* '"' \| "'" (PubidChar - "'")* "'"
2617	*
2618	* Returns the PubidLiteral parsed or NULL.
2619	*/
2620
2621	static xmlChar *
2622	htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2623	const xmlChar *q;
2624	xmlChar *ret = NULL;
2625	/*
2626	* Name ::= (Letter \| '_') (NameChar)*
2627	*/
2628	if (CUR == '"') {
2629	NEXT;
2630	q = CUR_PTR;
2631	while (IS_PUBIDCHAR_CH(CUR)) NEXT;
2632	if (CUR != '"') {
2633	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2634	"Unfinished PubidLiteral\n", NULL, NULL);
2635	} else {
2636	ret = xmlStrndup(q, CUR_PTR - q);
2637	NEXT;
2638	}
2639	} else if (CUR == '\'') {
2640	NEXT;
2641	q = CUR_PTR;
2642	while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\''))
2643	NEXT;
2644	if (CUR != '\'') {
2645	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2646	"Unfinished PubidLiteral\n", NULL, NULL);
2647	} else {
2648	ret = xmlStrndup(q, CUR_PTR - q);
2649	NEXT;
2650	}
2651	} else {
2652	htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2653	"PubidLiteral \" or ' expected\n", NULL, NULL);
2654	}
2655
2656	return(ret);
2657	}
2658
2659	/**
2660	* htmlParseScript:
2661	* @ctxt: an HTML parser context
2662	*
2663	* parse the content of an HTML SCRIPT or STYLE element
2664	* http://www.w3.org/TR/html4/sgml/dtd.html#Script
2665	* http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2666	* http://www.w3.org/TR/html4/types.html#type-script
2667	* http://www.w3.org/TR/html4/types.html#h-6.15
2668	* http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2669	*
2670	* Script data ( %Script; in the DTD) can be the content of the SCRIPT
2671	* element and the value of intrinsic event attributes. User agents must
2672	* not evaluate script data as HTML markup but instead must pass it on as
2673	* data to a script engine.
2674	* NOTES:
2675	* - The content is passed like CDATA
2676	* - the attributes for style and scripting "onXXX" are also described
2677	* as CDATA but SGML allows entities references in attributes so their
2678	* processing is identical as other attributes
2679	*/
2680	static void
2681	htmlParseScript(htmlParserCtxtPtr ctxt) {
2682	xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2683	int nbchar = 0;
2684	int cur,l;
2685
2686	SHRINK;
2687	cur = CUR_CHAR(l);
2688	while (IS_CHAR_CH(cur)) {
2689	if ((cur == '<') && (NXT(1) == '/')) {
2690	/*
2691	* One should break here, the specification is clear:
2692	* Authors should therefore escape "</" within the content.
2693	* Escape mechanisms are specific to each scripting or
2694	* style sheet language.
2695	*
2696	* In recovery mode, only break if end tag match the
2697	* current tag, effectively ignoring all tags inside the
2698	* script/style block and treating the entire block as
2699	* CDATA.
2700	*/
2701	if (ctxt->recovery) {
2702	if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
2703	xmlStrlen(ctxt->name)) == 0)
2704	{
2705	break; /* while */
2706	} else {
2707	htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
2708	"Element %s embeds close tag\n",
2709	ctxt->name, NULL);
2710	}
2711	} else {
2712	if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) \|\|
2713	((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2714	{
2715	break; /* while */
2716	}
2717	}
2718	}
2719	COPY_BUF(l,buf,nbchar,cur);
2720	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2721	if (ctxt->sax->cdataBlock!= NULL) {
2722	/*
2723	* Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2724	*/
2725	ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2726	} else if (ctxt->sax->characters != NULL) {
2727	ctxt->sax->characters(ctxt->userData, buf, nbchar);
2728	}
2729	nbchar = 0;
2730	}
2731	GROW;
2732	NEXTL(l);
2733	cur = CUR_CHAR(l);
2734	}
2735
2736	if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
2737	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2738	"Invalid char in CDATA 0x%X\n", cur);
2739	NEXT;
2740	}
2741
2742	if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2743	if (ctxt->sax->cdataBlock!= NULL) {
2744	/*
2745	* Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2746	*/
2747	ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2748	} else if (ctxt->sax->characters != NULL) {
2749	ctxt->sax->characters(ctxt->userData, buf, nbchar);
2750	}
2751	}
2752	}
2753
2754
2755	/**
2756	* htmlParseCharData:
2757	* @ctxt: an HTML parser context
2758	*
2759	* parse a CharData section.
2760	* if we are within a CDATA section ']]>' marks an end of section.
2761	*
2762	* [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
2763	*/
2764
2765	static void
2766	htmlParseCharData(htmlParserCtxtPtr ctxt) {
2767	xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2768	int nbchar = 0;
2769	int cur, l;
2770
2771	SHRINK;
2772	cur = CUR_CHAR(l);
2773	while (((cur != '<') \|\| (ctxt->token == '<')) &&
2774	((cur != '&') \|\| (ctxt->token == '&')) &&
2775	(IS_CHAR(cur))) {
2776	COPY_BUF(l,buf,nbchar,cur);
2777	if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2778	/*
2779	* Ok the segment is to be consumed as chars.
2780	*/
2781	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2782	if (areBlanks(ctxt, buf, nbchar)) {
2783	if (ctxt->sax->ignorableWhitespace != NULL)
2784	ctxt->sax->ignorableWhitespace(ctxt->userData,
2785	buf, nbchar);
2786	} else {
2787	htmlCheckParagraph(ctxt);
2788	if (ctxt->sax->characters != NULL)
2789	ctxt->sax->characters(ctxt->userData, buf, nbchar);
2790	}
2791	}
2792	nbchar = 0;
2793	}
2794	NEXTL(l);
2795	cur = CUR_CHAR(l);
2796	if (cur == 0) {
2797	SHRINK;
2798	GROW;
2799	cur = CUR_CHAR(l);
2800	}
2801	}
2802	if (nbchar != 0) {
2803	buf[nbchar] = 0;
2804
2805	/*
2806	* Ok the segment is to be consumed as chars.
2807	*/
2808	if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2809	if (areBlanks(ctxt, buf, nbchar)) {
2810	if (ctxt->sax->ignorableWhitespace != NULL)
2811	ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar);
2812	} else {
2813	htmlCheckParagraph(ctxt);
2814	if (ctxt->sax->characters != NULL)
2815	ctxt->sax->characters(ctxt->userData, buf, nbchar);
2816	}
2817	}
2818	} else {
2819	/*
2820	* Loop detection
2821	*/
2822	if (cur == 0)
2823	ctxt->instate = XML_PARSER_EOF;
2824	}
2825	}
2826
2827	/**
2828	* htmlParseExternalID:
2829	* @ctxt: an HTML parser context
2830	* @publicID: a xmlChar** receiving PubidLiteral
2831	*
2832	* Parse an External ID or a Public ID
2833	*
2834	* [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2835	* \| 'PUBLIC' S PubidLiteral S SystemLiteral
2836	*
2837	* [83] PublicID ::= 'PUBLIC' S PubidLiteral
2838	*
2839	* Returns the function returns SystemLiteral and in the second
2840	* case publicID receives PubidLiteral, is strict is off
2841	* it is possible to return NULL and have publicID set.
2842	*/
2843
2844	static xmlChar *
2845	htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
2846	xmlChar *URI = NULL;
2847
2848	if ((UPPER == 'S') && (UPP(1) == 'Y') &&
2849	(UPP(2) == 'S') && (UPP(3) == 'T') &&
2850	(UPP(4) == 'E') && (UPP(5) == 'M')) {
2851	SKIP(6);
2852	if (!IS_BLANK_CH(CUR)) {
2853	htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2854	"Space required after 'SYSTEM'\n", NULL, NULL);
2855	}
2856	SKIP_BLANKS;
2857	URI = htmlParseSystemLiteral(ctxt);
2858	if (URI == NULL) {
2859	htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
2860	"htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
2861	}
2862	} else if ((UPPER == 'P') && (UPP(1) == 'U') &&
2863	(UPP(2) == 'B') && (UPP(3) == 'L') &&
2864	(UPP(4) == 'I') && (UPP(5) == 'C')) {
2865	SKIP(6);
2866	if (!IS_BLANK_CH(CUR)) {
2867	htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2868	"Space required after 'PUBLIC'\n", NULL, NULL);
2869	}
2870	SKIP_BLANKS;
2871	*publicID = htmlParsePubidLiteral(ctxt);
2872	if (*publicID == NULL) {
2873	htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
2874	"htmlParseExternalID: PUBLIC, no Public Identifier\n",
2875	NULL, NULL);
2876	}
2877	SKIP_BLANKS;
2878	if ((CUR == '"') \|\| (CUR == '\'')) {
2879	URI = htmlParseSystemLiteral(ctxt);
2880	}
2881	}
2882	return(URI);
2883	}
2884
2885	/**
2886	* xmlParsePI:
2887	* @ctxt: an XML parser context
2888	*
2889	* parse an XML Processing Instruction.
2890	*
2891	* [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
2892	*/
2893	static void
2894	htmlParsePI(htmlParserCtxtPtr ctxt) {
2895	xmlChar *buf = NULL;
2896	int len = 0;
2897	int size = HTML_PARSER_BUFFER_SIZE;
2898	int cur, l;
2899	const xmlChar *target;
2900	xmlParserInputState state;
2901	int count = 0;
2902
2903	if ((RAW == '<') && (NXT(1) == '?')) {
2904	state = ctxt->instate;
2905	ctxt->instate = XML_PARSER_PI;
2906	/*
2907	* this is a Processing Instruction.
2908	*/
2909	SKIP(2);
2910	SHRINK;
2911
2912	/*
2913	* Parse the target name and check for special support like
2914	* namespace.
2915	*/
2916	target = htmlParseName(ctxt);
2917	if (target != NULL) {
2918	if (RAW == '>') {
2919	SKIP(1);
2920
2921	/*
2922	* SAX: PI detected.
2923	*/
2924	if ((ctxt->sax) && (!ctxt->disableSAX) &&
2925	(ctxt->sax->processingInstruction != NULL))
2926	ctxt->sax->processingInstruction(ctxt->userData,
2927	target, NULL);
2928	ctxt->instate = state;
2929	return;
2930	}
2931	buf = (xmlChar ) xmlMallocAtomic(size sizeof(xmlChar));
2932	if (buf == NULL) {
2933	htmlErrMemory(ctxt, NULL);
2934	ctxt->instate = state;
2935	return;
2936	}
2937	cur = CUR;
2938	if (!IS_BLANK(cur)) {
2939	htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
2940	"ParsePI: PI %s space expected\n", target, NULL);
2941	}
2942	SKIP_BLANKS;
2943	cur = CUR_CHAR(l);
2944	while (IS_CHAR(cur) && (cur != '>')) {
2945	if (len + 5 >= size) {
2946	xmlChar *tmp;
2947
2948	size *= 2;
2949	tmp = (xmlChar ) xmlRealloc(buf, size sizeof(xmlChar));
2950	if (tmp == NULL) {
2951	htmlErrMemory(ctxt, NULL);
2952	xmlFree(buf);
2953	ctxt->instate = state;
2954	return;
2955	}
2956	buf = tmp;
2957	}
2958	count++;
2959	if (count > 50) {
2960	GROW;
2961	count = 0;
2962	}
2963	COPY_BUF(l,buf,len,cur);
2964	NEXTL(l);
2965	cur = CUR_CHAR(l);
2966	if (cur == 0) {
2967	SHRINK;
2968	GROW;
2969	cur = CUR_CHAR(l);
2970	}
2971	}
2972	buf[len] = 0;
2973	if (cur != '>') {
2974	htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
2975	"ParsePI: PI %s never end ...\n", target, NULL);
2976	} else {
2977	SKIP(1);
2978
2979	/*
2980	* SAX: PI detected.
2981	*/
2982	if ((ctxt->sax) && (!ctxt->disableSAX) &&
2983	(ctxt->sax->processingInstruction != NULL))
2984	ctxt->sax->processingInstruction(ctxt->userData,
2985	target, buf);
2986	}
2987	xmlFree(buf);
2988	} else {
2989	htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
2990	"PI is not started correctly", NULL, NULL);
2991	}
2992	ctxt->instate = state;
2993	}
2994	}
2995
2996	/**
2997	* htmlParseComment:
2998	* @ctxt: an HTML parser context
2999	*
3000	* Parse an XML (SGML) comment <!-- .... -->
3001	*
3002	* [15] Comment ::= '<!--' ((Char - '-') \| ('-' (Char - '-')))* '-->'
3003	*/
3004	static void
3005	htmlParseComment(htmlParserCtxtPtr ctxt) {
3006	xmlChar *buf = NULL;
3007	int len;
3008	int size = HTML_PARSER_BUFFER_SIZE;
3009	int q, ql;
3010	int r, rl;
3011	int cur, l;
3012	xmlParserInputState state;
3013
3014	/*
3015	* Check that there is a comment right here.
3016	*/
3017	if ((RAW != '<') \|\| (NXT(1) != '!') \|\|
3018	(NXT(2) != '-') \|\| (NXT(3) != '-')) return;
3019
3020	state = ctxt->instate;
3021	ctxt->instate = XML_PARSER_COMMENT;
3022	SHRINK;
3023	SKIP(4);
3024	buf = (xmlChar ) xmlMallocAtomic(size sizeof(xmlChar));
3025	if (buf == NULL) {
3026	htmlErrMemory(ctxt, "buffer allocation failed\n");
3027	ctxt->instate = state;
3028	return;
3029	}
3030	q = CUR_CHAR(ql);
3031	NEXTL(ql);
3032	r = CUR_CHAR(rl);
3033	NEXTL(rl);
3034	cur = CUR_CHAR(l);
3035	len = 0;
3036	while (IS_CHAR(cur) &&
3037	((cur != '>') \|\|
3038	(r != '-') \|\| (q != '-'))) {
3039	if (len + 5 >= size) {
3040	xmlChar *tmp;
3041
3042	size *= 2;
3043	tmp = (xmlChar ) xmlRealloc(buf, size sizeof(xmlChar));
3044	if (tmp == NULL) {
3045	xmlFree(buf);
3046	htmlErrMemory(ctxt, "growing buffer failed\n");
3047	ctxt->instate = state;
3048	return;
3049	}
3050	buf = tmp;
3051	}
3052	COPY_BUF(ql,buf,len,q);
3053	q = r;
3054	ql = rl;
3055	r = cur;
3056	rl = l;
3057	NEXTL(l);
3058	cur = CUR_CHAR(l);
3059	if (cur == 0) {
3060	SHRINK;
3061	GROW;
3062	cur = CUR_CHAR(l);
3063	}
3064	}
3065	buf[len] = 0;
3066	if (!IS_CHAR(cur)) {
3067	htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3068	"Comment not terminated \n<!--%.50s\n", buf, NULL);
3069	xmlFree(buf);
3070	} else {
3071	NEXT;
3072	if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3073	(!ctxt->disableSAX))
3074	ctxt->sax->comment(ctxt->userData, buf);
3075	xmlFree(buf);
3076	}
3077	ctxt->instate = state;
3078	}
3079
3080	/**
3081	* htmlParseCharRef:
3082	* @ctxt: an HTML parser context
3083	*
3084	* parse Reference declarations
3085	*
3086	* [66] CharRef ::= '&#' [0-9]+ ';' \|
3087	* '&#x' [0-9a-fA-F]+ ';'
3088	*
3089	* Returns the value parsed (as an int)
3090	*/
3091	int
3092	htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3093	int val = 0;
3094
3095	if ((ctxt == NULL) \|\| (ctxt->input == NULL)) {
3096	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3097	"htmlParseCharRef: context error\n",
3098	NULL, NULL);
3099	return(0);
3100	}
3101	if ((CUR == '&') && (NXT(1) == '#') &&
3102	((NXT(2) == 'x') \|\| NXT(2) == 'X')) {
3103	SKIP(3);
3104	while (CUR != ';') {
3105	if ((CUR >= '0') && (CUR <= '9'))
3106	val = val * 16 + (CUR - '0');
3107	else if ((CUR >= 'a') && (CUR <= 'f'))
3108	val = val * 16 + (CUR - 'a') + 10;
3109	else if ((CUR >= 'A') && (CUR <= 'F'))
3110	val = val * 16 + (CUR - 'A') + 10;
3111	else {
3112	htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3113	"htmlParseCharRef: invalid hexadecimal value\n",
3114	NULL, NULL);
3115	return(0);
3116	}
3117	NEXT;
3118	}
3119	if (CUR == ';')
3120	NEXT;
3121	} else if ((CUR == '&') && (NXT(1) == '#')) {
3122	SKIP(2);
3123	while (CUR != ';') {
3124	if ((CUR >= '0') && (CUR <= '9'))
3125	val = val * 10 + (CUR - '0');
3126	else {
3127	htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3128	"htmlParseCharRef: invalid decimal value\n",
3129	NULL, NULL);
3130	return(0);
3131	}
3132	NEXT;
3133	}
3134	if (CUR == ';')
3135	NEXT;
3136	} else {
3137	htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3138	"htmlParseCharRef: invalid value\n", NULL, NULL);
3139	}
3140	/*
3141	* Check the value IS_CHAR ...
3142	*/
3143	if (IS_CHAR(val)) {
3144	return(val);
3145	} else {
3146	htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3147	"htmlParseCharRef: invalid xmlChar value %d\n",
3148	val);
3149	}
3150	return(0);
3151	}
3152
3153
3154	/**
3155	* htmlParseDocTypeDecl:
3156	* @ctxt: an HTML parser context
3157	*
3158	* parse a DOCTYPE declaration
3159	*
3160	* [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3161	* ('[' (markupdecl \| PEReference \| S)* ']' S?)? '>'
3162	*/
3163
3164	static void
3165	htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3166	const xmlChar *name;
3167	xmlChar *ExternalID = NULL;
3168	xmlChar *URI = NULL;
3169
3170	/*
3171	* We know that '<!DOCTYPE' has been detected.
3172	*/
3173	SKIP(9);
3174
3175	SKIP_BLANKS;
3176
3177	/*
3178	* Parse the DOCTYPE name.
3179	*/
3180	name = htmlParseName(ctxt);
3181	if (name == NULL) {
3182	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3183	"htmlParseDocTypeDecl : no DOCTYPE name !\n",
3184	NULL, NULL);
3185	}
3186	/*
3187	* Check that upper(name) == "HTML" !!!!!!!!!!!!!
3188	*/
3189
3190	SKIP_BLANKS;
3191
3192	/*
3193	* Check for SystemID and ExternalID
3194	*/
3195	URI = htmlParseExternalID(ctxt, &ExternalID);
3196	SKIP_BLANKS;
3197
3198	/*
3199	* We should be at the end of the DOCTYPE declaration.
3200	*/
3201	if (CUR != '>') {
3202	htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3203	"DOCTYPE improperly terminated\n", NULL, NULL);
3204	/* We shouldn't try to resynchronize ... */
3205	}
3206	NEXT;
3207
3208	/*
3209	* Create or update the document accordingly to the DOCTYPE
3210	*/
3211	if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3212	(!ctxt->disableSAX))
3213	ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3214
3215	/*
3216	* Cleanup, since we don't use all those identifiers
3217	*/
3218	if (URI != NULL) xmlFree(URI);
3219	if (ExternalID != NULL) xmlFree(ExternalID);
3220	}
3221
3222	/**
3223	* htmlParseAttribute:
3224	* @ctxt: an HTML parser context
3225	* @value: a xmlChar ** used to store the value of the attribute
3226	*
3227	* parse an attribute
3228	*
3229	* [41] Attribute ::= Name Eq AttValue
3230	*
3231	* [25] Eq ::= S? '=' S?
3232	*
3233	* With namespace:
3234	*
3235	* [NS 11] Attribute ::= QName Eq AttValue
3236	*
3237	* Also the case QName == xmlns:??? is handled independently as a namespace
3238	* definition.
3239	*
3240	* Returns the attribute name, and the value in *value.
3241	*/
3242
3243	static const xmlChar *
3244	htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3245	const xmlChar *name;
3246	xmlChar *val = NULL;
3247
3248	*value = NULL;
3249	name = htmlParseHTMLName(ctxt);
3250	if (name == NULL) {
3251	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3252	"error parsing attribute name\n", NULL, NULL);
3253	return(NULL);
3254	}
3255
3256	/*
3257	* read the value
3258	*/
3259	SKIP_BLANKS;
3260	if (CUR == '=') {
3261	NEXT;
3262	SKIP_BLANKS;
3263	val = htmlParseAttValue(ctxt);
3264	} else if (htmlIsBooleanAttr(name)) {
3265	/*
3266	* assume a minimized attribute
3267	*/
3268	val = xmlStrdup(name);
3269	}
3270
3271	*value = val;
3272	return(name);
3273	}
3274
3275	/**
3276	* htmlCheckEncoding:
3277	* @ctxt: an HTML parser context
3278	* @attvalue: the attribute value
3279	*
3280	* Checks an http-equiv attribute from a Meta tag to detect
3281	* the encoding
3282	* If a new encoding is detected the parser is switched to decode
3283	* it and pass UTF8
3284	*/
3285	static void
3286	htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3287	const xmlChar *encoding;
3288
3289	if ((ctxt == NULL) \|\| (attvalue == NULL))
3290	return;
3291
3292	/* do not change encoding */
3293	if (ctxt->input->encoding != NULL)
3294	return;
3295
3296	encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
3297	if (encoding != NULL) {
3298	encoding += 8;
3299	} else {
3300	encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
3301	if (encoding != NULL)
3302	encoding += 9;
3303	}
3304	if (encoding != NULL) {
3305	xmlCharEncoding enc;
3306	xmlCharEncodingHandlerPtr handler;
3307
3308	while ((encoding == ' ') \|\| (encoding == '\t')) encoding++;
3309
3310	if (ctxt->input->encoding != NULL)
3311	xmlFree((xmlChar *) ctxt->input->encoding);
3312	ctxt->input->encoding = xmlStrdup(encoding);
3313
3314	enc = xmlParseCharEncoding((const char *) encoding);
3315	/*
3316	* registered set of known encodings
3317	*/
3318	if (enc != XML_CHAR_ENCODING_ERROR) {
3319	if (((enc == XML_CHAR_ENCODING_UTF16LE) \|\|
3320	(enc == XML_CHAR_ENCODING_UTF16BE) \|\|
3321	(enc == XML_CHAR_ENCODING_UCS4LE) \|\|
3322	(enc == XML_CHAR_ENCODING_UCS4BE)) &&
3323	(ctxt->input->buf != NULL) &&
3324	(ctxt->input->buf->encoder == NULL)) {
3325	htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3326	"htmlCheckEncoding: wrong encoding meta\n",
3327	NULL, NULL);
3328	} else {
3329	xmlSwitchEncoding(ctxt, enc);
3330	}
3331	ctxt->charset = XML_CHAR_ENCODING_UTF8;
3332	} else {
3333	/*
3334	* fallback for unknown encodings
3335	*/
3336	handler = xmlFindCharEncodingHandler((const char *) encoding);
3337	if (handler != NULL) {
3338	xmlSwitchToEncoding(ctxt, handler);
3339	ctxt->charset = XML_CHAR_ENCODING_UTF8;
3340	} else {
3341	ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
3342	}
3343	}
3344
3345	if ((ctxt->input->buf != NULL) &&
3346	(ctxt->input->buf->encoder != NULL) &&
3347	(ctxt->input->buf->raw != NULL) &&
3348	(ctxt->input->buf->buffer != NULL)) {
3349	int nbchars;
3350	int processed;
3351
3352	/*
3353	* convert as much as possible to the parser reading buffer.
3354	*/
3355	processed = ctxt->input->cur - ctxt->input->base;
3356	xmlBufferShrink(ctxt->input->buf->buffer, processed);
3357	nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder,
3358	ctxt->input->buf->buffer,
3359	ctxt->input->buf->raw);
3360	if (nbchars < 0) {
3361	htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3362	"htmlCheckEncoding: encoder error\n",
3363	NULL, NULL);
3364	}
3365	ctxt->input->base =
3366	ctxt->input->cur = ctxt->input->buf->buffer->content;
3367	}
3368	}
3369	}
3370
3371	/**
3372	* htmlCheckMeta:
3373	* @ctxt: an HTML parser context
3374	* @atts: the attributes values
3375	*
3376	* Checks an attributes from a Meta tag
3377	*/
3378	static void
3379	htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3380	int i;
3381	const xmlChar att, value;
3382	int http = 0;
3383	const xmlChar *content = NULL;
3384
3385	if ((ctxt == NULL) \|\| (atts == NULL))
3386	return;
3387
3388	i = 0;
3389	att = atts[i++];
3390	while (att != NULL) {
3391	value = atts[i++];
3392	if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3393	&& (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3394	http = 1;
3395	else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3396	content = value;
3397	att = atts[i++];
3398	}
3399	if ((http) && (content != NULL))
3400	htmlCheckEncoding(ctxt, content);
3401
3402	}
3403
3404	/**
3405	* htmlParseStartTag:
3406	* @ctxt: an HTML parser context
3407	*
3408	* parse a start of tag either for rule element or
3409	* EmptyElement. In both case we don't parse the tag closing chars.
3410	*
3411	* [40] STag ::= '<' Name (S Attribute)* S? '>'
3412	*
3413	* [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3414	*
3415	* With namespace:
3416	*
3417	* [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3418	*
3419	* [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3420	*
3421	* Returns 0 in case of success and -1 in case of error.
3422	*/
3423
3424	static int
3425	htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3426	const xmlChar *name;
3427	const xmlChar *attname;
3428	xmlChar *attvalue;
3429	const xmlChar **atts;
3430	int nbatts = 0;
3431	int maxatts;
3432	int meta = 0;
3433	int i;
3434
3435	if ((ctxt == NULL) \|\| (ctxt->input == NULL)) {
3436	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3437	"htmlParseStartTag: context error\n", NULL, NULL);
3438	return -1;
3439	}
3440	if (CUR != '<') return -1;
3441	NEXT;
3442
3443	atts = ctxt->atts;
3444	maxatts = ctxt->maxatts;
3445
3446	GROW;
3447	name = htmlParseHTMLName(ctxt);
3448	if (name == NULL) {
3449	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3450	"htmlParseStartTag: invalid element name\n",
3451	NULL, NULL);
3452	/* Dump the bogus tag like browsers do */
3453	while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
3454	NEXT;
3455	return -1;
3456	}
3457	if (xmlStrEqual(name, BAD_CAST"meta"))
3458	meta = 1;
3459
3460	/*
3461	* Check for auto-closure of HTML elements.
3462	*/
3463	htmlAutoClose(ctxt, name);
3464
3465	/*
3466	* Check for implied HTML elements.
3467	*/
3468	htmlCheckImplied(ctxt, name);
3469
3470	/*
3471	* Avoid html at any level > 0, head at any level != 1
3472	* or any attempt to recurse body
3473	*/
3474	if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3475	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3476	"htmlParseStartTag: misplaced <html> tag\n",
3477	name, NULL);
3478	return 0;
3479	}
3480	if ((ctxt->nameNr != 1) &&
3481	(xmlStrEqual(name, BAD_CAST"head"))) {
3482	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3483	"htmlParseStartTag: misplaced <head> tag\n",
3484	name, NULL);
3485	return 0;
3486	}
3487	if (xmlStrEqual(name, BAD_CAST"body")) {
3488	int indx;
3489	for (indx = 0;indx < ctxt->nameNr;indx++) {
3490	if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
3491	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3492	"htmlParseStartTag: misplaced <body> tag\n",
3493	name, NULL);
3494	while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
3495	NEXT;
3496	return 0;
3497	}
3498	}
3499	}
3500
3501	/*
3502	* Now parse the attributes, it ends up with the ending
3503	*
3504	* (S Attribute)* S?
3505	*/
3506	SKIP_BLANKS;
3507	while ((IS_CHAR_CH(CUR)) &&
3508	(CUR != '>') &&
3509	((CUR != '/') \|\| (NXT(1) != '>'))) {
3510	long cons = ctxt->nbChars;
3511
3512	GROW;
3513	attname = htmlParseAttribute(ctxt, &attvalue);
3514	if (attname != NULL) {
3515
3516	/*
3517	* Well formedness requires at most one declaration of an attribute
3518	*/
3519	for (i = 0; i < nbatts;i += 2) {
3520	if (xmlStrEqual(atts[i], attname)) {
3521	htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3522	"Attribute %s redefined\n", attname, NULL);
3523	if (attvalue != NULL)
3524	xmlFree(attvalue);
3525	goto failed;
3526	}
3527	}
3528
3529	/*
3530	* Add the pair to atts
3531	*/
3532	if (atts == NULL) {
3533	maxatts = 22; /* allow for 10 attrs by default */
3534	atts = (const xmlChar **)
3535	xmlMalloc(maxatts * sizeof(xmlChar *));
3536	if (atts == NULL) {
3537	htmlErrMemory(ctxt, NULL);
3538	if (attvalue != NULL)
3539	xmlFree(attvalue);
3540	goto failed;
3541	}
3542	ctxt->atts = atts;
3543	ctxt->maxatts = maxatts;
3544	} else if (nbatts + 4 > maxatts) {
3545	const xmlChar **n;
3546
3547	maxatts *= 2;
3548	n = (const xmlChar *) xmlRealloc((void ) atts,
3549	maxatts * sizeof(const xmlChar *));
3550	if (n == NULL) {
3551	htmlErrMemory(ctxt, NULL);
3552	if (attvalue != NULL)
3553	xmlFree(attvalue);
3554	goto failed;
3555	}
3556	atts = n;
3557	ctxt->atts = atts;
3558	ctxt->maxatts = maxatts;
3559	}
3560	atts[nbatts++] = attname;
3561	atts[nbatts++] = attvalue;
3562	atts[nbatts] = NULL;
3563	atts[nbatts + 1] = NULL;
3564	}
3565	else {
3566	if (attvalue != NULL)
3567	xmlFree(attvalue);
3568	/* Dump the bogus attribute string up to the next blank or
3569	* the end of the tag. */
3570	while ((IS_CHAR_CH(CUR)) &&
3571	!(IS_BLANK_CH(CUR)) && (CUR != '>') &&
3572	((CUR != '/') \|\| (NXT(1) != '>')))
3573	NEXT;
3574	}
3575
3576	failed:
3577	SKIP_BLANKS;
3578	if (cons == ctxt->nbChars) {
3579	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3580	"htmlParseStartTag: problem parsing attributes\n",
3581	NULL, NULL);
3582	break;
3583	}
3584	}
3585
3586	/*
3587	* Handle specific association to the META tag
3588	*/
3589	if (meta && (nbatts != 0))
3590	htmlCheckMeta(ctxt, atts);
3591
3592	/*
3593	* SAX: Start of Element !
3594	*/
3595	htmlnamePush(ctxt, name);
3596	if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3597	if (nbatts != 0)
3598	ctxt->sax->startElement(ctxt->userData, name, atts);
3599	else
3600	ctxt->sax->startElement(ctxt->userData, name, NULL);
3601	}
3602
3603	if (atts != NULL) {
3604	for (i = 1;i < nbatts;i += 2) {
3605	if (atts[i] != NULL)
3606	xmlFree((xmlChar *) atts[i]);
3607	}
3608	}
3609
3610	return 0;
3611	}
3612
3613	/**
3614	* htmlParseEndTag:
3615	* @ctxt: an HTML parser context
3616	*
3617	* parse an end of tag
3618	*
3619	* [42] ETag ::= '</' Name S? '>'
3620	*
3621	* With namespace
3622	*
3623	* [NS 9] ETag ::= '</' QName S? '>'
3624	*
3625	* Returns 1 if the current level should be closed.
3626	*/
3627
3628	static int
3629	htmlParseEndTag(htmlParserCtxtPtr ctxt)
3630	{
3631	const xmlChar *name;
3632	const xmlChar *oldname;
3633	int i, ret;
3634
3635	if ((CUR != '<') \|\| (NXT(1) != '/')) {
3636	htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3637	"htmlParseEndTag: '</' not found\n", NULL, NULL);
3638	return (0);
3639	}
3640	SKIP(2);
3641
3642	name = htmlParseHTMLName(ctxt);
3643	if (name == NULL)
3644	return (0);
3645
3646	/*
3647	* We should definitely be at the ending "S? '>'" part
3648	*/
3649	SKIP_BLANKS;
3650	if ((!IS_CHAR_CH(CUR)) \|\| (CUR != '>')) {
3651	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3652	"End tag : expected '>'\n", NULL, NULL);
3653	if (ctxt->recovery) {
3654	/*
3655	* We're not at the ending > !!
3656	* Error, unless in recover mode where we search forwards
3657	* until we find a >
3658	*/
3659	while (CUR != '\0' && CUR != '>') NEXT;
3660	NEXT;
3661	}
3662	} else
3663	NEXT;
3664
3665	/*
3666	* If the name read is not one of the element in the parsing stack
3667	* then return, it's just an error.
3668	*/
3669	for (i = (ctxt->nameNr - 1); i >= 0; i--) {
3670	if (xmlStrEqual(name, ctxt->nameTab[i]))
3671	break;
3672	}
3673	if (i < 0) {
3674	htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3675	"Unexpected end tag : %s\n", name, NULL);
3676	return (0);
3677	}
3678
3679
3680	/*
3681	* Check for auto-closure of HTML elements.
3682	*/
3683
3684	htmlAutoCloseOnClose(ctxt, name);
3685
3686	/*
3687	* Well formedness constraints, opening and closing must match.
3688	* With the exception that the autoclose may have popped stuff out
3689	* of the stack.
3690	*/
3691	if (!xmlStrEqual(name, ctxt->name)) {
3692	if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
3693	htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3694	"Opening and ending tag mismatch: %s and %s\n",
3695	name, ctxt->name);
3696	}
3697	}
3698
3699	/*
3700	* SAX: End of Tag
3701	*/
3702	oldname = ctxt->name;
3703	if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
3704	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
3705	ctxt->sax->endElement(ctxt->userData, name);
3706	htmlnamePop(ctxt);
3707	ret = 1;
3708	} else {
3709	ret = 0;
3710	}
3711
3712	return (ret);
3713	}
3714
3715
3716	/**
3717	* htmlParseReference:
3718	* @ctxt: an HTML parser context
3719	*
3720	* parse and handle entity references in content,
3721	* this will end-up in a call to character() since this is either a
3722	* CharRef, or a predefined entity.
3723	*/
3724	static void
3725	htmlParseReference(htmlParserCtxtPtr ctxt) {
3726	const htmlEntityDesc * ent;
3727	xmlChar out[6];
3728	const xmlChar *name;
3729	if (CUR != '&') return;
3730
3731	if (NXT(1) == '#') {
3732	unsigned int c;
3733	int bits, i = 0;
3734
3735	c = htmlParseCharRef(ctxt);
3736	if (c == 0)
3737	return;
3738
3739	if (c < 0x80) { out[i++]= c; bits= -6; }
3740	else if (c < 0x800) { out[i++]=((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
3741	else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
3742	else { out[i++]=((c >> 18) & 0x07) \| 0xF0; bits= 12; }
3743
3744	for ( ; bits >= 0; bits-= 6) {
3745	out[i++]= ((c >> bits) & 0x3F) \| 0x80;
3746	}
3747	out[i] = 0;
3748
3749	htmlCheckParagraph(ctxt);
3750	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3751	ctxt->sax->characters(ctxt->userData, out, i);
3752	} else {
3753	ent = htmlParseEntityRef(ctxt, &name);
3754	if (name == NULL) {
3755	htmlCheckParagraph(ctxt);
3756	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3757	ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3758	return;
3759	}
3760	if ((ent == NULL) \|\| !(ent->value > 0)) {
3761	htmlCheckParagraph(ctxt);
3762	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
3763	ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
3764	ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
3765	/* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
3766	}
3767	} else {
3768	unsigned int c;
3769	int bits, i = 0;
3770
3771	c = ent->value;
3772	if (c < 0x80)
3773	{ out[i++]= c; bits= -6; }
3774	else if (c < 0x800)
3775	{ out[i++]=((c >> 6) & 0x1F) \| 0xC0; bits= 0; }
3776	else if (c < 0x10000)
3777	{ out[i++]=((c >> 12) & 0x0F) \| 0xE0; bits= 6; }
3778	else
3779	{ out[i++]=((c >> 18) & 0x07) \| 0xF0; bits= 12; }
3780
3781	for ( ; bits >= 0; bits-= 6) {
3782	out[i++]= ((c >> bits) & 0x3F) \| 0x80;
3783	}
3784	out[i] = 0;
3785
3786	htmlCheckParagraph(ctxt);
3787	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
3788	ctxt->sax->characters(ctxt->userData, out, i);
3789	}
3790	}
3791	}
3792
3793	/**
3794	* htmlParseContent:
3795	* @ctxt: an HTML parser context
3796	*
3797	* Parse a content: comment, sub-element, reference or text.
3798	*/
3799
3800	static void
3801	htmlParseContent(htmlParserCtxtPtr ctxt) {
3802	xmlChar *currentNode;
3803	int depth;
3804	const xmlChar *name;
3805
3806	currentNode = xmlStrdup(ctxt->name);
3807	depth = ctxt->nameNr;
3808	while (1) {
3809	long cons = ctxt->nbChars;
3810
3811	GROW;
3812	/*
3813	* Our tag or one of it's parent or children is ending.
3814	*/
3815	if ((CUR == '<') && (NXT(1) == '/')) {
3816	if (htmlParseEndTag(ctxt) &&
3817	((currentNode != NULL) \|\| (ctxt->nameNr == 0))) {
3818	if (currentNode != NULL)
3819	xmlFree(currentNode);
3820	return;
3821	}
3822	continue; /* while */
3823	}
3824
3825	else if ((CUR == '<') &&
3826	((IS_ASCII_LETTER(NXT(1))) \|\|
3827	(NXT(1) == '_') \|\| (NXT(1) == ':'))) {
3828	name = htmlParseHTMLName_nonInvasive(ctxt);
3829	if (name == NULL) {
3830	htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3831	"htmlParseStartTag: invalid element name\n",
3832	NULL, NULL);
3833	/* Dump the bogus tag like browsers do */
3834	while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
3835	NEXT;
3836
3837	if (currentNode != NULL)
3838	xmlFree(currentNode);
3839	return;
3840	}
3841
3842	if (ctxt->name != NULL) {
3843	if (htmlCheckAutoClose(name, ctxt->name) == 1) {
3844	htmlAutoClose(ctxt, name);
3845	continue;
3846	}
3847	}
3848	}
3849
3850	/*
3851	* Has this node been popped out during parsing of
3852	* the next element
3853	*/
3854	if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
3855	(!xmlStrEqual(currentNode, ctxt->name)))
3856	{
3857	if (currentNode != NULL) xmlFree(currentNode);
3858	return;
3859	}
3860
3861	if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) \|\|
3862	(xmlStrEqual(currentNode, BAD_CAST"style")))) {
3863	/*
3864	* Handle SCRIPT/STYLE separately
3865	*/
3866	htmlParseScript(ctxt);
3867	} else {
3868	/*
3869	* Sometimes DOCTYPE arrives in the middle of the document
3870	*/
3871	if ((CUR == '<') && (NXT(1) == '!') &&
3872	(UPP(2) == 'D') && (UPP(3) == 'O') &&
3873	(UPP(4) == 'C') && (UPP(5) == 'T') &&
3874	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
3875	(UPP(8) == 'E')) {
3876	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3877	"Misplaced DOCTYPE declaration\n",
3878	BAD_CAST "DOCTYPE" , NULL);
3879	htmlParseDocTypeDecl(ctxt);
3880	}
3881
3882	/*
3883	* First case : a comment
3884	*/
3885	if ((CUR == '<') && (NXT(1) == '!') &&
3886	(NXT(2) == '-') && (NXT(3) == '-')) {
3887	htmlParseComment(ctxt);
3888	}
3889
3890	/*
3891	* Second case : a Processing Instruction.
3892	*/
3893	else if ((CUR == '<') && (NXT(1) == '?')) {
3894	htmlParsePI(ctxt);
3895	}
3896
3897	/*
3898	* Third case : a sub-element.
3899	*/
3900	else if (CUR == '<') {
3901	htmlParseElement(ctxt);
3902	}
3903
3904	/*
3905	* Fourth case : a reference. If if has not been resolved,
3906	* parsing returns it's Name, create the node
3907	*/
3908	else if (CUR == '&') {
3909	htmlParseReference(ctxt);
3910	}
3911
3912	/*
3913	* Fifth case : end of the resource
3914	*/
3915	else if (CUR == 0) {
3916	htmlAutoCloseOnEnd(ctxt);
3917	break;
3918	}
3919
3920	/*
3921	* Last case, text. Note that References are handled directly.
3922	*/
3923	else {
3924	htmlParseCharData(ctxt);
3925	}
3926
3927	if (cons == ctxt->nbChars) {
3928	if (ctxt->node != NULL) {
3929	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3930	"detected an error in element content\n",
3931	NULL, NULL);
3932	}
3933	break;
3934	}
3935	}
3936	GROW;
3937	}
3938	if (currentNode != NULL) xmlFree(currentNode);
3939	}
3940
3941	/**
3942	* htmlParseContent:
3943	* @ctxt: an HTML parser context
3944	*
3945	* Parse a content: comment, sub-element, reference or text.
3946	*/
3947
3948	void
3949	__htmlParseContent(void *ctxt) {
3950	if (ctxt != NULL)
3951	htmlParseContent((htmlParserCtxtPtr) ctxt);
3952	}
3953
3954	/**
3955	* htmlParseElement:
3956	* @ctxt: an HTML parser context
3957	*
3958	* parse an HTML element, this is highly recursive
3959	*
3960	* [39] element ::= EmptyElemTag \| STag content ETag
3961	*
3962	* [41] Attribute ::= Name Eq AttValue
3963	*/
3964
3965	void
3966	htmlParseElement(htmlParserCtxtPtr ctxt) {
3967	const xmlChar *name;
3968	xmlChar *currentNode = NULL;
3969	const htmlElemDesc * info;
3970	htmlParserNodeInfo node_info;
3971	int failed;
3972	int depth;
3973	const xmlChar *oldptr;
3974
3975	if ((ctxt == NULL) \|\| (ctxt->input == NULL)) {
3976	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3977	"htmlParseElement: context error\n", NULL, NULL);
3978	return;
3979	}
3980	/* Capture start position */
3981	if (ctxt->record_info) {
3982	node_info.begin_pos = ctxt->input->consumed +
3983	(CUR_PTR - ctxt->input->base);
3984	node_info.begin_line = ctxt->input->line;
3985	}
3986
3987	failed = htmlParseStartTag(ctxt);
3988	name = ctxt->name;
3989	if (failed \|\| (name == NULL)) {
3990	if (CUR == '>')
3991	NEXT;
3992	return;
3993	}
3994
3995	/*
3996	* Lookup the info for that element.
3997	*/
3998	info = htmlTagLookup(name);
3999	if (info == NULL) {
4000	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4001	"Tag %s invalid\n", name, NULL);
4002	}
4003
4004	/*
4005	* Check for an Empty Element labeled the XML/SGML way
4006	*/
4007	if ((CUR == '/') && (NXT(1) == '>')) {
4008	SKIP(2);
4009	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4010	ctxt->sax->endElement(ctxt->userData, name);
4011	htmlnamePop(ctxt);
4012	return;
4013	}
4014
4015	if (CUR == '>') {
4016	NEXT;
4017	} else {
4018	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4019	"Couldn't find end of Start Tag %s\n", name, NULL);
4020
4021	/*
4022	* end of parsing of this node.
4023	*/
4024	if (xmlStrEqual(name, ctxt->name)) {
4025	nodePop(ctxt);
4026	htmlnamePop(ctxt);
4027	}
4028
4029	/*
4030	* Capture end position and add node
4031	*/
4032	if (ctxt->record_info) {
4033	node_info.end_pos = ctxt->input->consumed +
4034	(CUR_PTR - ctxt->input->base);
4035	node_info.end_line = ctxt->input->line;
4036	node_info.node = ctxt->node;
4037	xmlParserAddNodeInfo(ctxt, &node_info);
4038	}
4039	return;
4040	}
4041
4042	/*
4043	* Check for an Empty Element from DTD definition
4044	*/
4045	if ((info != NULL) && (info->empty)) {
4046	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4047	ctxt->sax->endElement(ctxt->userData, name);
4048	htmlnamePop(ctxt);
4049	return;
4050	}
4051
4052	/*
4053	* Parse the content of the element:
4054	*/
4055	currentNode = xmlStrdup(ctxt->name);
4056	depth = ctxt->nameNr;
4057	while (IS_CHAR_CH(CUR)) {
4058	oldptr = ctxt->input->cur;
4059	htmlParseContent(ctxt);
4060	if (oldptr==ctxt->input->cur) break;
4061	if (ctxt->nameNr < depth) break;
4062	}
4063
4064	/*
4065	* Capture end position and add node
4066	*/
4067	if ( currentNode != NULL && ctxt->record_info ) {
4068	node_info.end_pos = ctxt->input->consumed +
4069	(CUR_PTR - ctxt->input->base);
4070	node_info.end_line = ctxt->input->line;
4071	node_info.node = ctxt->node;
4072	xmlParserAddNodeInfo(ctxt, &node_info);
4073	}
4074	if (!IS_CHAR_CH(CUR)) {
4075	htmlAutoCloseOnEnd(ctxt);
4076	}
4077
4078	if (currentNode != NULL)
4079	xmlFree(currentNode);
4080	}
4081
4082	/**
4083	* htmlParseDocument:
4084	* @ctxt: an HTML parser context
4085	*
4086	* parse an HTML document (and build a tree if using the standard SAX
4087	* interface).
4088	*
4089	* Returns 0, -1 in case of error. the parser context is augmented
4090	* as a result of the parsing.
4091	*/
4092
4093	int
4094	htmlParseDocument(htmlParserCtxtPtr ctxt) {
4095	xmlDtdPtr dtd;
4096
4097	xmlInitParser();
4098
4099	htmlDefaultSAXHandlerInit();
4100
4101	if ((ctxt == NULL) \|\| (ctxt->input == NULL)) {
4102	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4103	"htmlParseDocument: context error\n", NULL, NULL);
4104	return(XML_ERR_INTERNAL_ERROR);
4105	}
4106	ctxt->html = 1;
4107	GROW;
4108	/*
4109	* SAX: beginning of the document processing.
4110	*/
4111	if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4112	ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4113
4114	/*
4115	* Wipe out everything which is before the first '<'
4116	*/
4117	SKIP_BLANKS;
4118	if (CUR == 0) {
4119	htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4120	"Document is empty\n", NULL, NULL);
4121	}
4122
4123	if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4124	ctxt->sax->startDocument(ctxt->userData);
4125
4126
4127	/*
4128	* Parse possible comments and PIs before any content
4129	*/
4130	while (((CUR == '<') && (NXT(1) == '!') &&
4131	(NXT(2) == '-') && (NXT(3) == '-')) \|\|
4132	((CUR == '<') && (NXT(1) == '?'))) {
4133	htmlParseComment(ctxt);
4134	htmlParsePI(ctxt);
4135	SKIP_BLANKS;
4136	}
4137
4138
4139	/*
4140	* Then possibly doc type declaration(s) and more Misc
4141	* (doctypedecl Misc*)?
4142	*/
4143	if ((CUR == '<') && (NXT(1) == '!') &&
4144	(UPP(2) == 'D') && (UPP(3) == 'O') &&
4145	(UPP(4) == 'C') && (UPP(5) == 'T') &&
4146	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
4147	(UPP(8) == 'E')) {
4148	htmlParseDocTypeDecl(ctxt);
4149	}
4150	SKIP_BLANKS;
4151
4152	/*
4153	* Parse possible comments and PIs before any content
4154	*/
4155	while (((CUR == '<') && (NXT(1) == '!') &&
4156	(NXT(2) == '-') && (NXT(3) == '-')) \|\|
4157	((CUR == '<') && (NXT(1) == '?'))) {
4158	htmlParseComment(ctxt);
4159	htmlParsePI(ctxt);
4160	SKIP_BLANKS;
4161	}
4162
4163	/*
4164	* Time to start parsing the tree itself
4165	*/
4166	htmlParseContent(ctxt);
4167
4168	/*
4169	* autoclose
4170	*/
4171	if (CUR == 0)
4172	htmlAutoCloseOnEnd(ctxt);
4173
4174
4175	/*
4176	* SAX: end of the document processing.
4177	*/
4178	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4179	ctxt->sax->endDocument(ctxt->userData);
4180
4181	if (ctxt->myDoc != NULL) {
4182	dtd = xmlGetIntSubset(ctxt->myDoc);
4183	if (dtd == NULL)
4184	ctxt->myDoc->intSubset =
4185	xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
4186	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4187	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4188	}
4189	if (! ctxt->wellFormed) return(-1);
4190	return(0);
4191	}
4192
4193
4194	/************************************************************************
4195	* *
4196	* Parser contexts handling *
4197	* *
4198	************************************************************************/
4199
4200	/**
4201	* htmlInitParserCtxt:
4202	* @ctxt: an HTML parser context
4203	*
4204	* Initialize a parser context
4205	*
4206	* Returns 0 in case of success and -1 in case of error
4207	*/
4208
4209	static int
4210	htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4211	{
4212	htmlSAXHandler *sax;
4213
4214	if (ctxt == NULL) return(-1);
4215	memset(ctxt, 0, sizeof(htmlParserCtxt));
4216
4217	ctxt->dict = xmlDictCreate();
4218	if (ctxt->dict == NULL) {
4219	htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4220	return(-1);
4221	}
4222	sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4223	if (sax == NULL) {
4224	htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4225	return(-1);
4226	}
4227	else
4228	memset(sax, 0, sizeof(htmlSAXHandler));
4229
4230	/* Allocate the Input stack */
4231	ctxt->inputTab = (htmlParserInputPtr *)
4232	xmlMalloc(5 * sizeof(htmlParserInputPtr));
4233	if (ctxt->inputTab == NULL) {
4234	htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4235	ctxt->inputNr = 0;
4236	ctxt->inputMax = 0;
4237	ctxt->input = NULL;
4238	return(-1);
4239	}
4240	ctxt->inputNr = 0;
4241	ctxt->inputMax = 5;
4242	ctxt->input = NULL;
4243	ctxt->version = NULL;
4244	ctxt->encoding = NULL;
4245	ctxt->standalone = -1;
4246	ctxt->instate = XML_PARSER_START;
4247
4248	/* Allocate the Node stack */
4249	ctxt->nodeTab = (htmlNodePtr ) xmlMalloc(10 sizeof(htmlNodePtr));
4250	if (ctxt->nodeTab == NULL) {
4251	htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4252	ctxt->nodeNr = 0;
4253	ctxt->nodeMax = 0;
4254	ctxt->node = NULL;
4255	ctxt->inputNr = 0;
4256	ctxt->inputMax = 0;
4257	ctxt->input = NULL;
4258	return(-1);
4259	}
4260	ctxt->nodeNr = 0;
4261	ctxt->nodeMax = 10;
4262	ctxt->node = NULL;
4263
4264	/* Allocate the Name stack */
4265	ctxt->nameTab = (const xmlChar *) xmlMalloc(10 sizeof(xmlChar *));
4266	if (ctxt->nameTab == NULL) {
4267	htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4268	ctxt->nameNr = 0;
4269	ctxt->nameMax = 10;
4270	ctxt->name = NULL;
4271	ctxt->nodeNr = 0;
4272	ctxt->nodeMax = 0;
4273	ctxt->node = NULL;
4274	ctxt->inputNr = 0;
4275	ctxt->inputMax = 0;
4276	ctxt->input = NULL;
4277	return(-1);
4278	}
4279	ctxt->nameNr = 0;
4280	ctxt->nameMax = 10;
4281	ctxt->name = NULL;
4282
4283	if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
4284	else {
4285	ctxt->sax = sax;
4286	memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
4287	}
4288	ctxt->userData = ctxt;
4289	ctxt->myDoc = NULL;
4290	ctxt->wellFormed = 1;
4291	ctxt->replaceEntities = 0;
4292	ctxt->linenumbers = xmlLineNumbersDefaultValue;
4293	ctxt->html = 1;
4294	ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
4295	ctxt->vctxt.userData = ctxt;
4296	ctxt->vctxt.error = xmlParserValidityError;
4297	ctxt->vctxt.warning = xmlParserValidityWarning;
4298	ctxt->record_info = 0;
4299	ctxt->validate = 0;
4300	ctxt->nbChars = 0;
4301	ctxt->checkIndex = 0;
4302	ctxt->catalogs = NULL;
4303	xmlInitNodeInfoSeq(&ctxt->node_seq);
4304	return(0);
4305	}
4306
4307	/**
4308	* htmlFreeParserCtxt:
4309	* @ctxt: an HTML parser context
4310	*
4311	* Free all the memory used by a parser context. However the parsed
4312	* document in ctxt->myDoc is not freed.
4313	*/
4314
4315	void
4316	htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4317	{
4318	xmlFreeParserCtxt(ctxt);
4319	}
4320
4321	/**
4322	* htmlNewParserCtxt:
4323	*
4324	* Allocate and initialize a new parser context.
4325	*
4326	* Returns the htmlParserCtxtPtr or NULL in case of allocation error
4327	*/
4328
4329	htmlParserCtxtPtr
4330	htmlNewParserCtxt(void)
4331	{
4332	xmlParserCtxtPtr ctxt;
4333
4334	ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4335	if (ctxt == NULL) {
4336	htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
4337	return(NULL);
4338	}
4339	memset(ctxt, 0, sizeof(xmlParserCtxt));
4340	if (htmlInitParserCtxt(ctxt) < 0) {
4341	htmlFreeParserCtxt(ctxt);
4342	return(NULL);
4343	}
4344	return(ctxt);
4345	}
4346
4347	/**
4348	* htmlCreateMemoryParserCtxt:
4349	* @buffer: a pointer to a char array
4350	* @size: the size of the array
4351	*
4352	* Create a parser context for an HTML in-memory document.
4353	*
4354	* Returns the new parser context or NULL
4355	*/
4356	htmlParserCtxtPtr
4357	htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4358	xmlParserCtxtPtr ctxt;
4359	xmlParserInputPtr input;
4360	xmlParserInputBufferPtr buf;
4361
4362	if (buffer == NULL)
4363	return(NULL);
4364	if (size <= 0)
4365	return(NULL);
4366
4367	ctxt = htmlNewParserCtxt();
4368	if (ctxt == NULL)
4369	return(NULL);
4370
4371	buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
4372	if (buf == NULL) return(NULL);
4373
4374	input = xmlNewInputStream(ctxt);
4375	if (input == NULL) {
4376	xmlFreeParserCtxt(ctxt);
4377	return(NULL);
4378	}
4379
4380	input->filename = NULL;
4381	input->buf = buf;
4382	input->base = input->buf->buffer->content;
4383	input->cur = input->buf->buffer->content;
4384	input->end = &input->buf->buffer->content[input->buf->buffer->use];
4385
4386	inputPush(ctxt, input);
4387	return(ctxt);
4388	}
4389
4390	/**
4391	* htmlCreateDocParserCtxt:
4392	* @cur: a pointer to an array of xmlChar
4393	* @encoding: a free form C string describing the HTML document encoding, or NULL
4394	*
4395	* Create a parser context for an HTML document.
4396	*
4397	* TODO: check the need to add encoding handling there
4398	*
4399	* Returns the new parser context or NULL
4400	*/
4401	static htmlParserCtxtPtr
4402	htmlCreateDocParserCtxt(const xmlChar cur, const char encoding) {
4403	int len;
4404	htmlParserCtxtPtr ctxt;
4405
4406	if (cur == NULL)
4407	return(NULL);
4408	len = xmlStrlen(cur);
4409	ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
4410	if (ctxt == NULL)
4411	return(NULL);
4412
4413	if (encoding != NULL) {
4414	xmlCharEncoding enc;
4415	xmlCharEncodingHandlerPtr handler;
4416
4417	if (ctxt->input->encoding != NULL)
4418	xmlFree((xmlChar *) ctxt->input->encoding);
4419	ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
4420
4421	enc = xmlParseCharEncoding(encoding);
4422	/*
4423	* registered set of known encodings
4424	*/
4425	if (enc != XML_CHAR_ENCODING_ERROR) {
4426	xmlSwitchEncoding(ctxt, enc);
4427	if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
4428	htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4429	"Unsupported encoding %s\n",
4430	(const xmlChar *) encoding, NULL);
4431	}
4432	} else {
4433	/*
4434	* fallback for unknown encodings
4435	*/
4436	handler = xmlFindCharEncodingHandler((const char *) encoding);
4437	if (handler != NULL) {
4438	xmlSwitchToEncoding(ctxt, handler);
4439	} else {
4440	htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
4441	"Unsupported encoding %s\n",
4442	(const xmlChar *) encoding, NULL);
4443	}
4444	}
4445	}
4446	return(ctxt);
4447	}
4448
4449	#ifdef LIBXML_PUSH_ENABLED
4450	/************************************************************************
4451	* *
4452	* Progressive parsing interfaces *
4453	* *
4454	************************************************************************/
4455
4456	/**
4457	* htmlParseLookupSequence:
4458	* @ctxt: an HTML parser context
4459	* @first: the first char to lookup
4460	* @next: the next char to lookup or zero
4461	* @third: the next char to lookup or zero
4462	* @comment: flag to force checking inside comments
4463	*
4464	* Try to find if a sequence (first, next, third) or just (first next) or
4465	* (first) is available in the input stream.
4466	* This function has a side effect of (possibly) incrementing ctxt->checkIndex
4467	* to avoid rescanning sequences of bytes, it DOES change the state of the
4468	* parser, do not use liberally.
4469	* This is basically similar to xmlParseLookupSequence()
4470	*
4471	* Returns the index to the current parsing point if the full sequence
4472	* is available, -1 otherwise.
4473	*/
4474	static int
4475	htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
4476	xmlChar next, xmlChar third, int iscomment) {
4477	int base, len;
4478	htmlParserInputPtr in;
4479	const xmlChar *buf;
4480	int incomment = 0;
4481
4482	in = ctxt->input;
4483	if (in == NULL) return(-1);
4484	base = in->cur - in->base;
4485	if (base < 0) return(-1);
4486	if (ctxt->checkIndex > base)
4487	base = ctxt->checkIndex;
4488	if (in->buf == NULL) {
4489	buf = in->base;
4490	len = in->length;
4491	} else {
4492	buf = in->buf->buffer->content;
4493	len = in->buf->buffer->use;
4494	}
4495	/* take into account the sequence length */
4496	if (third) len -= 2;
4497	else if (next) len --;
4498	for (;base < len;base++) {
4499	if (!incomment && (base + 4 < len) && !iscomment) {
4500	if ((buf[base] == '<') && (buf[base + 1] == '!') &&
4501	(buf[base + 2] == '-') && (buf[base + 3] == '-')) {
4502	incomment = 1;
4503	/* do not increment past <! - some people use <!--> */
4504	base += 2;
4505	}
4506	}
4507	if (incomment) {
4508	if (base + 3 > len)
4509	return(-1);
4510	if ((buf[base] == '-') && (buf[base + 1] == '-') &&
4511	(buf[base + 2] == '>')) {
4512	incomment = 0;
4513	base += 2;
4514	}
4515	continue;
4516	}
4517	if (buf[base] == first) {
4518	if (third != 0) {
4519	if ((buf[base + 1] != next) \|\|
4520	(buf[base + 2] != third)) continue;
4521	} else if (next != 0) {
4522	if (buf[base + 1] != next) continue;
4523	}
4524	ctxt->checkIndex = 0;
4525	#ifdef DEBUG_PUSH
4526	if (next == 0)
4527	xmlGenericError(xmlGenericErrorContext,
4528	"HPP: lookup '%c' found at %d\n",
4529	first, base);
4530	else if (third == 0)
4531	xmlGenericError(xmlGenericErrorContext,
4532	"HPP: lookup '%c%c' found at %d\n",
4533	first, next, base);
4534	else
4535	xmlGenericError(xmlGenericErrorContext,
4536	"HPP: lookup '%c%c%c' found at %d\n",
4537	first, next, third, base);
4538	#endif
4539	return(base - (in->cur - in->base));
4540	}
4541	}
4542	ctxt->checkIndex = base;
4543	#ifdef DEBUG_PUSH
4544	if (next == 0)
4545	xmlGenericError(xmlGenericErrorContext,
4546	"HPP: lookup '%c' failed\n", first);
4547	else if (third == 0)
4548	xmlGenericError(xmlGenericErrorContext,
4549	"HPP: lookup '%c%c' failed\n", first, next);
4550	else
4551	xmlGenericError(xmlGenericErrorContext,
4552	"HPP: lookup '%c%c%c' failed\n", first, next, third);
4553	#endif
4554	return(-1);
4555	}
4556
4557	/**
4558	* htmlParseTryOrFinish:
4559	* @ctxt: an HTML parser context
4560	* @terminate: last chunk indicator
4561	*
4562	* Try to progress on parsing
4563	*
4564	* Returns zero if no parsing was possible
4565	*/
4566	static int
4567	htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
4568	int ret = 0;
4569	htmlParserInputPtr in;
4570	int avail = 0;
4571	xmlChar cur, next;
4572
4573	#ifdef DEBUG_PUSH
4574	switch (ctxt->instate) {
4575	case XML_PARSER_EOF:
4576	xmlGenericError(xmlGenericErrorContext,
4577	"HPP: try EOF\n"); break;
4578	case XML_PARSER_START:
4579	xmlGenericError(xmlGenericErrorContext,
4580	"HPP: try START\n"); break;
4581	case XML_PARSER_MISC:
4582	xmlGenericError(xmlGenericErrorContext,
4583	"HPP: try MISC\n");break;
4584	case XML_PARSER_COMMENT:
4585	xmlGenericError(xmlGenericErrorContext,
4586	"HPP: try COMMENT\n");break;
4587	case XML_PARSER_PROLOG:
4588	xmlGenericError(xmlGenericErrorContext,
4589	"HPP: try PROLOG\n");break;
4590	case XML_PARSER_START_TAG:
4591	xmlGenericError(xmlGenericErrorContext,
4592	"HPP: try START_TAG\n");break;
4593	case XML_PARSER_CONTENT:
4594	xmlGenericError(xmlGenericErrorContext,
4595	"HPP: try CONTENT\n");break;
4596	case XML_PARSER_CDATA_SECTION:
4597	xmlGenericError(xmlGenericErrorContext,
4598	"HPP: try CDATA_SECTION\n");break;
4599	case XML_PARSER_END_TAG:
4600	xmlGenericError(xmlGenericErrorContext,
4601	"HPP: try END_TAG\n");break;
4602	case XML_PARSER_ENTITY_DECL:
4603	xmlGenericError(xmlGenericErrorContext,
4604	"HPP: try ENTITY_DECL\n");break;
4605	case XML_PARSER_ENTITY_VALUE:
4606	xmlGenericError(xmlGenericErrorContext,
4607	"HPP: try ENTITY_VALUE\n");break;
4608	case XML_PARSER_ATTRIBUTE_VALUE:
4609	xmlGenericError(xmlGenericErrorContext,
4610	"HPP: try ATTRIBUTE_VALUE\n");break;
4611	case XML_PARSER_DTD:
4612	xmlGenericError(xmlGenericErrorContext,
4613	"HPP: try DTD\n");break;
4614	case XML_PARSER_EPILOG:
4615	xmlGenericError(xmlGenericErrorContext,
4616	"HPP: try EPILOG\n");break;
4617	case XML_PARSER_PI:
4618	xmlGenericError(xmlGenericErrorContext,
4619	"HPP: try PI\n");break;
4620	case XML_PARSER_SYSTEM_LITERAL:
4621	xmlGenericError(xmlGenericErrorContext,
4622	"HPP: try SYSTEM_LITERAL\n");break;
4623	}
4624	#endif
4625
4626	while (1) {
4627
4628	in = ctxt->input;
4629	if (in == NULL) break;
4630	if (in->buf == NULL)
4631	avail = in->length - (in->cur - in->base);
4632	else
4633	avail = in->buf->buffer->use - (in->cur - in->base);
4634	if ((avail == 0) && (terminate)) {
4635	htmlAutoCloseOnEnd(ctxt);
4636	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
4637	/*
4638	* SAX: end of the document processing.
4639	*/
4640	ctxt->instate = XML_PARSER_EOF;
4641	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4642	ctxt->sax->endDocument(ctxt->userData);
4643	}
4644	}
4645	if (avail < 1)
4646	goto done;
4647	cur = in->cur[0];
4648	if (cur == 0) {
4649	SKIP(1);
4650	continue;
4651	}
4652
4653	switch (ctxt->instate) {
4654	case XML_PARSER_EOF:
4655	/*
4656	* Document parsing is done !
4657	*/
4658	goto done;
4659	case XML_PARSER_START:
4660	/*
4661	* Very first chars read from the document flow.
4662	*/
4663	cur = in->cur[0];
4664	if (IS_BLANK_CH(cur)) {
4665	SKIP_BLANKS;
4666	if (in->buf == NULL)
4667	avail = in->length - (in->cur - in->base);
4668	else
4669	avail = in->buf->buffer->use - (in->cur - in->base);
4670	}
4671	if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4672	ctxt->sax->setDocumentLocator(ctxt->userData,
4673	&xmlDefaultSAXLocator);
4674	if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4675	(!ctxt->disableSAX))
4676	ctxt->sax->startDocument(ctxt->userData);
4677
4678	cur = in->cur[0];
4679	next = in->cur[1];
4680	if ((cur == '<') && (next == '!') &&
4681	(UPP(2) == 'D') && (UPP(3) == 'O') &&
4682	(UPP(4) == 'C') && (UPP(5) == 'T') &&
4683	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
4684	(UPP(8) == 'E')) {
4685	if ((!terminate) &&
4686	(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4687	goto done;
4688	#ifdef DEBUG_PUSH
4689	xmlGenericError(xmlGenericErrorContext,
4690	"HPP: Parsing internal subset\n");
4691	#endif
4692	htmlParseDocTypeDecl(ctxt);
4693	ctxt->instate = XML_PARSER_PROLOG;
4694	#ifdef DEBUG_PUSH
4695	xmlGenericError(xmlGenericErrorContext,
4696	"HPP: entering PROLOG\n");
4697	#endif
4698	} else {
4699	ctxt->instate = XML_PARSER_MISC;
4700	#ifdef DEBUG_PUSH
4701	xmlGenericError(xmlGenericErrorContext,
4702	"HPP: entering MISC\n");
4703	#endif
4704	}
4705	break;
4706	case XML_PARSER_MISC:
4707	SKIP_BLANKS;
4708	if (in->buf == NULL)
4709	avail = in->length - (in->cur - in->base);
4710	else
4711	avail = in->buf->buffer->use - (in->cur - in->base);
4712	if (avail < 2)
4713	goto done;
4714	cur = in->cur[0];
4715	next = in->cur[1];
4716	if ((cur == '<') && (next == '!') &&
4717	(in->cur[2] == '-') && (in->cur[3] == '-')) {
4718	if ((!terminate) &&
4719	(htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
4720	goto done;
4721	#ifdef DEBUG_PUSH
4722	xmlGenericError(xmlGenericErrorContext,
4723	"HPP: Parsing Comment\n");
4724	#endif
4725	htmlParseComment(ctxt);
4726	ctxt->instate = XML_PARSER_MISC;
4727	} else if ((cur == '<') && (next == '?')) {
4728	if ((!terminate) &&
4729	(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4730	goto done;
4731	#ifdef DEBUG_PUSH
4732	xmlGenericError(xmlGenericErrorContext,
4733	"HPP: Parsing PI\n");
4734	#endif
4735	htmlParsePI(ctxt);
4736	ctxt->instate = XML_PARSER_MISC;
4737	} else if ((cur == '<') && (next == '!') &&
4738	(UPP(2) == 'D') && (UPP(3) == 'O') &&
4739	(UPP(4) == 'C') && (UPP(5) == 'T') &&
4740	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
4741	(UPP(8) == 'E')) {
4742	if ((!terminate) &&
4743	(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4744	goto done;
4745	#ifdef DEBUG_PUSH
4746	xmlGenericError(xmlGenericErrorContext,
4747	"HPP: Parsing internal subset\n");
4748	#endif
4749	htmlParseDocTypeDecl(ctxt);
4750	ctxt->instate = XML_PARSER_PROLOG;
4751	#ifdef DEBUG_PUSH
4752	xmlGenericError(xmlGenericErrorContext,
4753	"HPP: entering PROLOG\n");
4754	#endif
4755	} else if ((cur == '<') && (next == '!') &&
4756	(avail < 9)) {
4757	goto done;
4758	} else {
4759	ctxt->instate = XML_PARSER_START_TAG;
4760	#ifdef DEBUG_PUSH
4761	xmlGenericError(xmlGenericErrorContext,
4762	"HPP: entering START_TAG\n");
4763	#endif
4764	}
4765	break;
4766	case XML_PARSER_PROLOG:
4767	SKIP_BLANKS;
4768	if (in->buf == NULL)
4769	avail = in->length - (in->cur - in->base);
4770	else
4771	avail = in->buf->buffer->use - (in->cur - in->base);
4772	if (avail < 2)
4773	goto done;
4774	cur = in->cur[0];
4775	next = in->cur[1];
4776	if ((cur == '<') && (next == '!') &&
4777	(in->cur[2] == '-') && (in->cur[3] == '-')) {
4778	if ((!terminate) &&
4779	(htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
4780	goto done;
4781	#ifdef DEBUG_PUSH
4782	xmlGenericError(xmlGenericErrorContext,
4783	"HPP: Parsing Comment\n");
4784	#endif
4785	htmlParseComment(ctxt);
4786	ctxt->instate = XML_PARSER_PROLOG;
4787	} else if ((cur == '<') && (next == '?')) {
4788	if ((!terminate) &&
4789	(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4790	goto done;
4791	#ifdef DEBUG_PUSH
4792	xmlGenericError(xmlGenericErrorContext,
4793	"HPP: Parsing PI\n");
4794	#endif
4795	htmlParsePI(ctxt);
4796	ctxt->instate = XML_PARSER_PROLOG;
4797	} else if ((cur == '<') && (next == '!') &&
4798	(avail < 4)) {
4799	goto done;
4800	} else {
4801	ctxt->instate = XML_PARSER_START_TAG;
4802	#ifdef DEBUG_PUSH
4803	xmlGenericError(xmlGenericErrorContext,
4804	"HPP: entering START_TAG\n");
4805	#endif
4806	}
4807	break;
4808	case XML_PARSER_EPILOG:
4809	if (in->buf == NULL)
4810	avail = in->length - (in->cur - in->base);
4811	else
4812	avail = in->buf->buffer->use - (in->cur - in->base);
4813	if (avail < 1)
4814	goto done;
4815	cur = in->cur[0];
4816	if (IS_BLANK_CH(cur)) {
4817	htmlParseCharData(ctxt);
4818	goto done;
4819	}
4820	if (avail < 2)
4821	goto done;
4822	next = in->cur[1];
4823	if ((cur == '<') && (next == '!') &&
4824	(in->cur[2] == '-') && (in->cur[3] == '-')) {
4825	if ((!terminate) &&
4826	(htmlParseLookupSequence(ctxt, '-', '-', '>', 1) < 0))
4827	goto done;
4828	#ifdef DEBUG_PUSH
4829	xmlGenericError(xmlGenericErrorContext,
4830	"HPP: Parsing Comment\n");
4831	#endif
4832	htmlParseComment(ctxt);
4833	ctxt->instate = XML_PARSER_EPILOG;
4834	} else if ((cur == '<') && (next == '?')) {
4835	if ((!terminate) &&
4836	(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4837	goto done;
4838	#ifdef DEBUG_PUSH
4839	xmlGenericError(xmlGenericErrorContext,
4840	"HPP: Parsing PI\n");
4841	#endif
4842	htmlParsePI(ctxt);
4843	ctxt->instate = XML_PARSER_EPILOG;
4844	} else if ((cur == '<') && (next == '!') &&
4845	(avail < 4)) {
4846	goto done;
4847	} else {
4848	ctxt->errNo = XML_ERR_DOCUMENT_END;
4849	ctxt->wellFormed = 0;
4850	ctxt->instate = XML_PARSER_EOF;
4851	#ifdef DEBUG_PUSH
4852	xmlGenericError(xmlGenericErrorContext,
4853	"HPP: entering EOF\n");
4854	#endif
4855	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4856	ctxt->sax->endDocument(ctxt->userData);
4857	goto done;
4858	}
4859	break;
4860	case XML_PARSER_START_TAG: {
4861	const xmlChar *name;
4862	int failed;
4863	const htmlElemDesc * info;
4864
4865	if (avail < 2)
4866	goto done;
4867	cur = in->cur[0];
4868	if (cur != '<') {
4869	ctxt->instate = XML_PARSER_CONTENT;
4870	#ifdef DEBUG_PUSH
4871	xmlGenericError(xmlGenericErrorContext,
4872	"HPP: entering CONTENT\n");
4873	#endif
4874	break;
4875	}
4876	if (in->cur[1] == '/') {
4877	ctxt->instate = XML_PARSER_END_TAG;
4878	ctxt->checkIndex = 0;
4879	#ifdef DEBUG_PUSH
4880	xmlGenericError(xmlGenericErrorContext,
4881	"HPP: entering END_TAG\n");
4882	#endif
4883	break;
4884	}
4885	if ((!terminate) &&
4886	(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
4887	goto done;
4888
4889	failed = htmlParseStartTag(ctxt);
4890	name = ctxt->name;
4891	if (failed \|\|
4892	(name == NULL)) {
4893	if (CUR == '>')
4894	NEXT;
4895	break;
4896	}
4897
4898	/*
4899	* Lookup the info for that element.
4900	*/
4901	info = htmlTagLookup(name);
4902	if (info == NULL) {
4903	htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4904	"Tag %s invalid\n", name, NULL);
4905	}
4906
4907	/*
4908	* Check for an Empty Element labeled the XML/SGML way
4909	*/
4910	if ((CUR == '/') && (NXT(1) == '>')) {
4911	SKIP(2);
4912	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4913	ctxt->sax->endElement(ctxt->userData, name);
4914	htmlnamePop(ctxt);
4915	ctxt->instate = XML_PARSER_CONTENT;
4916	#ifdef DEBUG_PUSH
4917	xmlGenericError(xmlGenericErrorContext,
4918	"HPP: entering CONTENT\n");
4919	#endif
4920	break;
4921	}
4922
4923	if (CUR == '>') {
4924	NEXT;
4925	} else {
4926	htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4927	"Couldn't find end of Start Tag %s\n",
4928	name, NULL);
4929
4930	/*
4931	* end of parsing of this node.
4932	*/
4933	if (xmlStrEqual(name, ctxt->name)) {
4934	nodePop(ctxt);
4935	htmlnamePop(ctxt);
4936	}
4937
4938	ctxt->instate = XML_PARSER_CONTENT;
4939	#ifdef DEBUG_PUSH
4940	xmlGenericError(xmlGenericErrorContext,
4941	"HPP: entering CONTENT\n");
4942	#endif
4943	break;
4944	}
4945
4946	/*
4947	* Check for an Empty Element from DTD definition
4948	*/
4949	if ((info != NULL) && (info->empty)) {
4950	if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4951	ctxt->sax->endElement(ctxt->userData, name);
4952	htmlnamePop(ctxt);
4953	}
4954	ctxt->instate = XML_PARSER_CONTENT;
4955	#ifdef DEBUG_PUSH
4956	xmlGenericError(xmlGenericErrorContext,
4957	"HPP: entering CONTENT\n");
4958	#endif
4959	break;
4960	}
4961	case XML_PARSER_CONTENT: {
4962	long cons;
4963	/*
4964	* Handle preparsed entities and charRef
4965	*/
4966	if (ctxt->token != 0) {
4967	xmlChar chr[2] = { 0 , 0 } ;
4968
4969	chr[0] = (xmlChar) ctxt->token;
4970	htmlCheckParagraph(ctxt);
4971	if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4972	ctxt->sax->characters(ctxt->userData, chr, 1);
4973	ctxt->token = 0;
4974	ctxt->checkIndex = 0;
4975	}
4976	if ((avail == 1) && (terminate)) {
4977	cur = in->cur[0];
4978	if ((cur != '<') && (cur != '&')) {
4979	if (ctxt->sax != NULL) {
4980	if (IS_BLANK_CH(cur)) {
4981	if (ctxt->sax->ignorableWhitespace != NULL)
4982	ctxt->sax->ignorableWhitespace(
4983	ctxt->userData, &cur, 1);
4984	} else {
4985	htmlCheckParagraph(ctxt);
4986	if (ctxt->sax->characters != NULL)
4987	ctxt->sax->characters(
4988	ctxt->userData, &cur, 1);
4989	}
4990	}
4991	ctxt->token = 0;
4992	ctxt->checkIndex = 0;
4993	in->cur++;
4994	break;
4995	}
4996	}
4997	if (avail < 2)
4998	goto done;
4999	cur = in->cur[0];
5000	next = in->cur[1];
5001	cons = ctxt->nbChars;
5002	if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) \|\|
5003	(xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5004	/*
5005	* Handle SCRIPT/STYLE separately
5006	*/
5007	if (!terminate) {
5008	int idx;
5009	xmlChar val;
5010
5011	idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
5012	if (idx < 0)
5013	goto done;
5014	val = in->cur[idx + 2];
5015	if (val == 0) /* bad cut of input */
5016	goto done;
5017	}
5018	htmlParseScript(ctxt);
5019	if ((cur == '<') && (next == '/')) {
5020	ctxt->instate = XML_PARSER_END_TAG;
5021	ctxt->checkIndex = 0;
5022	#ifdef DEBUG_PUSH
5023	xmlGenericError(xmlGenericErrorContext,
5024	"HPP: entering END_TAG\n");
5025	#endif
5026	break;
5027	}
5028	} else {
5029	/*
5030	* Sometimes DOCTYPE arrives in the middle of the document
5031	*/
5032	if ((cur == '<') && (next == '!') &&
5033	(UPP(2) == 'D') && (UPP(3) == 'O') &&
5034	(UPP(4) == 'C') && (UPP(5) == 'T') &&
5035	(UPP(6) == 'Y') && (UPP(7) == 'P') &&
5036	(UPP(8) == 'E')) {
5037	if ((!terminate) &&
5038	(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5039	goto done;
5040	htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5041	"Misplaced DOCTYPE declaration\n",
5042	BAD_CAST "DOCTYPE" , NULL);
5043	htmlParseDocTypeDecl(ctxt);
5044	} else if ((cur == '<') && (next == '!') &&
5045	(in->cur[2] == '-') && (in->cur[3] == '-')) {
5046	if ((!terminate) &&
5047	(htmlParseLookupSequence(
5048	ctxt, '-', '-', '>', 1) < 0))
5049	goto done;
5050	#ifdef DEBUG_PUSH
5051	xmlGenericError(xmlGenericErrorContext,
5052	"HPP: Parsing Comment\n");
5053	#endif
5054	htmlParseComment(ctxt);
5055	ctxt->instate = XML_PARSER_CONTENT;
5056	} else if ((cur == '<') && (next == '?')) {
5057	if ((!terminate) &&
5058	(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5059	goto done;
5060	#ifdef DEBUG_PUSH
5061	xmlGenericError(xmlGenericErrorContext,
5062	"HPP: Parsing PI\n");
5063	#endif
5064	htmlParsePI(ctxt);
5065	ctxt->instate = XML_PARSER_CONTENT;
5066	} else if ((cur == '<') && (next == '!') && (avail < 4)) {
5067	goto done;
5068	} else if ((cur == '<') && (next == '/')) {
5069	ctxt->instate = XML_PARSER_END_TAG;
5070	ctxt->checkIndex = 0;
5071	#ifdef DEBUG_PUSH
5072	xmlGenericError(xmlGenericErrorContext,
5073	"HPP: entering END_TAG\n");
5074	#endif
5075	break;
5076	} else if (cur == '<') {
5077	ctxt->instate = XML_PARSER_START_TAG;
5078	ctxt->checkIndex = 0;
5079	#ifdef DEBUG_PUSH
5080	xmlGenericError(xmlGenericErrorContext,
5081	"HPP: entering START_TAG\n");
5082	#endif
5083	break;
5084	} else if (cur == '&') {
5085	if ((!terminate) &&
5086	(htmlParseLookupSequence(ctxt, ';', 0, 0, 0) < 0))
5087	goto done;
5088	#ifdef DEBUG_PUSH
5089	xmlGenericError(xmlGenericErrorContext,
5090	"HPP: Parsing Reference\n");
5091	#endif
5092	/* TODO: check generation of subtrees if noent !!! */
5093	htmlParseReference(ctxt);
5094	} else {
5095	/*
5096	* check that the text sequence is complete
5097	* before handing out the data to the parser
5098	* to avoid problems with erroneous end of
5099	* data detection.
5100	*/
5101	if ((!terminate) &&
5102	(htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
5103	goto done;
5104	ctxt->checkIndex = 0;
5105	#ifdef DEBUG_PUSH
5106	xmlGenericError(xmlGenericErrorContext,
5107	"HPP: Parsing char data\n");
5108	#endif
5109	htmlParseCharData(ctxt);
5110	}
5111	}
5112	if (cons == ctxt->nbChars) {
5113	if (ctxt->node != NULL) {
5114	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5115	"detected an error in element content\n",
5116	NULL, NULL);
5117	}
5118	NEXT;
5119	break;
5120	}
5121
5122	break;
5123	}
5124	case XML_PARSER_END_TAG:
5125	if (avail < 2)
5126	goto done;
5127	if ((!terminate) &&
5128	(htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5129	goto done;
5130	htmlParseEndTag(ctxt);
5131	if (ctxt->nameNr == 0) {
5132	ctxt->instate = XML_PARSER_EPILOG;
5133	} else {
5134	ctxt->instate = XML_PARSER_CONTENT;
5135	}
5136	ctxt->checkIndex = 0;
5137	#ifdef DEBUG_PUSH
5138	xmlGenericError(xmlGenericErrorContext,
5139	"HPP: entering CONTENT\n");
5140	#endif
5141	break;
5142	case XML_PARSER_CDATA_SECTION:
5143	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5144	"HPP: internal error, state == CDATA\n",
5145	NULL, NULL);
5146	ctxt->instate = XML_PARSER_CONTENT;
5147	ctxt->checkIndex = 0;
5148	#ifdef DEBUG_PUSH
5149	xmlGenericError(xmlGenericErrorContext,
5150	"HPP: entering CONTENT\n");
5151	#endif
5152	break;
5153	case XML_PARSER_DTD:
5154	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5155	"HPP: internal error, state == DTD\n",
5156	NULL, NULL);
5157	ctxt->instate = XML_PARSER_CONTENT;
5158	ctxt->checkIndex = 0;
5159	#ifdef DEBUG_PUSH
5160	xmlGenericError(xmlGenericErrorContext,
5161	"HPP: entering CONTENT\n");
5162	#endif
5163	break;
5164	case XML_PARSER_COMMENT:
5165	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5166	"HPP: internal error, state == COMMENT\n",
5167	NULL, NULL);
5168	ctxt->instate = XML_PARSER_CONTENT;
5169	ctxt->checkIndex = 0;
5170	#ifdef DEBUG_PUSH
5171	xmlGenericError(xmlGenericErrorContext,
5172	"HPP: entering CONTENT\n");
5173	#endif
5174	break;
5175	case XML_PARSER_PI:
5176	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5177	"HPP: internal error, state == PI\n",
5178	NULL, NULL);
5179	ctxt->instate = XML_PARSER_CONTENT;
5180	ctxt->checkIndex = 0;
5181	#ifdef DEBUG_PUSH
5182	xmlGenericError(xmlGenericErrorContext,
5183	"HPP: entering CONTENT\n");
5184	#endif
5185	break;
5186	case XML_PARSER_ENTITY_DECL:
5187	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5188	"HPP: internal error, state == ENTITY_DECL\n",
5189	NULL, NULL);
5190	ctxt->instate = XML_PARSER_CONTENT;
5191	ctxt->checkIndex = 0;
5192	#ifdef DEBUG_PUSH
5193	xmlGenericError(xmlGenericErrorContext,
5194	"HPP: entering CONTENT\n");
5195	#endif
5196	break;
5197	case XML_PARSER_ENTITY_VALUE:
5198	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5199	"HPP: internal error, state == ENTITY_VALUE\n",
5200	NULL, NULL);
5201	ctxt->instate = XML_PARSER_CONTENT;
5202	ctxt->checkIndex = 0;
5203	#ifdef DEBUG_PUSH
5204	xmlGenericError(xmlGenericErrorContext,
5205	"HPP: entering DTD\n");
5206	#endif
5207	break;
5208	case XML_PARSER_ATTRIBUTE_VALUE:
5209	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5210	"HPP: internal error, state == ATTRIBUTE_VALUE\n",
5211	NULL, NULL);
5212	ctxt->instate = XML_PARSER_START_TAG;
5213	ctxt->checkIndex = 0;
5214	#ifdef DEBUG_PUSH
5215	xmlGenericError(xmlGenericErrorContext,
5216	"HPP: entering START_TAG\n");
5217	#endif
5218	break;
5219	case XML_PARSER_SYSTEM_LITERAL:
5220	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5221	"HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
5222	NULL, NULL);
5223	ctxt->instate = XML_PARSER_CONTENT;
5224	ctxt->checkIndex = 0;
5225	#ifdef DEBUG_PUSH
5226	xmlGenericError(xmlGenericErrorContext,
5227	"HPP: entering CONTENT\n");
5228	#endif
5229	break;
5230	case XML_PARSER_IGNORE:
5231	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5232	"HPP: internal error, state == XML_PARSER_IGNORE\n",
5233	NULL, NULL);
5234	ctxt->instate = XML_PARSER_CONTENT;
5235	ctxt->checkIndex = 0;
5236	#ifdef DEBUG_PUSH
5237	xmlGenericError(xmlGenericErrorContext,
5238	"HPP: entering CONTENT\n");
5239	#endif
5240	break;
5241	case XML_PARSER_PUBLIC_LITERAL:
5242	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5243	"HPP: internal error, state == XML_PARSER_LITERAL\n",
5244	NULL, NULL);
5245	ctxt->instate = XML_PARSER_CONTENT;
5246	ctxt->checkIndex = 0;
5247	#ifdef DEBUG_PUSH
5248	xmlGenericError(xmlGenericErrorContext,
5249	"HPP: entering CONTENT\n");
5250	#endif
5251	break;
5252
5253	}
5254	}
5255	done:
5256	if ((avail == 0) && (terminate)) {
5257	htmlAutoCloseOnEnd(ctxt);
5258	if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5259	/*
5260	* SAX: end of the document processing.
5261	*/
5262	ctxt->instate = XML_PARSER_EOF;
5263	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5264	ctxt->sax->endDocument(ctxt->userData);
5265	}
5266	}
5267	if ((ctxt->myDoc != NULL) &&
5268	((terminate) \|\| (ctxt->instate == XML_PARSER_EOF) \|\|
5269	(ctxt->instate == XML_PARSER_EPILOG))) {
5270	xmlDtdPtr dtd;
5271	dtd = xmlGetIntSubset(ctxt->myDoc);
5272	if (dtd == NULL)
5273	ctxt->myDoc->intSubset =
5274	xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
5275	BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5276	BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5277	}
5278	#ifdef DEBUG_PUSH
5279	xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
5280	#endif
5281	return(ret);
5282	}
5283
5284	/**
5285	* htmlParseChunk:
5286	* @ctxt: an HTML parser context
5287	* @chunk: an char array
5288	* @size: the size in byte of the chunk
5289	* @terminate: last chunk indicator
5290	*
5291	* Parse a Chunk of memory
5292	*
5293	* Returns zero if no error, the xmlParserErrors otherwise.
5294	*/
5295	int
5296	htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5297	int terminate) {
5298	if ((ctxt == NULL) \|\| (ctxt->input == NULL)) {
5299	htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5300	"htmlParseChunk: context error\n", NULL, NULL);
5301	return(XML_ERR_INTERNAL_ERROR);
5302	}
5303	if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5304	(ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
5305	int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5306	int cur = ctxt->input->cur - ctxt->input->base;
5307	int res;
5308
5309	res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5310	if (res < 0) {
5311	ctxt->errNo = XML_PARSER_EOF;
5312	ctxt->disableSAX = 1;
5313	return (XML_PARSER_EOF);
5314	}
5315	ctxt->input->base = ctxt->input->buf->buffer->content + base;
5316	ctxt->input->cur = ctxt->input->base + cur;
5317	ctxt->input->end =
5318	&ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
5319	#ifdef DEBUG_PUSH
5320	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5321	#endif
5322
5323	#if 0
5324	if ((terminate) \|\| (ctxt->input->buf->buffer->use > 80))
5325	htmlParseTryOrFinish(ctxt, terminate);
5326	#endif
5327	} else if (ctxt->instate != XML_PARSER_EOF) {
5328	if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
5329	xmlParserInputBufferPtr in = ctxt->input->buf;
5330	if ((in->encoder != NULL) && (in->buffer != NULL) &&
5331	(in->raw != NULL)) {
5332	int nbchars;
5333
5334	nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw);
5335	if (nbchars < 0) {
5336	htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
5337	"encoder error\n", NULL, NULL);
5338	return(XML_ERR_INVALID_ENCODING);
5339	}
5340	}
5341	}
5342	}
5343	htmlParseTryOrFinish(ctxt, terminate);
5344	if (terminate) {
5345	if ((ctxt->instate != XML_PARSER_EOF) &&
5346	(ctxt->instate != XML_PARSER_EPILOG) &&
5347	(ctxt->instate != XML_PARSER_MISC)) {
5348	ctxt->errNo = XML_ERR_DOCUMENT_END;
5349	ctxt->wellFormed = 0;
5350	}
5351	if (ctxt->instate != XML_PARSER_EOF) {
5352	if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5353	ctxt->sax->endDocument(ctxt->userData);
5354	}
5355	ctxt->instate = XML_PARSER_EOF;
5356	}
5357	return((xmlParserErrors) ctxt->errNo);
5358	}
5359
5360	/************************************************************************
5361	* *
5362	* User entry points *
5363	* *
5364	************************************************************************/
5365
5366	/**
5367	* htmlCreatePushParserCtxt:
5368	* @sax: a SAX handler
5369	* @user_data: The user data returned on SAX callbacks
5370	* @chunk: a pointer to an array of chars
5371	* @size: number of chars in the array
5372	* @filename: an optional file name or URI
5373	* @enc: an optional encoding
5374	*
5375	* Create a parser context for using the HTML parser in push mode
5376	* The value of @filename is used for fetching external entities
5377	* and error/warning reports.
5378	*
5379	* Returns the new parser context or NULL
5380	*/
5381	htmlParserCtxtPtr
5382	htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
5383	const char chunk, int size, const char filename,
5384	xmlCharEncoding enc) {
5385	htmlParserCtxtPtr ctxt;
5386	htmlParserInputPtr inputStream;
5387	xmlParserInputBufferPtr buf;
5388
5389	xmlInitParser();
5390
5391	buf = xmlAllocParserInputBuffer(enc);
5392	if (buf == NULL) return(NULL);
5393
5394	ctxt = htmlNewParserCtxt();
5395	if (ctxt == NULL) {
5396	xmlFreeParserInputBuffer(buf);
5397	return(NULL);
5398	}
5399	if(enc==XML_CHAR_ENCODING_UTF8 \|\| buf->encoder)
5400	ctxt->charset=XML_CHAR_ENCODING_UTF8;
5401	if (sax != NULL) {
5402	if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
5403	xmlFree(ctxt->sax);
5404	ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
5405	if (ctxt->sax == NULL) {
5406	xmlFree(buf);
5407	xmlFree(ctxt);
5408	return(NULL);
5409	}
5410	memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
5411	if (user_data != NULL)
5412	ctxt->userData = user_data;
5413	}
5414	if (filename == NULL) {
5415	ctxt->directory = NULL;
5416	} else {
5417	ctxt->directory = xmlParserGetDirectory(filename);
5418	}
5419
5420	inputStream = htmlNewInputStream(ctxt);
5421	if (inputStream == NULL) {
5422	xmlFreeParserCtxt(ctxt);
5423	xmlFree(buf);
5424	return(NULL);
5425	}
5426
5427	if (filename == NULL)
5428	inputStream->filename = NULL;
5429	else
5430	inputStream->filename = (char *)
5431	xmlCanonicPath((const xmlChar *) filename);
5432	inputStream->buf = buf;
5433	inputStream->base = inputStream->buf->buffer->content;
5434	inputStream->cur = inputStream->buf->buffer->content;
5435	inputStream->end =
5436	&inputStream->buf->buffer->content[inputStream->buf->buffer->use];
5437
5438	inputPush(ctxt, inputStream);
5439
5440	if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5441	(ctxt->input->buf != NULL)) {
5442	int base = ctxt->input->base - ctxt->input->buf->buffer->content;
5443	int cur = ctxt->input->cur - ctxt->input->base;
5444
5445	xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5446
5447	ctxt->input->base = ctxt->input->buf->buffer->content + base;
5448	ctxt->input->cur = ctxt->input->base + cur;
5449	ctxt->input->end =
5450	&ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
5451	#ifdef DEBUG_PUSH
5452	xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
5453	#endif
5454	}
5455	ctxt->progressive = 1;
5456
5457	return(ctxt);
5458	}
5459	#endif /* LIBXML_PUSH_ENABLED */
5460
5461	/**
5462	* htmlSAXParseDoc:
5463	* @cur: a pointer to an array of xmlChar
5464	* @encoding: a free form C string describing the HTML document encoding, or NULL
5465	* @sax: the SAX handler block
5466	* @userData: if using SAX, this pointer will be provided on callbacks.
5467	*
5468	* Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
5469	* to handle parse events. If sax is NULL, fallback to the default DOM
5470	* behavior and return a tree.
5471	*
5472	* Returns the resulting document tree unless SAX is NULL or the document is
5473	* not well formed.
5474	*/
5475
5476	htmlDocPtr
5477	htmlSAXParseDoc(xmlChar cur, const char encoding, htmlSAXHandlerPtr sax, void *userData) {
5478	htmlDocPtr ret;
5479	htmlParserCtxtPtr ctxt;
5480
5481	xmlInitParser();
5482
5483	if (cur == NULL) return(NULL);
5484
5485
5486	ctxt = htmlCreateDocParserCtxt(cur, encoding);
5487	if (ctxt == NULL) return(NULL);
5488	if (sax != NULL) {
5489	if (ctxt->sax != NULL) xmlFree (ctxt->sax);
5490	ctxt->sax = sax;
5491	ctxt->userData = userData;
5492	}
5493
5494	htmlParseDocument(ctxt);
5495	ret = ctxt->myDoc;
5496	if (sax != NULL) {
5497	ctxt->sax = NULL;
5498	ctxt->userData = NULL;
5499	}
5500	htmlFreeParserCtxt(ctxt);
5501
5502	return(ret);
5503	}
5504
5505	/**
5506	* htmlParseDoc:
5507	* @cur: a pointer to an array of xmlChar
5508	* @encoding: a free form C string describing the HTML document encoding, or NULL
5509	*
5510	* parse an HTML in-memory document and build a tree.
5511	*
5512	* Returns the resulting document tree
5513	*/
5514
5515	htmlDocPtr
5516	htmlParseDoc(xmlChar cur, const char encoding) {
5517	return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
5518	}
5519
5520
5521	/**
5522	* htmlCreateFileParserCtxt:
5523	* @filename: the filename
5524	* @encoding: a free form C string describing the HTML document encoding, or NULL
5525	*
5526	* Create a parser context for a file content.
5527	* Automatic support for ZLIB/Compress compressed document is provided
5528	* by default if found at compile-time.
5529	*
5530	* Returns the new parser context or NULL
5531	*/
5532	htmlParserCtxtPtr
5533	htmlCreateFileParserCtxt(const char filename, const char encoding)
5534	{
5535	htmlParserCtxtPtr ctxt;
5536	htmlParserInputPtr inputStream;
5537	char *canonicFilename;
5538	/* htmlCharEncoding enc; */
5539	xmlChar content, content_line = (xmlChar *) "charset=";
5540
5541	if (filename == NULL)
5542	return(NULL);
5543
5544	ctxt = htmlNewParserCtxt();
5545	if (ctxt == NULL) {
5546	return(NULL);
5547	}
5548	canonicFilename = (char ) xmlCanonicPath((const xmlChar ) filename);
5549	if (canonicFilename == NULL) {
5550	#ifdef LIBXML_SAX1_ENABLED
5551	if (xmlDefaultSAXHandler.error != NULL) {
5552	xmlDefaultSAXHandler.error(NULL, "out of memory\n");
5553	}
5554	#endif
5555	xmlFreeParserCtxt(ctxt);
5556	return(NULL);
5557	}
5558
5559	inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
5560	xmlFree(canonicFilename);
5561	if (inputStream == NULL) {
5562	xmlFreeParserCtxt(ctxt);
5563	return(NULL);
5564	}
5565
5566	inputPush(ctxt, inputStream);
5567
5568	/* set encoding */
5569	if (encoding) {
5570	content = xmlMallocAtomic (xmlStrlen(content_line) + strlen(encoding) + 1);
5571	if (content) {
5572	strcpy ((char )content, (char )content_line);
5573	strcat ((char )content, (char )encoding);
5574	htmlCheckEncoding (ctxt, content);
5575	xmlFree (content);
5576	}
5577	}
5578
5579	return(ctxt);
5580	}
5581
5582	/**
5583	* htmlSAXParseFile:
5584	* @filename: the filename
5585	* @encoding: a free form C string describing the HTML document encoding, or NULL
5586	* @sax: the SAX handler block
5587	* @userData: if using SAX, this pointer will be provided on callbacks.
5588	*
5589	* parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5590	* compressed document is provided by default if found at compile-time.
5591	* It use the given SAX function block to handle the parsing callback.
5592	* If sax is NULL, fallback to the default DOM tree building routines.
5593	*
5594	* Returns the resulting document tree unless SAX is NULL or the document is
5595	* not well formed.
5596	*/
5597
5598	htmlDocPtr
5599	htmlSAXParseFile(const char filename, const char encoding, htmlSAXHandlerPtr sax,
5600	void *userData) {
5601	htmlDocPtr ret;
5602	htmlParserCtxtPtr ctxt;
5603	htmlSAXHandlerPtr oldsax = NULL;
5604
5605	xmlInitParser();
5606
5607	ctxt = htmlCreateFileParserCtxt(filename, encoding);
5608	if (ctxt == NULL) return(NULL);
5609	if (sax != NULL) {
5610	oldsax = ctxt->sax;
5611	ctxt->sax = sax;
5612	ctxt->userData = userData;
5613	}
5614
5615	htmlParseDocument(ctxt);
5616
5617	ret = ctxt->myDoc;
5618	if (sax != NULL) {
5619	ctxt->sax = oldsax;
5620	ctxt->userData = NULL;
5621	}
5622	htmlFreeParserCtxt(ctxt);
5623
5624	return(ret);
5625	}
5626
5627	/**
5628	* htmlParseFile:
5629	* @filename: the filename
5630	* @encoding: a free form C string describing the HTML document encoding, or NULL
5631	*
5632	* parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5633	* compressed document is provided by default if found at compile-time.
5634	*
5635	* Returns the resulting document tree
5636	*/
5637
5638	htmlDocPtr
5639	htmlParseFile(const char filename, const char encoding) {
5640	return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5641	}
5642
5643	/**
5644	* htmlHandleOmittedElem:
5645	* @val: int 0 or 1
5646	*
5647	* Set and return the previous value for handling HTML omitted tags.
5648	*
5649	* Returns the last value for 0 for no handling, 1 for auto insertion.
5650	*/
5651
5652	int
5653	htmlHandleOmittedElem(int val) {
5654	int old = htmlOmittedDefaultValue;
5655
5656	htmlOmittedDefaultValue = val;
5657	return(old);
5658	}
5659
5660	/**
5661	* htmlElementAllowedHere:
5662	* @parent: HTML parent element
5663	* @elt: HTML element
5664	*
5665	* Checks whether an HTML element may be a direct child of a parent element.
5666	* Note - doesn't check for deprecated elements
5667	*
5668	* Returns 1 if allowed; 0 otherwise.
5669	*/
5670	int
5671	htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
5672	const char** p ;
5673
5674	if ( ! elt \|\| ! parent \|\| ! parent->subelts )
5675	return 0 ;
5676
5677	for ( p = parent->subelts; *p; ++p )
5678	if ( !xmlStrcmp((const xmlChar )p, elt) )
5679	return 1 ;
5680
5681	return 0 ;
5682	}
5683	/**
5684	* htmlElementStatusHere:
5685	* @parent: HTML parent element
5686	* @elt: HTML element
5687	*
5688	* Checks whether an HTML element may be a direct child of a parent element.
5689	* and if so whether it is valid or deprecated.
5690	*
5691	* Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5692	*/
5693	htmlStatus
5694	htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
5695	if ( ! parent \|\| ! elt )
5696	return HTML_INVALID ;
5697	if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
5698	return HTML_INVALID ;
5699
5700	return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
5701	}
5702	/**
5703	* htmlAttrAllowed:
5704	* @elt: HTML element
5705	* @attr: HTML attribute
5706	* @legacy: whether to allow deprecated attributes
5707	*
5708	* Checks whether an attribute is valid for an element
5709	* Has full knowledge of Required and Deprecated attributes
5710	*
5711	* Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
5712	*/
5713	htmlStatus
5714	htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
5715	const char** p ;
5716
5717	if ( !elt \|\| ! attr )
5718	return HTML_INVALID ;
5719
5720	if ( elt->attrs_req )
5721	for ( p = elt->attrs_req; *p; ++p)
5722	if ( !xmlStrcmp((const xmlChar)p, attr) )
5723	return HTML_REQUIRED ;
5724
5725	if ( elt->attrs_opt )
5726	for ( p = elt->attrs_opt; *p; ++p)
5727	if ( !xmlStrcmp((const xmlChar)p, attr) )
5728	return HTML_VALID ;
5729
5730	if ( legacy && elt->attrs_depr )
5731	for ( p = elt->attrs_depr; *p; ++p)
5732	if ( !xmlStrcmp((const xmlChar)p, attr) )
5733	return HTML_DEPRECATED ;
5734
5735	return HTML_INVALID ;
5736	}
5737	/**
5738	* htmlNodeStatus:
5739	* @node: an htmlNodePtr in a tree
5740	* @legacy: whether to allow deprecated elements (YES is faster here
5741	* for Element nodes)
5742	*
5743	* Checks whether the tree node is valid. Experimental (the author
5744	* only uses the HTML enhancements in a SAX parser)
5745	*
5746	* Return: for Element nodes, a return from htmlElementAllowedHere (if
5747	* legacy allowed) or htmlElementStatusHere (otherwise).
5748	* for Attribute nodes, a return from htmlAttrAllowed
5749	* for other nodes, HTML_NA (no checks performed)
5750	*/
5751	htmlStatus
5752	htmlNodeStatus(const htmlNodePtr node, int legacy) {
5753	if ( ! node )
5754	return HTML_INVALID ;
5755
5756	switch ( node->type ) {
5757	case XML_ELEMENT_NODE:
5758	return legacy
5759	? ( htmlElementAllowedHere (
5760	htmlTagLookup(node->parent->name) , node->name
5761	) ? HTML_VALID : HTML_INVALID )
5762	: htmlElementStatusHere(
5763	htmlTagLookup(node->parent->name) ,
5764	htmlTagLookup(node->name) )
5765	;
5766	case XML_ATTRIBUTE_NODE:
5767	return htmlAttrAllowed(
5768	htmlTagLookup(node->parent->name) , node->name, legacy) ;
5769	default: return HTML_NA ;
5770	}
5771	}
5772	/************************************************************************
5773	* *
5774	* New set (2.6.0) of simpler and more flexible APIs *
5775	* *
5776	************************************************************************/
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope.  The macro reads a variable named dict from the scope
 * where it is expanded; a NULL dict means the string is always freed.
 * It expands to a single statement and must be followed by a semicolon.
 */
#define DICT_FREE(str)						\
    do {							\
	if ((str) && ((!dict) ||				\
	    (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
	    xmlFree((char *)(str));				\
    } while (0)
5788
5789	/**
5790	* htmlCtxtReset:
5791	* @ctxt: an HTML parser context
5792	*
5793	* Reset a parser context
5794	*/
5795	void
5796	htmlCtxtReset(htmlParserCtxtPtr ctxt)
5797	{
5798	xmlParserInputPtr input;
5799	xmlDictPtr dict;
5800
5801	if (ctxt == NULL)
5802	return;
5803
5804	xmlInitParser();
5805	dict = ctxt->dict;
5806
5807	while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
5808	xmlFreeInputStream(input);
5809	}
5810	ctxt->inputNr = 0;
5811	ctxt->input = NULL;
5812
5813	ctxt->spaceNr = 0;
5814	if (ctxt->spaceTab != NULL) {
5815	ctxt->spaceTab[0] = -1;
5816	ctxt->space = &ctxt->spaceTab[0];
5817	} else {
5818	ctxt->space = NULL;
5819	}
5820
5821
5822	ctxt->nodeNr = 0;
5823	ctxt->node = NULL;
5824
5825	ctxt->nameNr = 0;
5826	ctxt->name = NULL;
5827
5828	DICT_FREE(ctxt->version);
5829	ctxt->version = NULL;
5830	DICT_FREE(ctxt->encoding);
5831	ctxt->encoding = NULL;
5832	DICT_FREE(ctxt->directory);
5833	ctxt->directory = NULL;
5834	DICT_FREE(ctxt->extSubURI);
5835	ctxt->extSubURI = NULL;
5836	DICT_FREE(ctxt->extSubSystem);
5837	ctxt->extSubSystem = NULL;
5838	if (ctxt->myDoc != NULL)
5839	xmlFreeDoc(ctxt->myDoc);
5840	ctxt->myDoc = NULL;
5841
5842	ctxt->standalone = -1;
5843	ctxt->hasExternalSubset = 0;
5844	ctxt->hasPErefs = 0;
5845	ctxt->html = 1;
5846	ctxt->external = 0;
5847	ctxt->instate = XML_PARSER_START;
5848	ctxt->token = 0;
5849
5850	ctxt->wellFormed = 1;
5851	ctxt->nsWellFormed = 1;
5852	ctxt->valid = 1;
5853	ctxt->vctxt.userData = ctxt;
5854	ctxt->vctxt.error = xmlParserValidityError;
5855	ctxt->vctxt.warning = xmlParserValidityWarning;
5856	ctxt->record_info = 0;
5857	ctxt->nbChars = 0;
5858	ctxt->checkIndex = 0;
5859	ctxt->inSubset = 0;
5860	ctxt->errNo = XML_ERR_OK;
5861	ctxt->depth = 0;
5862	ctxt->charset = XML_CHAR_ENCODING_NONE;
5863	ctxt->catalogs = NULL;
5864	xmlInitNodeInfoSeq(&ctxt->node_seq);
5865
5866	if (ctxt->attsDefault != NULL) {
5867	xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
5868	ctxt->attsDefault = NULL;
5869	}
5870	if (ctxt->attsSpecial != NULL) {
5871	xmlHashFree(ctxt->attsSpecial, NULL);
5872	ctxt->attsSpecial = NULL;
5873	}
5874	}
5875
5876	/**
5877	* htmlCtxtUseOptions:
5878	* @ctxt: an HTML parser context
5879	* @options: a combination of htmlParserOption(s)
5880	*
5881	* Applies the options to the parser context
5882	*
5883	* Returns 0 in case of success, the set of unknown or unimplemented options
5884	* in case of error.
5885	*/
5886	int
5887	htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
5888	{
5889	if (ctxt == NULL)
5890	return(-1);
5891
5892	if (options & HTML_PARSE_NOWARNING) {
5893	ctxt->sax->warning = NULL;
5894	ctxt->vctxt.warning = NULL;
5895	options -= XML_PARSE_NOWARNING;
5896	ctxt->options \|= XML_PARSE_NOWARNING;
5897	}
5898	if (options & HTML_PARSE_NOERROR) {
5899	ctxt->sax->error = NULL;
5900	ctxt->vctxt.error = NULL;
5901	ctxt->sax->fatalError = NULL;
5902	options -= XML_PARSE_NOERROR;
5903	ctxt->options \|= XML_PARSE_NOERROR;
5904	}
5905	if (options & HTML_PARSE_PEDANTIC) {
5906	ctxt->pedantic = 1;
5907	options -= XML_PARSE_PEDANTIC;
5908	ctxt->options \|= XML_PARSE_PEDANTIC;
5909	} else
5910	ctxt->pedantic = 0;
5911	if (options & XML_PARSE_NOBLANKS) {
5912	ctxt->keepBlanks = 0;
5913	ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
5914	options -= XML_PARSE_NOBLANKS;
5915	ctxt->options \|= XML_PARSE_NOBLANKS;
5916	} else
5917	ctxt->keepBlanks = 1;
5918	if (options & HTML_PARSE_RECOVER) {
5919	ctxt->recovery = 1;
5920	options -= HTML_PARSE_RECOVER;
5921	} else
5922	ctxt->recovery = 0;
5923	if (options & HTML_PARSE_COMPACT) {
5924	ctxt->options \|= HTML_PARSE_COMPACT;
5925	options -= HTML_PARSE_COMPACT;
5926	}
5927	ctxt->dictNames = 0;
5928	return (options);
5929	}
5930
5931	/**
5932	* htmlDoRead:
5933	* @ctxt: an HTML parser context
5934	* @URL: the base URL to use for the document
5935	* @encoding: the document encoding, or NULL
5936	* @options: a combination of htmlParserOption(s)
5937	* @reuse: keep the context for reuse
5938	*
5939	* Common front-end for the htmlRead functions
5940	*
5941	* Returns the resulting document tree or NULL
5942	*/
5943	static htmlDocPtr
5944	htmlDoRead(htmlParserCtxtPtr ctxt, const char URL, const char encoding,
5945	int options, int reuse)
5946	{
5947	htmlDocPtr ret;
5948
5949	htmlCtxtUseOptions(ctxt, options);
5950	ctxt->html = 1;
5951	if (encoding != NULL) {
5952	xmlCharEncodingHandlerPtr hdlr;
5953
5954	hdlr = xmlFindCharEncodingHandler(encoding);
5955	if (hdlr != NULL)
5956	xmlSwitchToEncoding(ctxt, hdlr);
5957	}
5958	if ((URL != NULL) && (ctxt->input != NULL) &&
5959	(ctxt->input->filename == NULL))
5960	ctxt->input->filename = (char ) xmlStrdup((const xmlChar ) URL);
5961	htmlParseDocument(ctxt);
5962	ret = ctxt->myDoc;
5963	ctxt->myDoc = NULL;
5964	if (!reuse) {
5965	if ((ctxt->dictNames) &&
5966	(ret != NULL) &&
5967	(ret->dict == ctxt->dict))
5968	ctxt->dict = NULL;
5969	xmlFreeParserCtxt(ctxt);
5970	}
5971	return (ret);
5972	}
5973
5974	/**
5975	* htmlReadDoc:
5976	* @cur: a pointer to a zero terminated string
5977	* @URL: the base URL to use for the document
5978	* @encoding: the document encoding, or NULL
5979	* @options: a combination of htmlParserOption(s)
5980	*
5981	* parse an XML in-memory document and build a tree.
5982	*
5983	* Returns the resulting document tree
5984	*/
5985	htmlDocPtr
5986	htmlReadDoc(const xmlChar * cur, const char URL, const char encoding, int options)
5987	{
5988	htmlParserCtxtPtr ctxt;
5989
5990	if (cur == NULL)
5991	return (NULL);
5992
5993	xmlInitParser();
5994	ctxt = htmlCreateDocParserCtxt(cur, NULL);
5995	if (ctxt == NULL)
5996	return (NULL);
5997	return (htmlDoRead(ctxt, URL, encoding, options, 0));
5998	}
5999
6000	/**
6001	* htmlReadFile:
6002	* @filename: a file or URL
6003	* @encoding: the document encoding, or NULL
6004	* @options: a combination of htmlParserOption(s)
6005	*
6006	* parse an XML file from the filesystem or the network.
6007	*
6008	* Returns the resulting document tree
6009	*/
6010	htmlDocPtr
6011	htmlReadFile(const char filename, const char encoding, int options)
6012	{
6013	htmlParserCtxtPtr ctxt;
6014
6015	xmlInitParser();
6016	ctxt = htmlCreateFileParserCtxt(filename, encoding);
6017	if (ctxt == NULL)
6018	return (NULL);
6019	return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6020	}
6021
6022	/**
6023	* htmlReadMemory:
6024	* @buffer: a pointer to a char array
6025	* @size: the size of the array
6026	* @URL: the base URL to use for the document
6027	* @encoding: the document encoding, or NULL
6028	* @options: a combination of htmlParserOption(s)
6029	*
6030	* parse an XML in-memory document and build a tree.
6031	*
6032	* Returns the resulting document tree
6033	*/
6034	htmlDocPtr
6035	htmlReadMemory(const char buffer, int size, const char URL, const char *encoding, int options)
6036	{
6037	htmlParserCtxtPtr ctxt;
6038
6039	xmlInitParser();
6040	ctxt = xmlCreateMemoryParserCtxt(buffer, size);
6041	if (ctxt == NULL)
6042	return (NULL);
6043	htmlDefaultSAXHandlerInit();
6044	if (ctxt->sax != NULL)
6045	memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
6046	return (htmlDoRead(ctxt, URL, encoding, options, 0));
6047	}
6048
6049	/**
6050	* htmlReadFd:
6051	* @fd: an open file descriptor
6052	* @URL: the base URL to use for the document
6053	* @encoding: the document encoding, or NULL
6054	* @options: a combination of htmlParserOption(s)
6055	*
6056	* parse an XML from a file descriptor and build a tree.
6057	*
6058	* Returns the resulting document tree
6059	*/
6060	htmlDocPtr
6061	htmlReadFd(int fd, const char URL, const char encoding, int options)
6062	{
6063	htmlParserCtxtPtr ctxt;
6064	xmlParserInputBufferPtr input;
6065	xmlParserInputPtr stream;
6066
6067	if (fd < 0)
6068	return (NULL);
6069
6070	xmlInitParser();
6071	input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6072	if (input == NULL)
6073	return (NULL);
6074	ctxt = xmlNewParserCtxt();
6075	if (ctxt == NULL) {
6076	xmlFreeParserInputBuffer(input);
6077	return (NULL);
6078	}
6079	stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6080	if (stream == NULL) {
6081	xmlFreeParserInputBuffer(input);
6082	xmlFreeParserCtxt(ctxt);
6083	return (NULL);
6084	}
6085	inputPush(ctxt, stream);
6086	return (htmlDoRead(ctxt, URL, encoding, options, 0));
6087	}
6088
6089	/**
6090	* htmlReadIO:
6091	* @ioread: an I/O read function
6092	* @ioclose: an I/O close function
6093	* @ioctx: an I/O handler
6094	* @URL: the base URL to use for the document
6095	* @encoding: the document encoding, or NULL
6096	* @options: a combination of htmlParserOption(s)
6097	*
6098	* parse an HTML document from I/O functions and source and build a tree.
6099	*
6100	* Returns the resulting document tree
6101	*/
6102	htmlDocPtr
6103	htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6104	void ioctx, const char URL, const char *encoding, int options)
6105	{
6106	htmlParserCtxtPtr ctxt;
6107	xmlParserInputBufferPtr input;
6108	xmlParserInputPtr stream;
6109
6110	if (ioread == NULL)
6111	return (NULL);
6112	xmlInitParser();
6113
6114	input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6115	XML_CHAR_ENCODING_NONE);
6116	if (input == NULL)
6117	return (NULL);
6118	ctxt = htmlNewParserCtxt();
6119	if (ctxt == NULL) {
6120	xmlFreeParserInputBuffer(input);
6121	return (NULL);
6122	}
6123	stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6124	if (stream == NULL) {
6125	xmlFreeParserInputBuffer(input);
6126	xmlFreeParserCtxt(ctxt);
6127	return (NULL);
6128	}
6129	inputPush(ctxt, stream);
6130	return (htmlDoRead(ctxt, URL, encoding, options, 0));
6131	}
6132
6133	/**
6134	* htmlCtxtReadDoc:
6135	* @ctxt: an HTML parser context
6136	* @cur: a pointer to a zero terminated string
6137	* @URL: the base URL to use for the document
6138	* @encoding: the document encoding, or NULL
6139	* @options: a combination of htmlParserOption(s)
6140	*
6141	* parse an XML in-memory document and build a tree.
6142	* This reuses the existing @ctxt parser context
6143	*
6144	* Returns the resulting document tree
6145	*/
6146	htmlDocPtr
6147	htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
6148	const char URL, const char encoding, int options)
6149	{
6150	xmlParserInputPtr stream;
6151
6152	if (cur == NULL)
6153	return (NULL);
6154	if (ctxt == NULL)
6155	return (NULL);
6156
6157	htmlCtxtReset(ctxt);
6158
6159	stream = xmlNewStringInputStream(ctxt, cur);
6160	if (stream == NULL) {
6161	return (NULL);
6162	}
6163	inputPush(ctxt, stream);
6164	return (htmlDoRead(ctxt, URL, encoding, options, 1));
6165	}
6166
6167	/**
6168	* htmlCtxtReadFile:
6169	* @ctxt: an HTML parser context
6170	* @filename: a file or URL
6171	* @encoding: the document encoding, or NULL
6172	* @options: a combination of htmlParserOption(s)
6173	*
6174	* parse an XML file from the filesystem or the network.
6175	* This reuses the existing @ctxt parser context
6176	*
6177	* Returns the resulting document tree
6178	*/
6179	htmlDocPtr
6180	htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6181	const char *encoding, int options)
6182	{
6183	xmlParserInputPtr stream;
6184
6185	if (filename == NULL)
6186	return (NULL);
6187	if (ctxt == NULL)
6188	return (NULL);
6189
6190	htmlCtxtReset(ctxt);
6191
6192	stream = xmlLoadExternalEntity(filename, NULL, ctxt);
6193	if (stream == NULL) {
6194	return (NULL);
6195	}
6196	inputPush(ctxt, stream);
6197	return (htmlDoRead(ctxt, NULL, encoding, options, 1));
6198	}
6199
6200	/**
6201	* htmlCtxtReadMemory:
6202	* @ctxt: an HTML parser context
6203	* @buffer: a pointer to a char array
6204	* @size: the size of the array
6205	* @URL: the base URL to use for the document
6206	* @encoding: the document encoding, or NULL
6207	* @options: a combination of htmlParserOption(s)
6208	*
6209	* parse an XML in-memory document and build a tree.
6210	* This reuses the existing @ctxt parser context
6211	*
6212	* Returns the resulting document tree
6213	*/
6214	htmlDocPtr
6215	htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
6216	const char URL, const char encoding, int options)
6217	{
6218	xmlParserInputBufferPtr input;
6219	xmlParserInputPtr stream;
6220
6221	if (ctxt == NULL)
6222	return (NULL);
6223	if (buffer == NULL)
6224	return (NULL);
6225
6226	htmlCtxtReset(ctxt);
6227
6228	input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
6229	if (input == NULL) {
6230	return(NULL);
6231	}
6232
6233	stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6234	if (stream == NULL) {
6235	xmlFreeParserInputBuffer(input);
6236	return(NULL);
6237	}
6238
6239	inputPush(ctxt, stream);
6240	return (htmlDoRead(ctxt, URL, encoding, options, 1));
6241	}
6242
6243	/**
6244	* htmlCtxtReadFd:
6245	* @ctxt: an HTML parser context
6246	* @fd: an open file descriptor
6247	* @URL: the base URL to use for the document
6248	* @encoding: the document encoding, or NULL
6249	* @options: a combination of htmlParserOption(s)
6250	*
6251	* parse an XML from a file descriptor and build a tree.
6252	* This reuses the existing @ctxt parser context
6253	*
6254	* Returns the resulting document tree
6255	*/
6256	htmlDocPtr
6257	htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
6258	const char URL, const char encoding, int options)
6259	{
6260	xmlParserInputBufferPtr input;
6261	xmlParserInputPtr stream;
6262
6263	if (fd < 0)
6264	return (NULL);
6265	if (ctxt == NULL)
6266	return (NULL);
6267
6268	htmlCtxtReset(ctxt);
6269
6270
6271	input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6272	if (input == NULL)
6273	return (NULL);
6274	stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6275	if (stream == NULL) {
6276	xmlFreeParserInputBuffer(input);
6277	return (NULL);
6278	}
6279	inputPush(ctxt, stream);
6280	return (htmlDoRead(ctxt, URL, encoding, options, 1));
6281	}
6282
6283	/**
6284	* htmlCtxtReadIO:
6285	* @ctxt: an HTML parser context
6286	* @ioread: an I/O read function
6287	* @ioclose: an I/O close function
6288	* @ioctx: an I/O handler
6289	* @URL: the base URL to use for the document
6290	* @encoding: the document encoding, or NULL
6291	* @options: a combination of htmlParserOption(s)
6292	*
6293	* parse an HTML document from I/O functions and source and build a tree.
6294	* This reuses the existing @ctxt parser context
6295	*
6296	* Returns the resulting document tree
6297	*/
6298	htmlDocPtr
6299	htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
6300	xmlInputCloseCallback ioclose, void *ioctx,
6301	const char *URL,
6302	const char *encoding, int options)
6303	{
6304	xmlParserInputBufferPtr input;
6305	xmlParserInputPtr stream;
6306
6307	if (ioread == NULL)
6308	return (NULL);
6309	if (ctxt == NULL)
6310	return (NULL);
6311
6312	htmlCtxtReset(ctxt);
6313
6314	input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6315	XML_CHAR_ENCODING_NONE);
6316	if (input == NULL)
6317	return (NULL);
6318	stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6319	if (stream == NULL) {
6320	xmlFreeParserInputBuffer(input);
6321	return (NULL);
6322	}
6323	inputPush(ctxt, stream);
6324	return (htmlDoRead(ctxt, URL, encoding, options, 1));
6325	}
6326
6327	#define bottom_HTMLparser
6328	#include "elfgcchack.h"
6329	#endif /* LIBXML_HTML_ENABLED */

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/libs/libxml2-2.6.30/HTMLparser.c@ 8234

Download in other formats: