HTMLtree.c@ 102797

Last change on this file since 102797 was 95312, checked in by vboxsync, 3 years ago
libs/{curl,libxml2}: OSE export fixes, bugref:8515
Property svn:eol-style set to `native`
File size: 33.1 KB

Line
1	/*
2	* HTMLtree.c : implementation of access function for an HTML tree.
3	*
4	* See Copyright for the status of this software.
5	*
6	* daniel@veillard.com
7	*/
8
9
10	#define IN_LIBXML
11	#include "libxml.h"
12	#ifdef LIBXML_HTML_ENABLED
13
14	#include <string.h> /* for memset() only ! */
15
16	#ifdef HAVE_CTYPE_H
17	#include <ctype.h>
18	#endif
19	#ifdef HAVE_STDLIB_H
20	#include <stdlib.h>
21	#endif
22
23	#include <libxml/xmlmemory.h>
24	#include <libxml/HTMLparser.h>
25	#include <libxml/HTMLtree.h>
26	#include <libxml/entities.h>
27	#include <libxml/valid.h>
28	#include <libxml/xmlerror.h>
29	#include <libxml/parserInternals.h>
30	#include <libxml/globals.h>
31	#include <libxml/uri.h>
32
33	#include "buf.h"
34
35	/************************************************************************
36	* *
37	* Getting/Setting encoding meta tags *
38	* *
39	************************************************************************/
40
41	/**
42	* htmlGetMetaEncoding:
43	* @doc: the document
44	*
45	* Encoding definition lookup in the Meta tags
46	*
47	* Returns the current encoding as flagged in the HTML source
48	*/
49	const xmlChar *
50	htmlGetMetaEncoding(htmlDocPtr doc) {
51	htmlNodePtr cur;
52	const xmlChar *content;
53	const xmlChar *encoding;
54
55	if (doc == NULL)
56	return(NULL);
57	cur = doc->children;
58
59	/*
60	* Search the html
61	*/
62	while (cur != NULL) {
63	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
64	if (xmlStrEqual(cur->name, BAD_CAST"html"))
65	break;
66	if (xmlStrEqual(cur->name, BAD_CAST"head"))
67	goto found_head;
68	if (xmlStrEqual(cur->name, BAD_CAST"meta"))
69	goto found_meta;
70	}
71	cur = cur->next;
72	}
73	if (cur == NULL)
74	return(NULL);
75	cur = cur->children;
76
77	/*
78	* Search the head
79	*/
80	while (cur != NULL) {
81	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
82	if (xmlStrEqual(cur->name, BAD_CAST"head"))
83	break;
84	if (xmlStrEqual(cur->name, BAD_CAST"meta"))
85	goto found_meta;
86	}
87	cur = cur->next;
88	}
89	if (cur == NULL)
90	return(NULL);
91	found_head:
92	cur = cur->children;
93
94	/*
95	* Search the meta elements
96	*/
97	found_meta:
98	while (cur != NULL) {
99	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
100	if (xmlStrEqual(cur->name, BAD_CAST"meta")) {
101	xmlAttrPtr attr = cur->properties;
102	int http;
103	const xmlChar *value;
104
105	content = NULL;
106	http = 0;
107	while (attr != NULL) {
108	if ((attr->children != NULL) &&
109	(attr->children->type == XML_TEXT_NODE) &&
110	(attr->children->next == NULL)) {
111	value = attr->children->content;
112	if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
113	&& (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
114	http = 1;
115	else if ((value != NULL)
116	&& (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
117	content = value;
118	if ((http != 0) && (content != NULL))
119	goto found_content;
120	}
121	attr = attr->next;
122	}
123	}
124	}
125	cur = cur->next;
126	}
127	return(NULL);
128
129	found_content:
130	encoding = xmlStrstr(content, BAD_CAST"charset=");
131	if (encoding == NULL)
132	encoding = xmlStrstr(content, BAD_CAST"Charset=");
133	if (encoding == NULL)
134	encoding = xmlStrstr(content, BAD_CAST"CHARSET=");
135	if (encoding != NULL) {
136	encoding += 8;
137	} else {
138	encoding = xmlStrstr(content, BAD_CAST"charset =");
139	if (encoding == NULL)
140	encoding = xmlStrstr(content, BAD_CAST"Charset =");
141	if (encoding == NULL)
142	encoding = xmlStrstr(content, BAD_CAST"CHARSET =");
143	if (encoding != NULL)
144	encoding += 9;
145	}
146	if (encoding != NULL) {
147	while ((encoding == ' ') \|\| (encoding == '\t')) encoding++;
148	}
149	return(encoding);
150	}
151
152	/**
153	* htmlSetMetaEncoding:
154	* @doc: the document
155	* @encoding: the encoding string
156	*
157	* Sets the current encoding in the Meta tags
158	* NOTE: this will not change the document content encoding, just
159	* the META flag associated.
160	*
161	* Returns 0 in case of success and -1 in case of error
162	*/
163	int
164	htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
165	htmlNodePtr cur, meta = NULL, head = NULL;
166	const xmlChar *content = NULL;
167	char newcontent[100];
168
169	newcontent[0] = 0;
170
171	if (doc == NULL)
172	return(-1);
173
174	/* html isn't a real encoding it's just libxml2 way to get entities */
175	if (!xmlStrcasecmp(encoding, BAD_CAST "html"))
176	return(-1);
177
178	if (encoding != NULL) {
179	snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s",
180	(char *)encoding);
181	newcontent[sizeof(newcontent) - 1] = 0;
182	}
183
184	cur = doc->children;
185
186	/*
187	* Search the html
188	*/
189	while (cur != NULL) {
190	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
191	if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0)
192	break;
193	if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
194	goto found_head;
195	if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
196	goto found_meta;
197	}
198	cur = cur->next;
199	}
200	if (cur == NULL)
201	return(-1);
202	cur = cur->children;
203
204	/*
205	* Search the head
206	*/
207	while (cur != NULL) {
208	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
209	if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
210	break;
211	if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
212	head = cur->parent;
213	goto found_meta;
214	}
215	}
216	cur = cur->next;
217	}
218	if (cur == NULL)
219	return(-1);
220	found_head:
221	head = cur;
222	if (cur->children == NULL)
223	goto create;
224	cur = cur->children;
225
226	found_meta:
227	/*
228	* Search and update all the remaining the meta elements carrying
229	* encoding information
230	*/
231	while (cur != NULL) {
232	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
233	if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
234	xmlAttrPtr attr = cur->properties;
235	int http;
236	const xmlChar *value;
237
238	content = NULL;
239	http = 0;
240	while (attr != NULL) {
241	if ((attr->children != NULL) &&
242	(attr->children->type == XML_TEXT_NODE) &&
243	(attr->children->next == NULL)) {
244	value = attr->children->content;
245	if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
246	&& (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
247	http = 1;
248	else
249	{
250	if ((value != NULL) &&
251	(!xmlStrcasecmp(attr->name, BAD_CAST"content")))
252	content = value;
253	}
254	if ((http != 0) && (content != NULL))
255	break;
256	}
257	attr = attr->next;
258	}
259	if ((http != 0) && (content != NULL)) {
260	meta = cur;
261	break;
262	}
263
264	}
265	}
266	cur = cur->next;
267	}
268	create:
269	if (meta == NULL) {
270	if ((encoding != NULL) && (head != NULL)) {
271	/*
272	* Create a new Meta element with the right attributes
273	*/
274
275	meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
276	if (head->children == NULL)
277	xmlAddChild(head, meta);
278	else
279	xmlAddPrevSibling(head->children, meta);
280	xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
281	xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
282	}
283	} else {
284	/* remove the meta tag if NULL is passed */
285	if (encoding == NULL) {
286	xmlUnlinkNode(meta);
287	xmlFreeNode(meta);
288	}
289	/* change the document only if there is a real encoding change */
290	else if (xmlStrcasestr(content, encoding) == NULL) {
291	xmlSetProp(meta, BAD_CAST"content", BAD_CAST newcontent);
292	}
293	}
294
295
296	return(0);
297	}
298
/**
 * htmlBooleanAttrs:
 *
 * NULL-terminated list of the HTML attributes which will be output
 * in minimized form, i.e. <option selected="selected"> will be
 * output as <option selected>, as per XSLT 1.0 16.2 "HTML Output Method"
 *
 */
static const char* htmlBooleanAttrs[] = {
    "checked",
    "compact",
    "declare",
    "defer",
    "disabled",
    "ismap",
    "multiple",
    "nohref",
    "noresize",
    "noshade",
    "nowrap",
    "readonly",
    "selected",
    NULL
};
312
313
314	/**
315	* htmlIsBooleanAttr:
316	* @name: the name of the attribute to check
317	*
318	* Determine if a given attribute is a boolean attribute.
319	*
320	* returns: false if the attribute is not boolean, true otherwise.
321	*/
322	int
323	htmlIsBooleanAttr(const xmlChar *name)
324	{
325	int i = 0;
326
327	while (htmlBooleanAttrs[i] != NULL) {
328	if (xmlStrcasecmp((const xmlChar *)htmlBooleanAttrs[i], name) == 0)
329	return 1;
330	i++;
331	}
332	return 0;
333	}
334
335	#ifdef LIBXML_OUTPUT_ENABLED
336	/*
337	* private routine exported from xmlIO.c
338	*/
339	xmlOutputBufferPtr
340	xmlAllocOutputBufferInternal(xmlCharEncodingHandlerPtr encoder);
341	/************************************************************************
342	* *
343	* Output error handlers *
344	* *
345	************************************************************************/
346	/**
347	* htmlSaveErrMemory:
348	* @extra: extra information
349	*
350	* Handle an out of memory condition
351	*/
352	static void
353	htmlSaveErrMemory(const char *extra)
354	{
355	__xmlSimpleError(XML_FROM_OUTPUT, XML_ERR_NO_MEMORY, NULL, NULL, extra);
356	}
357
358	/**
359	* htmlSaveErr:
360	* @code: the error number
361	* @node: the location of the error.
362	* @extra: extra information
363	*
364	* Handle an out of memory condition
365	*/
366	static void
367	htmlSaveErr(int code, xmlNodePtr node, const char *extra)
368	{
369	const char *msg = NULL;
370
371	switch(code) {
372	case XML_SAVE_NOT_UTF8:
373	msg = "string is not in UTF-8\n";
374	break;
375	case XML_SAVE_CHAR_INVALID:
376	msg = "invalid character value\n";
377	break;
378	case XML_SAVE_UNKNOWN_ENCODING:
379	msg = "unknown encoding %s\n";
380	break;
381	case XML_SAVE_NO_DOCTYPE:
382	msg = "HTML has no DOCTYPE\n";
383	break;
384	default:
385	msg = "unexpected error number\n";
386	}
387	__xmlSimpleError(XML_FROM_OUTPUT, code, node, msg, extra);
388	}
389
390	/************************************************************************
391	* *
392	* Dumping HTML tree content to a simple buffer *
393	* *
394	************************************************************************/
395
396	/**
397	* htmlBufNodeDumpFormat:
398	* @buf: the xmlBufPtr output
399	* @doc: the document
400	* @cur: the current node
401	* @format: should formatting spaces been added
402	*
403	* Dump an HTML node, recursive behaviour,children are printed too.
404	*
405	* Returns the number of byte written or -1 in case of error
406	*/
407	static size_t
408	htmlBufNodeDumpFormat(xmlBufPtr buf, xmlDocPtr doc, xmlNodePtr cur,
409	int format) {
410	size_t use;
411	int ret;
412	xmlOutputBufferPtr outbuf;
413
414	if (cur == NULL) {
415	return (-1);
416	}
417	if (buf == NULL) {
418	return (-1);
419	}
420	outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer));
421	if (outbuf == NULL) {
422	htmlSaveErrMemory("allocating HTML output buffer");
423	return (-1);
424	}
425	memset(outbuf, 0, (size_t) sizeof(xmlOutputBuffer));
426	outbuf->buffer = buf;
427	outbuf->encoder = NULL;
428	outbuf->writecallback = NULL;
429	outbuf->closecallback = NULL;
430	outbuf->context = NULL;
431	outbuf->written = 0;
432
433	use = xmlBufUse(buf);
434	htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format);
435	xmlFree(outbuf);
436	ret = xmlBufUse(buf) - use;
437	return (ret);
438	}
439
440	/**
441	* htmlNodeDump:
442	* @buf: the HTML buffer output
443	* @doc: the document
444	* @cur: the current node
445	*
446	* Dump an HTML node, recursive behaviour,children are printed too,
447	* and formatting returns are added.
448	*
449	* Returns the number of byte written or -1 in case of error
450	*/
451	int
452	htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
453	xmlBufPtr buffer;
454	size_t ret;
455
456	if ((buf == NULL) \|\| (cur == NULL))
457	return(-1);
458
459	xmlInitParser();
460	buffer = xmlBufFromBuffer(buf);
461	if (buffer == NULL)
462	return(-1);
463
464	ret = htmlBufNodeDumpFormat(buffer, doc, cur, 1);
465
466	xmlBufBackToBuffer(buffer);
467
468	if (ret > INT_MAX)
469	return(-1);
470	return((int) ret);
471	}
472
473	/**
474	* htmlNodeDumpFileFormat:
475	* @out: the FILE pointer
476	* @doc: the document
477	* @cur: the current node
478	* @encoding: the document encoding
479	* @format: should formatting spaces been added
480	*
481	* Dump an HTML node, recursive behaviour,children are printed too.
482	*
483	* TODO: if encoding == NULL try to save in the doc encoding
484	*
485	* returns: the number of byte written or -1 in case of failure.
486	*/
487	int
488	htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc,
489	xmlNodePtr cur, const char *encoding, int format) {
490	xmlOutputBufferPtr buf;
491	xmlCharEncodingHandlerPtr handler = NULL;
492	int ret;
493
494	xmlInitParser();
495
496	if (encoding != NULL) {
497	xmlCharEncoding enc;
498
499	enc = xmlParseCharEncoding(encoding);
500	if (enc != XML_CHAR_ENCODING_UTF8) {
501	handler = xmlFindCharEncodingHandler(encoding);
502	if (handler == NULL)
503	htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
504	}
505	} else {
506	/*
507	* Fallback to HTML or ASCII when the encoding is unspecified
508	*/
509	if (handler == NULL)
510	handler = xmlFindCharEncodingHandler("HTML");
511	if (handler == NULL)
512	handler = xmlFindCharEncodingHandler("ascii");
513	}
514
515	/*
516	* save the content to a temp buffer.
517	*/
518	buf = xmlOutputBufferCreateFile(out, handler);
519	if (buf == NULL) return(0);
520
521	htmlNodeDumpFormatOutput(buf, doc, cur, NULL, format);
522
523	ret = xmlOutputBufferClose(buf);
524	return(ret);
525	}
526
527	/**
528	* htmlNodeDumpFile:
529	* @out: the FILE pointer
530	* @doc: the document
531	* @cur: the current node
532	*
533	* Dump an HTML node, recursive behaviour,children are printed too,
534	* and formatting returns are added.
535	*/
536	void
537	htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
538	htmlNodeDumpFileFormat(out, doc, cur, NULL, 1);
539	}
540
541	/**
542	* htmlDocDumpMemoryFormat:
543	* @cur: the document
544	* @mem: OUT: the memory pointer
545	* @size: OUT: the memory length
546	* @format: should formatting spaces been added
547	*
548	* Dump an HTML document in memory and return the xmlChar * and it's size.
549	* It's up to the caller to free the memory.
550	*/
551	void
552	htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar*mem, int size, int format) {
553	xmlOutputBufferPtr buf;
554	xmlCharEncodingHandlerPtr handler = NULL;
555	const char *encoding;
556
557	xmlInitParser();
558
559	if ((mem == NULL) \|\| (size == NULL))
560	return;
561	if (cur == NULL) {
562	*mem = NULL;
563	*size = 0;
564	return;
565	}
566
567	encoding = (const char *) htmlGetMetaEncoding(cur);
568
569	if (encoding != NULL) {
570	xmlCharEncoding enc;
571
572	enc = xmlParseCharEncoding(encoding);
573	if (enc != XML_CHAR_ENCODING_UTF8) {
574	handler = xmlFindCharEncodingHandler(encoding);
575	if (handler == NULL)
576	htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
577
578	}
579	} else {
580	/*
581	* Fallback to HTML or ASCII when the encoding is unspecified
582	*/
583	if (handler == NULL)
584	handler = xmlFindCharEncodingHandler("HTML");
585	if (handler == NULL)
586	handler = xmlFindCharEncodingHandler("ascii");
587	}
588
589	buf = xmlAllocOutputBufferInternal(handler);
590	if (buf == NULL) {
591	*mem = NULL;
592	*size = 0;
593	return;
594	}
595
596	htmlDocContentDumpFormatOutput(buf, cur, NULL, format);
597
598	xmlOutputBufferFlush(buf);
599	if (buf->conv != NULL) {
600	*size = xmlBufUse(buf->conv);
601	mem = xmlStrndup(xmlBufContent(buf->conv), size);
602	} else {
603	*size = xmlBufUse(buf->buffer);
604	mem = xmlStrndup(xmlBufContent(buf->buffer), size);
605	}
606	(void)xmlOutputBufferClose(buf);
607	}
608
609	/**
610	* htmlDocDumpMemory:
611	* @cur: the document
612	* @mem: OUT: the memory pointer
613	* @size: OUT: the memory length
614	*
615	* Dump an HTML document in memory and return the xmlChar * and it's size.
616	* It's up to the caller to free the memory.
617	*/
618	void
619	htmlDocDumpMemory(xmlDocPtr cur, xmlChar*mem, int size) {
620	htmlDocDumpMemoryFormat(cur, mem, size, 1);
621	}
622
623
624	/************************************************************************
625	* *
626	* Dumping HTML tree content to an I/O output buffer *
627	* *
628	************************************************************************/
629
630	void xmlNsListDumpOutput(xmlOutputBufferPtr buf, xmlNsPtr cur);
631
632	/**
633	* htmlDtdDumpOutput:
634	* @buf: the HTML buffer output
635	* @doc: the document
636	* @encoding: the encoding string
637	*
638	* TODO: check whether encoding is needed
639	*
640	* Dump the HTML document DTD, if any.
641	*/
642	static void
643	htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
644	const char *encoding ATTRIBUTE_UNUSED) {
645	xmlDtdPtr cur = doc->intSubset;
646
647	if (cur == NULL) {
648	htmlSaveErr(XML_SAVE_NO_DOCTYPE, (xmlNodePtr) doc, NULL);
649	return;
650	}
651	xmlOutputBufferWriteString(buf, "<!DOCTYPE ");
652	xmlOutputBufferWriteString(buf, (const char *)cur->name);
653	if (cur->ExternalID != NULL) {
654	xmlOutputBufferWriteString(buf, " PUBLIC ");
655	xmlBufWriteQuotedString(buf->buffer, cur->ExternalID);
656	if (cur->SystemID != NULL) {
657	xmlOutputBufferWriteString(buf, " ");
658	xmlBufWriteQuotedString(buf->buffer, cur->SystemID);
659	}
660	} else if (cur->SystemID != NULL &&
661	xmlStrcmp(cur->SystemID, BAD_CAST "about:legacy-compat")) {
662	xmlOutputBufferWriteString(buf, " SYSTEM ");
663	xmlBufWriteQuotedString(buf->buffer, cur->SystemID);
664	}
665	xmlOutputBufferWriteString(buf, ">\n");
666	}
667
668	/**
669	* htmlAttrDumpOutput:
670	* @buf: the HTML buffer output
671	* @doc: the document
672	* @cur: the attribute pointer
673	*
674	* Dump an HTML attribute
675	*/
676	static void
677	htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) {
678	xmlChar *value;
679
680	/*
681	* The html output method should not escape a & character
682	* occurring in an attribute value immediately followed by
683	* a { character (see Section B.7.1 of the HTML 4.0 Recommendation).
684	* This is implemented in xmlEncodeEntitiesReentrant
685	*/
686
687	if (cur == NULL) {
688	return;
689	}
690	xmlOutputBufferWriteString(buf, " ");
691	if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
692	xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
693	xmlOutputBufferWriteString(buf, ":");
694	}
695	xmlOutputBufferWriteString(buf, (const char *)cur->name);
696	if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) {
697	value = xmlNodeListGetString(doc, cur->children, 0);
698	if (value) {
699	xmlOutputBufferWriteString(buf, "=");
700	if ((cur->ns == NULL) && (cur->parent != NULL) &&
701	(cur->parent->ns == NULL) &&
702	((!xmlStrcasecmp(cur->name, BAD_CAST "href")) \|\|
703	(!xmlStrcasecmp(cur->name, BAD_CAST "action")) \|\|
704	(!xmlStrcasecmp(cur->name, BAD_CAST "src")) \|\|
705	((!xmlStrcasecmp(cur->name, BAD_CAST "name")) &&
706	(!xmlStrcasecmp(cur->parent->name, BAD_CAST "a"))))) {
707	xmlChar *escaped;
708	xmlChar *tmp = value;
709
710	while (IS_BLANK_CH(*tmp)) tmp++;
711
712	/*
713	* the < and > have already been escaped at the entity level
714	* And doing so here breaks server side includes
715	*/
716	escaped = xmlURIEscapeStr(tmp, BAD_CAST"@/:=?;#%&,+<>");
717	if (escaped != NULL) {
718	xmlBufWriteQuotedString(buf->buffer, escaped);
719	xmlFree(escaped);
720	} else {
721	xmlBufWriteQuotedString(buf->buffer, value);
722	}
723	} else {
724	xmlBufWriteQuotedString(buf->buffer, value);
725	}
726	xmlFree(value);
727	} else {
728	xmlOutputBufferWriteString(buf, "=\"\"");
729	}
730	}
731	}
732
733	/**
734	* htmlNodeDumpFormatOutput:
735	* @buf: the HTML buffer output
736	* @doc: the document
737	* @cur: the current node
738	* @encoding: the encoding string (unused)
739	* @format: should formatting spaces been added
740	*
741	* Dump an HTML node, recursive behaviour,children are printed too.
742	*/
743	void
744	htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
745	xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED,
746	int format) {
747	xmlNodePtr root, parent;
748	xmlAttrPtr attr;
749	const htmlElemDesc * info;
750
751	xmlInitParser();
752
753	if ((cur == NULL) \|\| (buf == NULL)) {
754	return;
755	}
756
757	root = cur;
758	parent = cur->parent;
759	while (1) {
760	switch (cur->type) {
761	case XML_HTML_DOCUMENT_NODE:
762	case XML_DOCUMENT_NODE:
763	if (((xmlDocPtr) cur)->intSubset != NULL) {
764	htmlDtdDumpOutput(buf, (xmlDocPtr) cur, NULL);
765	}
766	if (cur->children != NULL) {
767	/* Always validate cur->parent when descending. */
768	if (cur->parent == parent) {
769	parent = cur;
770	cur = cur->children;
771	continue;
772	}
773	} else {
774	xmlOutputBufferWriteString(buf, "\n");
775	}
776	break;
777
778	case XML_ELEMENT_NODE:
779	/*
780	* Some users like lxml are known to pass nodes with a corrupted
781	* tree structure. Fall back to a recursive call to handle this
782	* case.
783	*/
784	if ((cur->parent != parent) && (cur->children != NULL)) {
785	htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
786	break;
787	}
788
789	/*
790	* Get specific HTML info for that node.
791	*/
792	if (cur->ns == NULL)
793	info = htmlTagLookup(cur->name);
794	else
795	info = NULL;
796
797	xmlOutputBufferWriteString(buf, "<");
798	if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
799	xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
800	xmlOutputBufferWriteString(buf, ":");
801	}
802	xmlOutputBufferWriteString(buf, (const char *)cur->name);
803	if (cur->nsDef)
804	xmlNsListDumpOutput(buf, cur->nsDef);
805	attr = cur->properties;
806	while (attr != NULL) {
807	htmlAttrDumpOutput(buf, doc, attr);
808	attr = attr->next;
809	}
810
811	if ((info != NULL) && (info->empty)) {
812	xmlOutputBufferWriteString(buf, ">");
813	} else if (cur->children == NULL) {
814	if ((info != NULL) && (info->saveEndTag != 0) &&
815	(xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) &&
816	(xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) {
817	xmlOutputBufferWriteString(buf, ">");
818	} else {
819	xmlOutputBufferWriteString(buf, "></");
820	if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
821	xmlOutputBufferWriteString(buf,
822	(const char *)cur->ns->prefix);
823	xmlOutputBufferWriteString(buf, ":");
824	}
825	xmlOutputBufferWriteString(buf, (const char *)cur->name);
826	xmlOutputBufferWriteString(buf, ">");
827	}
828	} else {
829	xmlOutputBufferWriteString(buf, ">");
830	if ((format) && (info != NULL) && (!info->isinline) &&
831	(cur->children->type != HTML_TEXT_NODE) &&
832	(cur->children->type != HTML_ENTITY_REF_NODE) &&
833	(cur->children != cur->last) &&
834	(cur->name != NULL) &&
835	(cur->name[0] != 'p')) /* p, pre, param */
836	xmlOutputBufferWriteString(buf, "\n");
837	parent = cur;
838	cur = cur->children;
839	continue;
840	}
841
842	if ((format) && (cur->next != NULL) &&
843	(info != NULL) && (!info->isinline)) {
844	if ((cur->next->type != HTML_TEXT_NODE) &&
845	(cur->next->type != HTML_ENTITY_REF_NODE) &&
846	(parent != NULL) &&
847	(parent->name != NULL) &&
848	(parent->name[0] != 'p')) /* p, pre, param */
849	xmlOutputBufferWriteString(buf, "\n");
850	}
851
852	break;
853
854	case XML_ATTRIBUTE_NODE:
855	htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur);
856	break;
857
858	case HTML_TEXT_NODE:
859	if (cur->content == NULL)
860	break;
861	if (((cur->name == (const xmlChar *)xmlStringText) \|\|
862	(cur->name != (const xmlChar *)xmlStringTextNoenc)) &&
863	((parent == NULL) \|\|
864	((xmlStrcasecmp(parent->name, BAD_CAST "script")) &&
865	(xmlStrcasecmp(parent->name, BAD_CAST "style"))))) {
866	xmlChar *buffer;
867
868	buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
869	if (buffer != NULL) {
870	xmlOutputBufferWriteString(buf, (const char *)buffer);
871	xmlFree(buffer);
872	}
873	} else {
874	xmlOutputBufferWriteString(buf, (const char *)cur->content);
875	}
876	break;
877
878	case HTML_COMMENT_NODE:
879	if (cur->content != NULL) {
880	xmlOutputBufferWriteString(buf, "<!--");
881	xmlOutputBufferWriteString(buf, (const char *)cur->content);
882	xmlOutputBufferWriteString(buf, "-->");
883	}
884	break;
885
886	case HTML_PI_NODE:
887	if (cur->name != NULL) {
888	xmlOutputBufferWriteString(buf, "<?");
889	xmlOutputBufferWriteString(buf, (const char *)cur->name);
890	if (cur->content != NULL) {
891	xmlOutputBufferWriteString(buf, " ");
892	xmlOutputBufferWriteString(buf,
893	(const char *)cur->content);
894	}
895	xmlOutputBufferWriteString(buf, ">");
896	}
897	break;
898
899	case HTML_ENTITY_REF_NODE:
900	xmlOutputBufferWriteString(buf, "&");
901	xmlOutputBufferWriteString(buf, (const char *)cur->name);
902	xmlOutputBufferWriteString(buf, ";");
903	break;
904
905	case HTML_PRESERVE_NODE:
906	if (cur->content != NULL) {
907	xmlOutputBufferWriteString(buf, (const char *)cur->content);
908	}
909	break;
910
911	default:
912	break;
913	}
914
915	while (1) {
916	if (cur == root)
917	return;
918	if (cur->next != NULL) {
919	cur = cur->next;
920	break;
921	}
922
923	cur = parent;
924	/* cur->parent was validated when descending. */
925	parent = cur->parent;
926
927	if ((cur->type == XML_HTML_DOCUMENT_NODE) \|\|
928	(cur->type == XML_DOCUMENT_NODE)) {
929	xmlOutputBufferWriteString(buf, "\n");
930	} else {
931	if ((format) && (cur->ns == NULL))
932	info = htmlTagLookup(cur->name);
933	else
934	info = NULL;
935
936	if ((format) && (info != NULL) && (!info->isinline) &&
937	(cur->last->type != HTML_TEXT_NODE) &&
938	(cur->last->type != HTML_ENTITY_REF_NODE) &&
939	(cur->children != cur->last) &&
940	(cur->name != NULL) &&
941	(cur->name[0] != 'p')) /* p, pre, param */
942	xmlOutputBufferWriteString(buf, "\n");
943
944	xmlOutputBufferWriteString(buf, "</");
945	if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
946	xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
947	xmlOutputBufferWriteString(buf, ":");
948	}
949	xmlOutputBufferWriteString(buf, (const char *)cur->name);
950	xmlOutputBufferWriteString(buf, ">");
951
952	if ((format) && (info != NULL) && (!info->isinline) &&
953	(cur->next != NULL)) {
954	if ((cur->next->type != HTML_TEXT_NODE) &&
955	(cur->next->type != HTML_ENTITY_REF_NODE) &&
956	(parent != NULL) &&
957	(parent->name != NULL) &&
958	(parent->name[0] != 'p')) /* p, pre, param */
959	xmlOutputBufferWriteString(buf, "\n");
960	}
961	}
962	}
963	}
964	}
965
966	/**
967	* htmlNodeDumpOutput:
968	* @buf: the HTML buffer output
969	* @doc: the document
970	* @cur: the current node
971	* @encoding: the encoding string (unused)
972	*
973	* Dump an HTML node, recursive behaviour,children are printed too,
974	* and formatting returns/spaces are added.
975	*/
976	void
977	htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
978	xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED) {
979	htmlNodeDumpFormatOutput(buf, doc, cur, NULL, 1);
980	}
981
982	/**
983	* htmlDocContentDumpFormatOutput:
984	* @buf: the HTML buffer output
985	* @cur: the document
986	* @encoding: the encoding string (unused)
987	* @format: should formatting spaces been added
988	*
989	* Dump an HTML document.
990	*/
991	void
992	htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
993	const char *encoding ATTRIBUTE_UNUSED,
994	int format) {
995	htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, format);
996	}
997
998	/**
999	* htmlDocContentDumpOutput:
1000	* @buf: the HTML buffer output
1001	* @cur: the document
1002	* @encoding: the encoding string (unused)
1003	*
1004	* Dump an HTML document. Formatting return/spaces are added.
1005	*/
1006	void
1007	htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
1008	const char *encoding ATTRIBUTE_UNUSED) {
1009	htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, 1);
1010	}
1011
1012	/************************************************************************
1013	* *
1014	* Saving functions front-ends *
1015	* *
1016	************************************************************************/
1017
1018	/**
1019	* htmlDocDump:
1020	* @f: the FILE*
1021	* @cur: the document
1022	*
1023	* Dump an HTML document to an open FILE.
1024	*
1025	* returns: the number of byte written or -1 in case of failure.
1026	*/
1027	int
1028	htmlDocDump(FILE *f, xmlDocPtr cur) {
1029	xmlOutputBufferPtr buf;
1030	xmlCharEncodingHandlerPtr handler = NULL;
1031	const char *encoding;
1032	int ret;
1033
1034	xmlInitParser();
1035
1036	if ((cur == NULL) \|\| (f == NULL)) {
1037	return(-1);
1038	}
1039
1040	encoding = (const char *) htmlGetMetaEncoding(cur);
1041
1042	if (encoding != NULL) {
1043	xmlCharEncoding enc;
1044
1045	enc = xmlParseCharEncoding(encoding);
1046	if (enc != XML_CHAR_ENCODING_UTF8) {
1047	handler = xmlFindCharEncodingHandler(encoding);
1048	if (handler == NULL)
1049	htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1050	}
1051	} else {
1052	/*
1053	* Fallback to HTML or ASCII when the encoding is unspecified
1054	*/
1055	if (handler == NULL)
1056	handler = xmlFindCharEncodingHandler("HTML");
1057	if (handler == NULL)
1058	handler = xmlFindCharEncodingHandler("ascii");
1059	}
1060
1061	buf = xmlOutputBufferCreateFile(f, handler);
1062	if (buf == NULL) return(-1);
1063	htmlDocContentDumpOutput(buf, cur, NULL);
1064
1065	ret = xmlOutputBufferClose(buf);
1066	return(ret);
1067	}
1068
1069	/**
1070	* htmlSaveFile:
1071	* @filename: the filename (or URL)
1072	* @cur: the document
1073	*
1074	* Dump an HTML document to a file. If @filename is "-" the stdout file is
1075	* used.
1076	* returns: the number of byte written or -1 in case of failure.
1077	*/
1078	int
1079	htmlSaveFile(const char *filename, xmlDocPtr cur) {
1080	xmlOutputBufferPtr buf;
1081	xmlCharEncodingHandlerPtr handler = NULL;
1082	const char *encoding;
1083	int ret;
1084
1085	if ((cur == NULL) \|\| (filename == NULL))
1086	return(-1);
1087
1088	xmlInitParser();
1089
1090	encoding = (const char *) htmlGetMetaEncoding(cur);
1091
1092	if (encoding != NULL) {
1093	xmlCharEncoding enc;
1094
1095	enc = xmlParseCharEncoding(encoding);
1096	if (enc != XML_CHAR_ENCODING_UTF8) {
1097	handler = xmlFindCharEncodingHandler(encoding);
1098	if (handler == NULL)
1099	htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1100	}
1101	} else {
1102	/*
1103	* Fallback to HTML or ASCII when the encoding is unspecified
1104	*/
1105	if (handler == NULL)
1106	handler = xmlFindCharEncodingHandler("HTML");
1107	if (handler == NULL)
1108	handler = xmlFindCharEncodingHandler("ascii");
1109	}
1110
1111	/*
1112	* save the content to a temp buffer.
1113	*/
1114	buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
1115	if (buf == NULL) return(0);
1116
1117	htmlDocContentDumpOutput(buf, cur, NULL);
1118
1119	ret = xmlOutputBufferClose(buf);
1120	return(ret);
1121	}
1122
1123	/**
1124	* htmlSaveFileFormat:
1125	* @filename: the filename
1126	* @cur: the document
1127	* @format: should formatting spaces been added
1128	* @encoding: the document encoding
1129	*
1130	* Dump an HTML document to a file using a given encoding.
1131	*
1132	* returns: the number of byte written or -1 in case of failure.
1133	*/
1134	int
1135	htmlSaveFileFormat(const char *filename, xmlDocPtr cur,
1136	const char *encoding, int format) {
1137	xmlOutputBufferPtr buf;
1138	xmlCharEncodingHandlerPtr handler = NULL;
1139	int ret;
1140
1141	if ((cur == NULL) \|\| (filename == NULL))
1142	return(-1);
1143
1144	xmlInitParser();
1145
1146	if (encoding != NULL) {
1147	xmlCharEncoding enc;
1148
1149	enc = xmlParseCharEncoding(encoding);
1150	if (enc != XML_CHAR_ENCODING_UTF8) {
1151	handler = xmlFindCharEncodingHandler(encoding);
1152	if (handler == NULL)
1153	htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1154	}
1155	htmlSetMetaEncoding(cur, (const xmlChar *) encoding);
1156	} else {
1157	htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8");
1158
1159	/*
1160	* Fallback to HTML or ASCII when the encoding is unspecified
1161	*/
1162	if (handler == NULL)
1163	handler = xmlFindCharEncodingHandler("HTML");
1164	if (handler == NULL)
1165	handler = xmlFindCharEncodingHandler("ascii");
1166	}
1167
1168	/*
1169	* save the content to a temp buffer.
1170	*/
1171	buf = xmlOutputBufferCreateFilename(filename, handler, 0);
1172	if (buf == NULL) return(0);
1173
1174	htmlDocContentDumpFormatOutput(buf, cur, encoding, format);
1175
1176	ret = xmlOutputBufferClose(buf);
1177	return(ret);
1178	}
1179
1180	/**
1181	* htmlSaveFileEnc:
1182	* @filename: the filename
1183	* @cur: the document
1184	* @encoding: the document encoding
1185	*
1186	* Dump an HTML document to a file using a given encoding
1187	* and formatting returns/spaces are added.
1188	*
1189	* returns: the number of byte written or -1 in case of failure.
1190	*/
1191	int
1192	htmlSaveFileEnc(const char filename, xmlDocPtr cur, const char encoding) {
1193	return(htmlSaveFileFormat(filename, cur, encoding, 1));
1194	}
1195
1196	#endif /* LIBXML_OUTPUT_ENABLED */
1197
1198	#define bottom_HTMLtree
1199	#include "elfgcchack.h"
1200	#endif /* LIBXML_HTML_ENABLED */

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/libs/libxml2-2.9.14/HTMLtree.c@ 102797

Download in other formats: