HTMLtree.c@ 104932

Last change on this file since 104932 was 104106, checked in by vboxsync, 9 months ago
libxml2-2.9.14: Applied and adjusted our libxml2 changes to 2.9.14. bugref:10640
Property svn:eol-style set to `native`
File size: 33.3 KB

Line
1	/*
2	* HTMLtree.c : implementation of access function for an HTML tree.
3	*
4	* See Copyright for the status of this software.
5	*
6	* daniel@veillard.com
7	*/
8
9
10	#define IN_LIBXML
11	#include "libxml.h"
12	#ifdef LIBXML_HTML_ENABLED
13
14	#include <string.h> /* for memset() only ! */
15	#include <ctype.h>
16	#include <stdlib.h>
17
18	#include <libxml/xmlmemory.h>
19	#include <libxml/HTMLparser.h>
20	#include <libxml/HTMLtree.h>
21	#include <libxml/entities.h>
22	#include <libxml/xmlerror.h>
23	#include <libxml/parserInternals.h>
24	#include <libxml/uri.h>
25
26	#include "private/buf.h"
27	#include "private/error.h"
28	#include "private/io.h"
29	#include "private/save.h"
30
31	/************************************************************************
32	* *
33	* Getting/Setting encoding meta tags *
34	* *
35	************************************************************************/
36
37	/**
38	* htmlGetMetaEncoding:
39	* @doc: the document
40	*
41	* Encoding definition lookup in the Meta tags
42	*
43	* Returns the current encoding as flagged in the HTML source
44	*/
45	const xmlChar *
46	htmlGetMetaEncoding(htmlDocPtr doc) {
47	htmlNodePtr cur;
48	const xmlChar *content;
49	const xmlChar *encoding;
50
51	if (doc == NULL)
52	return(NULL);
53	cur = doc->children;
54
55	/*
56	* Search the html
57	*/
58	while (cur != NULL) {
59	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
60	if (xmlStrEqual(cur->name, BAD_CAST"html"))
61	break;
62	if (xmlStrEqual(cur->name, BAD_CAST"head"))
63	goto found_head;
64	if (xmlStrEqual(cur->name, BAD_CAST"meta"))
65	goto found_meta;
66	}
67	cur = cur->next;
68	}
69	if (cur == NULL)
70	return(NULL);
71	cur = cur->children;
72
73	/*
74	* Search the head
75	*/
76	while (cur != NULL) {
77	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
78	if (xmlStrEqual(cur->name, BAD_CAST"head"))
79	break;
80	if (xmlStrEqual(cur->name, BAD_CAST"meta"))
81	goto found_meta;
82	}
83	cur = cur->next;
84	}
85	if (cur == NULL)
86	return(NULL);
87	found_head:
88	cur = cur->children;
89
90	/*
91	* Search the meta elements
92	*/
93	found_meta:
94	while (cur != NULL) {
95	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
96	if (xmlStrEqual(cur->name, BAD_CAST"meta")) {
97	xmlAttrPtr attr = cur->properties;
98	int http;
99	const xmlChar *value;
100
101	content = NULL;
102	http = 0;
103	while (attr != NULL) {
104	if ((attr->children != NULL) &&
105	(attr->children->type == XML_TEXT_NODE) &&
106	(attr->children->next == NULL)) {
107	value = attr->children->content;
108	if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
109	&& (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
110	http = 1;
111	else if ((value != NULL)
112	&& (!xmlStrcasecmp(attr->name, BAD_CAST"content")))
113	content = value;
114	if ((http != 0) && (content != NULL))
115	goto found_content;
116	}
117	attr = attr->next;
118	}
119	}
120	}
121	cur = cur->next;
122	}
123	return(NULL);
124
125	found_content:
126	encoding = xmlStrstr(content, BAD_CAST"charset=");
127	if (encoding == NULL)
128	encoding = xmlStrstr(content, BAD_CAST"Charset=");
129	if (encoding == NULL)
130	encoding = xmlStrstr(content, BAD_CAST"CHARSET=");
131	if (encoding != NULL) {
132	encoding += 8;
133	} else {
134	encoding = xmlStrstr(content, BAD_CAST"charset =");
135	if (encoding == NULL)
136	encoding = xmlStrstr(content, BAD_CAST"Charset =");
137	if (encoding == NULL)
138	encoding = xmlStrstr(content, BAD_CAST"CHARSET =");
139	if (encoding != NULL)
140	encoding += 9;
141	}
142	if (encoding != NULL) {
143	while ((encoding == ' ') \|\| (encoding == '\t')) encoding++;
144	}
145	return(encoding);
146	}
147
148	/**
149	* htmlSetMetaEncoding:
150	* @doc: the document
151	* @encoding: the encoding string
152	*
153	* Sets the current encoding in the Meta tags
154	* NOTE: this will not change the document content encoding, just
155	* the META flag associated.
156	*
157	* Returns 0 in case of success and -1 in case of error
158	*/
159	int
160	htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) {
161	htmlNodePtr cur, meta = NULL, head = NULL;
162	const xmlChar *content = NULL;
163	char newcontent[100];
164
165	newcontent[0] = 0;
166
167	if (doc == NULL)
168	return(-1);
169
170	/* html isn't a real encoding it's just libxml2 way to get entities */
171	if (!xmlStrcasecmp(encoding, BAD_CAST "html"))
172	return(-1);
173
174	if (encoding != NULL) {
175	snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s",
176	(char *)encoding);
177	newcontent[sizeof(newcontent) - 1] = 0;
178	}
179
180	cur = doc->children;
181
182	/*
183	* Search the html
184	*/
185	while (cur != NULL) {
186	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
187	if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0)
188	break;
189	if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
190	goto found_head;
191	if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0)
192	goto found_meta;
193	}
194	cur = cur->next;
195	}
196	if (cur == NULL)
197	return(-1);
198	cur = cur->children;
199
200	/*
201	* Search the head
202	*/
203	while (cur != NULL) {
204	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
205	if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0)
206	break;
207	if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
208	head = cur->parent;
209	goto found_meta;
210	}
211	}
212	cur = cur->next;
213	}
214	if (cur == NULL)
215	return(-1);
216	found_head:
217	head = cur;
218	if (cur->children == NULL)
219	goto create;
220	cur = cur->children;
221
222	found_meta:
223	/*
224	* Search and update all the remaining the meta elements carrying
225	* encoding information
226	*/
227	while (cur != NULL) {
228	if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) {
229	if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) {
230	xmlAttrPtr attr = cur->properties;
231	int http;
232	const xmlChar *value;
233
234	content = NULL;
235	http = 0;
236	while (attr != NULL) {
237	if ((attr->children != NULL) &&
238	(attr->children->type == XML_TEXT_NODE) &&
239	(attr->children->next == NULL)) {
240	value = attr->children->content;
241	if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv"))
242	&& (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
243	http = 1;
244	else
245	{
246	if ((value != NULL) &&
247	(!xmlStrcasecmp(attr->name, BAD_CAST"content")))
248	content = value;
249	}
250	if ((http != 0) && (content != NULL))
251	break;
252	}
253	attr = attr->next;
254	}
255	if ((http != 0) && (content != NULL)) {
256	meta = cur;
257	break;
258	}
259
260	}
261	}
262	cur = cur->next;
263	}
264	create:
265	if (meta == NULL) {
266	if ((encoding != NULL) && (head != NULL)) {
267	/*
268	* Create a new Meta element with the right attributes
269	*/
270
271	meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL);
272	if (head->children == NULL)
273	xmlAddChild(head, meta);
274	else
275	xmlAddPrevSibling(head->children, meta);
276	xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type");
277	xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent);
278	}
279	} else {
280	/* remove the meta tag if NULL is passed */
281	if (encoding == NULL) {
282	xmlUnlinkNode(meta);
283	xmlFreeNode(meta);
284	}
285	/* change the document only if there is a real encoding change */
286	else if (xmlStrcasestr(content, encoding) == NULL) {
287	xmlSetProp(meta, BAD_CAST"content", BAD_CAST newcontent);
288	}
289	}
290
291
292	return(0);
293	}
294
/**
 * htmlBooleanAttrs:
 *
 * NULL-terminated list of the HTML attributes which will be output
 * in minimized form, i.e. <option selected="selected"> will be
 * output as <option selected>, as per XSLT 1.0 16.2 "HTML Output Method"
 *
 */
static const char* const htmlBooleanAttrs[] = {
    "checked",
    "compact",
    "declare",
    "defer",
    "disabled",
    "ismap",
    "multiple",
    "nohref",
    "noresize",
    "noshade",
    "nowrap",
    "readonly",
    "selected",
    NULL
};
308
309
310	/**
311	* htmlIsBooleanAttr:
312	* @name: the name of the attribute to check
313	*
314	* Determine if a given attribute is a boolean attribute.
315	*
316	* returns: false if the attribute is not boolean, true otherwise.
317	*/
318	int
319	htmlIsBooleanAttr(const xmlChar *name)
320	{
321	int i = 0;
322
323	while (htmlBooleanAttrs[i] != NULL) {
324	if (xmlStrcasecmp((const xmlChar *)htmlBooleanAttrs[i], name) == 0)
325	return 1;
326	i++;
327	}
328	return 0;
329	}
330
331	#ifdef LIBXML_OUTPUT_ENABLED
332	/************************************************************************
333	* *
334	* Output error handlers *
335	* *
336	************************************************************************/
337	/**
338	* htmlSaveErrMemory:
339	* @extra: extra information
340	*
341	* Handle an out of memory condition
342	*/
343	static void
344	htmlSaveErrMemory(const char *extra)
345	{
346	__xmlSimpleError(XML_FROM_OUTPUT, XML_ERR_NO_MEMORY, NULL, NULL, extra);
347	}
348
349	/**
350	* htmlSaveErr:
351	* @code: the error number
352	* @node: the location of the error.
353	* @extra: extra information
354	*
355	* Handle an out of memory condition
356	*/
357	static void
358	htmlSaveErr(int code, xmlNodePtr node, const char *extra)
359	{
360	const char *msg = NULL;
361
362	switch(code) {
363	case XML_SAVE_NOT_UTF8:
364	msg = "string is not in UTF-8\n";
365	break;
366	case XML_SAVE_CHAR_INVALID:
367	msg = "invalid character value\n";
368	break;
369	case XML_SAVE_UNKNOWN_ENCODING:
370	msg = "unknown encoding %s\n";
371	break;
372	case XML_SAVE_NO_DOCTYPE:
373	msg = "HTML has no DOCTYPE\n";
374	break;
375	default:
376	msg = "unexpected error number\n";
377	}
378	__xmlSimpleError(XML_FROM_OUTPUT, code, node, msg, extra);
379	}
380
381	/************************************************************************
382	* *
383	* Dumping HTML tree content to a simple buffer *
384	* *
385	************************************************************************/
386
387	/**
388	* htmlBufNodeDumpFormat:
389	* @buf: the xmlBufPtr output
390	* @doc: the document
391	* @cur: the current node
392	* @format: should formatting spaces been added
393	*
394	* Dump an HTML node, recursive behaviour,children are printed too.
395	*
396	* Returns the number of byte written or -1 in case of error
397	*/
398	static size_t
399	htmlBufNodeDumpFormat(xmlBufPtr buf, xmlDocPtr doc, xmlNodePtr cur,
400	int format) {
401	size_t use;
402	int ret;
403	xmlOutputBufferPtr outbuf;
404
405	if (cur == NULL) {
406	return (-1);
407	}
408	if (buf == NULL) {
409	return (-1);
410	}
411	outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer));
412	if (outbuf == NULL) {
413	htmlSaveErrMemory("allocating HTML output buffer");
414	return (-1);
415	}
416	memset(outbuf, 0, sizeof(xmlOutputBuffer));
417	outbuf->buffer = buf;
418	outbuf->encoder = NULL;
419	outbuf->writecallback = NULL;
420	outbuf->closecallback = NULL;
421	outbuf->context = NULL;
422	outbuf->written = 0;
423
424	use = xmlBufUse(buf);
425	htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format);
426	xmlFree(outbuf);
427	ret = xmlBufUse(buf) - use;
428	return (ret);
429	}
430
431	/**
432	* htmlNodeDump:
433	* @buf: the HTML buffer output
434	* @doc: the document
435	* @cur: the current node
436	*
437	* Dump an HTML node, recursive behaviour,children are printed too,
438	* and formatting returns are added.
439	*
440	* Returns the number of byte written or -1 in case of error
441	*/
442	int
443	htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) {
444	xmlBufPtr buffer;
445	size_t ret;
446
447	if ((buf == NULL) \|\| (cur == NULL))
448	return(-1);
449
450	xmlInitParser();
451	buffer = xmlBufFromBuffer(buf);
452	if (buffer == NULL)
453	return(-1);
454
455	ret = htmlBufNodeDumpFormat(buffer, doc, cur, 1);
456
457	xmlBufBackToBuffer(buffer);
458
459	if (ret > INT_MAX)
460	return(-1);
461	return((int) ret);
462	}
463
464	/**
465	* htmlNodeDumpFileFormat:
466	* @out: the FILE pointer
467	* @doc: the document
468	* @cur: the current node
469	* @encoding: the document encoding
470	* @format: should formatting spaces been added
471	*
472	* Dump an HTML node, recursive behaviour,children are printed too.
473	*
474	* TODO: if encoding == NULL try to save in the doc encoding
475	*
476	* returns: the number of byte written or -1 in case of failure.
477	*/
478	int
479	htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc,
480	xmlNodePtr cur, const char *encoding, int format) {
481	xmlOutputBufferPtr buf;
482	xmlCharEncodingHandlerPtr handler = NULL;
483	int ret;
484
485	xmlInitParser();
486
487	if (encoding != NULL) {
488	xmlCharEncoding enc;
489
490	enc = xmlParseCharEncoding(encoding);
491	if (enc != XML_CHAR_ENCODING_UTF8) {
492	handler = xmlFindCharEncodingHandler(encoding);
493	if (handler == NULL)
494	htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
495	}
496	} else {
497	/*
498	* Fallback to HTML or ASCII when the encoding is unspecified
499	*/
500	if (handler == NULL)
501	handler = xmlFindCharEncodingHandler("HTML");
502	if (handler == NULL)
503	handler = xmlFindCharEncodingHandler("ascii");
504	}
505
506	/*
507	* save the content to a temp buffer.
508	*/
509	buf = xmlOutputBufferCreateFile(out, handler);
510	if (buf == NULL) return(0);
511
512	htmlNodeDumpFormatOutput(buf, doc, cur, NULL, format);
513
514	ret = xmlOutputBufferClose(buf);
515	return(ret);
516	}
517
518	/**
519	* htmlNodeDumpFile:
520	* @out: the FILE pointer
521	* @doc: the document
522	* @cur: the current node
523	*
524	* Dump an HTML node, recursive behaviour,children are printed too,
525	* and formatting returns are added.
526	*/
527	void
528	htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) {
529	htmlNodeDumpFileFormat(out, doc, cur, NULL, 1);
530	}
531
532	/**
533	* htmlDocDumpMemoryFormat:
534	* @cur: the document
535	* @mem: OUT: the memory pointer
536	* @size: OUT: the memory length
537	* @format: should formatting spaces been added
538	*
539	* Dump an HTML document in memory and return the xmlChar * and it's size.
540	* It's up to the caller to free the memory.
541	*/
542	void
543	htmlDocDumpMemoryFormat(xmlDocPtr cur, xmlChar*mem, int size, int format) {
544	xmlOutputBufferPtr buf;
545	xmlCharEncodingHandlerPtr handler = NULL;
546	const char *encoding;
547
548	xmlInitParser();
549
550	if ((mem == NULL) \|\| (size == NULL))
551	return;
552	if (cur == NULL) {
553	*mem = NULL;
554	*size = 0;
555	return;
556	}
557
558	encoding = (const char *) htmlGetMetaEncoding(cur);
559
560	if (encoding != NULL) {
561	xmlCharEncoding enc;
562
563	enc = xmlParseCharEncoding(encoding);
564	if (enc != XML_CHAR_ENCODING_UTF8) {
565	handler = xmlFindCharEncodingHandler(encoding);
566	if (handler == NULL)
567	htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
568
569	}
570	} else {
571	/*
572	* Fallback to HTML or ASCII when the encoding is unspecified
573	*/
574	if (handler == NULL)
575	handler = xmlFindCharEncodingHandler("HTML");
576	if (handler == NULL)
577	handler = xmlFindCharEncodingHandler("ascii");
578	}
579
580	buf = xmlAllocOutputBufferInternal(handler);
581	if (buf == NULL) {
582	*mem = NULL;
583	*size = 0;
584	return;
585	}
586
587	htmlDocContentDumpFormatOutput(buf, cur, NULL, format);
588
589	xmlOutputBufferFlush(buf);
590	if (buf->conv != NULL) {
591	*size = xmlBufUse(buf->conv);
592	mem = xmlStrndup(xmlBufContent(buf->conv), size);
593	} else {
594	*size = xmlBufUse(buf->buffer);
595	mem = xmlStrndup(xmlBufContent(buf->buffer), size);
596	}
597	(void)xmlOutputBufferClose(buf);
598	}
599
600	/**
601	* htmlDocDumpMemory:
602	* @cur: the document
603	* @mem: OUT: the memory pointer
604	* @size: OUT: the memory length
605	*
606	* Dump an HTML document in memory and return the xmlChar * and it's size.
607	* It's up to the caller to free the memory.
608	*/
609	void
610	htmlDocDumpMemory(xmlDocPtr cur, xmlChar*mem, int size) {
611	htmlDocDumpMemoryFormat(cur, mem, size, 1);
612	}
613
614
615	/************************************************************************
616	* *
617	* Dumping HTML tree content to an I/O output buffer *
618	* *
619	************************************************************************/
620
621	/**
622	* htmlDtdDumpOutput:
623	* @buf: the HTML buffer output
624	* @doc: the document
625	* @encoding: the encoding string
626	*
627	* TODO: check whether encoding is needed
628	*
629	* Dump the HTML document DTD, if any.
630	*/
631	static void
632	htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
633	const char *encoding ATTRIBUTE_UNUSED) {
634	xmlDtdPtr cur = doc->intSubset;
635
636	if (cur == NULL) {
637	htmlSaveErr(XML_SAVE_NO_DOCTYPE, (xmlNodePtr) doc, NULL);
638	return;
639	}
640	xmlOutputBufferWriteString(buf, "<!DOCTYPE ");
641	xmlOutputBufferWriteString(buf, (const char *)cur->name);
642	if (cur->ExternalID != NULL) {
643	xmlOutputBufferWriteString(buf, " PUBLIC ");
644	xmlBufWriteQuotedString(buf->buffer, cur->ExternalID);
645	if (cur->SystemID != NULL) {
646	xmlOutputBufferWriteString(buf, " ");
647	xmlBufWriteQuotedString(buf->buffer, cur->SystemID);
648	}
649	} else if (cur->SystemID != NULL &&
650	xmlStrcmp(cur->SystemID, BAD_CAST "about:legacy-compat")) {
651	xmlOutputBufferWriteString(buf, " SYSTEM ");
652	xmlBufWriteQuotedString(buf->buffer, cur->SystemID);
653	}
654	xmlOutputBufferWriteString(buf, ">\n");
655	}
656
657	/**
658	* htmlAttrDumpOutput:
659	* @buf: the HTML buffer output
660	* @doc: the document
661	* @cur: the attribute pointer
662	*
663	* Dump an HTML attribute
664	*/
665	static void
666	htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur) {
667	xmlChar *value;
668
669	/*
670	* The html output method should not escape a & character
671	* occurring in an attribute value immediately followed by
672	* a { character (see Section B.7.1 of the HTML 4.0 Recommendation).
673	* This is implemented in xmlEncodeEntitiesReentrant
674	*/
675
676	if (cur == NULL) {
677	return;
678	}
679	xmlOutputBufferWriteString(buf, " ");
680	if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
681	xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
682	xmlOutputBufferWriteString(buf, ":");
683	}
684	xmlOutputBufferWriteString(buf, (const char *)cur->name);
685	if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) {
686	value = xmlNodeListGetString(doc, cur->children, 0);
687	if (value) {
688	xmlOutputBufferWriteString(buf, "=");
689	if ((cur->ns == NULL) && (cur->parent != NULL) &&
690	(cur->parent->ns == NULL) &&
691	((!xmlStrcasecmp(cur->name, BAD_CAST "href")) \|\|
692	(!xmlStrcasecmp(cur->name, BAD_CAST "action")) \|\|
693	(!xmlStrcasecmp(cur->name, BAD_CAST "src")) \|\|
694	((!xmlStrcasecmp(cur->name, BAD_CAST "name")) &&
695	(!xmlStrcasecmp(cur->parent->name, BAD_CAST "a"))))) {
696	xmlChar *escaped;
697	xmlChar *tmp = value;
698
699	while (IS_BLANK_CH(*tmp)) tmp++;
700
701	/*
702	* Angle brackets are technically illegal in URIs, but they're
703	* used in server side includes, for example. Curly brackets
704	* are illegal as well and often used in templates.
705	* Don't escape non-whitespace, printable ASCII chars for
706	* improved interoperability. Only escape space, control
707	* and non-ASCII chars.
708	*/
709	escaped = xmlURIEscapeStr(tmp,
710	BAD_CAST "\"#$%&+,/:;<=>?@[\\]^`{\|}");
711	if (escaped != NULL) {
712	xmlBufWriteQuotedString(buf->buffer, escaped);
713	xmlFree(escaped);
714	} else {
715	xmlBufWriteQuotedString(buf->buffer, value);
716	}
717	} else {
718	xmlBufWriteQuotedString(buf->buffer, value);
719	}
720	xmlFree(value);
721	} else {
722	xmlOutputBufferWriteString(buf, "=\"\"");
723	}
724	}
725	}
726
727	/**
728	* htmlNodeDumpFormatOutput:
729	* @buf: the HTML buffer output
730	* @doc: the document
731	* @cur: the current node
732	* @encoding: the encoding string (unused)
733	* @format: should formatting spaces been added
734	*
735	* Dump an HTML node, recursive behaviour,children are printed too.
736	*/
737	void
738	htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
739	xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED,
740	int format) {
741	xmlNodePtr root, parent;
742	xmlAttrPtr attr;
743	const htmlElemDesc * info;
744
745	xmlInitParser();
746
747	if ((cur == NULL) \|\| (buf == NULL)) {
748	return;
749	}
750
751	root = cur;
752	parent = cur->parent;
753	while (1) {
754	switch (cur->type) {
755	case XML_HTML_DOCUMENT_NODE:
756	case XML_DOCUMENT_NODE:
757	if (((xmlDocPtr) cur)->intSubset != NULL) {
758	htmlDtdDumpOutput(buf, (xmlDocPtr) cur, NULL);
759	}
760	if (cur->children != NULL) {
761	/* Always validate cur->parent when descending. */
762	if (cur->parent == parent) {
763	parent = cur;
764	cur = cur->children;
765	continue;
766	}
767	} else {
768	xmlOutputBufferWriteString(buf, "\n");
769	}
770	break;
771
772	case XML_ELEMENT_NODE:
773	/*
774	* Some users like lxml are known to pass nodes with a corrupted
775	* tree structure. Fall back to a recursive call to handle this
776	* case.
777	*/
778	if ((cur->parent != parent) && (cur->children != NULL)) {
779	htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format);
780	break;
781	}
782
783	/*
784	* Get specific HTML info for that node.
785	*/
786	if (cur->ns == NULL)
787	info = htmlTagLookup(cur->name);
788	else
789	info = NULL;
790
791	xmlOutputBufferWriteString(buf, "<");
792	if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
793	xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
794	xmlOutputBufferWriteString(buf, ":");
795	}
796	xmlOutputBufferWriteString(buf, (const char *)cur->name);
797	if (cur->nsDef)
798	xmlNsListDumpOutput(buf, cur->nsDef);
799	attr = cur->properties;
800	while (attr != NULL) {
801	htmlAttrDumpOutput(buf, doc, attr);
802	attr = attr->next;
803	}
804
805	if ((info != NULL) && (info->empty)) {
806	xmlOutputBufferWriteString(buf, ">");
807	} else if (cur->children == NULL) {
808	if ((info != NULL) && (info->saveEndTag != 0) &&
809	(xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) &&
810	(xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) {
811	xmlOutputBufferWriteString(buf, ">");
812	} else {
813	xmlOutputBufferWriteString(buf, "></");
814	if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
815	xmlOutputBufferWriteString(buf,
816	(const char *)cur->ns->prefix);
817	xmlOutputBufferWriteString(buf, ":");
818	}
819	xmlOutputBufferWriteString(buf, (const char *)cur->name);
820	xmlOutputBufferWriteString(buf, ">");
821	}
822	} else {
823	xmlOutputBufferWriteString(buf, ">");
824	if ((format) && (info != NULL) && (!info->isinline) &&
825	(cur->children->type != HTML_TEXT_NODE) &&
826	(cur->children->type != HTML_ENTITY_REF_NODE) &&
827	(cur->children != cur->last) &&
828	(cur->name != NULL) &&
829	(cur->name[0] != 'p')) /* p, pre, param */
830	xmlOutputBufferWriteString(buf, "\n");
831	parent = cur;
832	cur = cur->children;
833	continue;
834	}
835
836	if ((format) && (cur->next != NULL) &&
837	(info != NULL) && (!info->isinline)) {
838	if ((cur->next->type != HTML_TEXT_NODE) &&
839	(cur->next->type != HTML_ENTITY_REF_NODE) &&
840	(parent != NULL) &&
841	(parent->name != NULL) &&
842	(parent->name[0] != 'p')) /* p, pre, param */
843	xmlOutputBufferWriteString(buf, "\n");
844	}
845
846	break;
847
848	case XML_ATTRIBUTE_NODE:
849	htmlAttrDumpOutput(buf, doc, (xmlAttrPtr) cur);
850	break;
851
852	case HTML_TEXT_NODE:
853	if (cur->content == NULL)
854	break;
855	if (((cur->name == (const xmlChar *)xmlStringText) \|\|
856	(cur->name != (const xmlChar *)xmlStringTextNoenc)) &&
857	((parent == NULL) \|\|
858	((xmlStrcasecmp(parent->name, BAD_CAST "script")) &&
859	(xmlStrcasecmp(parent->name, BAD_CAST "style"))))) {
860	xmlChar *buffer;
861
862	buffer = xmlEncodeEntitiesReentrant(doc, cur->content);
863	if (buffer != NULL) {
864	xmlOutputBufferWriteString(buf, (const char *)buffer);
865	xmlFree(buffer);
866	}
867	} else {
868	xmlOutputBufferWriteString(buf, (const char *)cur->content);
869	}
870	break;
871
872	case HTML_COMMENT_NODE:
873	if (cur->content != NULL) {
874	xmlOutputBufferWriteString(buf, "<!--");
875	xmlOutputBufferWriteString(buf, (const char *)cur->content);
876	xmlOutputBufferWriteString(buf, "-->");
877	}
878	break;
879
880	case HTML_PI_NODE:
881	if (cur->name != NULL) {
882	xmlOutputBufferWriteString(buf, "<?");
883	xmlOutputBufferWriteString(buf, (const char *)cur->name);
884	if (cur->content != NULL) {
885	xmlOutputBufferWriteString(buf, " ");
886	xmlOutputBufferWriteString(buf,
887	(const char *)cur->content);
888	}
889	xmlOutputBufferWriteString(buf, ">");
890	}
891	break;
892
893	case HTML_ENTITY_REF_NODE:
894	xmlOutputBufferWriteString(buf, "&");
895	xmlOutputBufferWriteString(buf, (const char *)cur->name);
896	xmlOutputBufferWriteString(buf, ";");
897	break;
898
899	case HTML_PRESERVE_NODE:
900	if (cur->content != NULL) {
901	xmlOutputBufferWriteString(buf, (const char *)cur->content);
902	}
903	break;
904
905	default:
906	break;
907	}
908
909	while (1) {
910	if (cur == root)
911	return;
912	if (cur->next != NULL) {
913	cur = cur->next;
914	break;
915	}
916
917	cur = parent;
918	/* cur->parent was validated when descending. */
919	parent = cur->parent;
920
921	if ((cur->type == XML_HTML_DOCUMENT_NODE) \|\|
922	(cur->type == XML_DOCUMENT_NODE)) {
923	xmlOutputBufferWriteString(buf, "\n");
924	} else {
925	if ((format) && (cur->ns == NULL))
926	info = htmlTagLookup(cur->name);
927	else
928	info = NULL;
929
930	if ((format) && (info != NULL) && (!info->isinline) &&
931	(cur->last->type != HTML_TEXT_NODE) &&
932	(cur->last->type != HTML_ENTITY_REF_NODE) &&
933	(cur->children != cur->last) &&
934	(cur->name != NULL) &&
935	(cur->name[0] != 'p')) /* p, pre, param */
936	xmlOutputBufferWriteString(buf, "\n");
937
938	xmlOutputBufferWriteString(buf, "</");
939	if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) {
940	xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix);
941	xmlOutputBufferWriteString(buf, ":");
942	}
943	xmlOutputBufferWriteString(buf, (const char *)cur->name);
944	xmlOutputBufferWriteString(buf, ">");
945
946	if ((format) && (info != NULL) && (!info->isinline) &&
947	(cur->next != NULL)) {
948	if ((cur->next->type != HTML_TEXT_NODE) &&
949	(cur->next->type != HTML_ENTITY_REF_NODE) &&
950	(parent != NULL) &&
951	(parent->name != NULL) &&
952	(parent->name[0] != 'p')) /* p, pre, param */
953	xmlOutputBufferWriteString(buf, "\n");
954	}
955	}
956	}
957	}
958	}
959
960	/**
961	* htmlNodeDumpOutput:
962	* @buf: the HTML buffer output
963	* @doc: the document
964	* @cur: the current node
965	* @encoding: the encoding string (unused)
966	*
967	* Dump an HTML node, recursive behaviour,children are printed too,
968	* and formatting returns/spaces are added.
969	*/
970	void
971	htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc,
972	xmlNodePtr cur, const char *encoding ATTRIBUTE_UNUSED) {
973	htmlNodeDumpFormatOutput(buf, doc, cur, NULL, 1);
974	}
975
976	/**
977	* htmlDocContentDumpFormatOutput:
978	* @buf: the HTML buffer output
979	* @cur: the document
980	* @encoding: the encoding string (unused)
981	* @format: should formatting spaces been added
982	*
983	* Dump an HTML document.
984	*/
985	void
986	htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
987	const char *encoding ATTRIBUTE_UNUSED,
988	int format) {
989	int type = 0;
990	if (cur) {
991	type = cur->type;
992	cur->type = XML_HTML_DOCUMENT_NODE;
993	}
994	htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, format);
995	if (cur)
996	cur->type = (xmlElementType) type;
997	}
998
999	/**
1000	* htmlDocContentDumpOutput:
1001	* @buf: the HTML buffer output
1002	* @cur: the document
1003	* @encoding: the encoding string (unused)
1004	*
1005	* Dump an HTML document. Formatting return/spaces are added.
1006	*/
1007	void
1008	htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur,
1009	const char *encoding ATTRIBUTE_UNUSED) {
1010	htmlNodeDumpFormatOutput(buf, cur, (xmlNodePtr) cur, NULL, 1);
1011	}
1012
1013	/************************************************************************
1014	* *
1015	* Saving functions front-ends *
1016	* *
1017	************************************************************************/
1018
1019	/**
1020	* htmlDocDump:
1021	* @f: the FILE*
1022	* @cur: the document
1023	*
1024	* Dump an HTML document to an open FILE.
1025	*
1026	* returns: the number of byte written or -1 in case of failure.
1027	*/
1028	int
1029	htmlDocDump(FILE *f, xmlDocPtr cur) {
1030	xmlOutputBufferPtr buf;
1031	xmlCharEncodingHandlerPtr handler = NULL;
1032	const char *encoding;
1033	int ret;
1034
1035	xmlInitParser();
1036
1037	if ((cur == NULL) \|\| (f == NULL)) {
1038	return(-1);
1039	}
1040
1041	encoding = (const char *) htmlGetMetaEncoding(cur);
1042
1043	if (encoding != NULL) {
1044	xmlCharEncoding enc;
1045
1046	enc = xmlParseCharEncoding(encoding);
1047	if (enc != XML_CHAR_ENCODING_UTF8) {
1048	handler = xmlFindCharEncodingHandler(encoding);
1049	if (handler == NULL)
1050	htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1051	}
1052	} else {
1053	/*
1054	* Fallback to HTML or ASCII when the encoding is unspecified
1055	*/
1056	if (handler == NULL)
1057	handler = xmlFindCharEncodingHandler("HTML");
1058	if (handler == NULL)
1059	handler = xmlFindCharEncodingHandler("ascii");
1060	}
1061
1062	buf = xmlOutputBufferCreateFile(f, handler);
1063	if (buf == NULL) return(-1);
1064	htmlDocContentDumpOutput(buf, cur, NULL);
1065
1066	ret = xmlOutputBufferClose(buf);
1067	return(ret);
1068	}
1069
1070	/**
1071	* htmlSaveFile:
1072	* @filename: the filename (or URL)
1073	* @cur: the document
1074	*
1075	* Dump an HTML document to a file. If @filename is "-" the stdout file is
1076	* used.
1077	* returns: the number of byte written or -1 in case of failure.
1078	*/
1079	int
1080	htmlSaveFile(const char *filename, xmlDocPtr cur) {
1081	xmlOutputBufferPtr buf;
1082	xmlCharEncodingHandlerPtr handler = NULL;
1083	const char *encoding;
1084	int ret;
1085
1086	if ((cur == NULL) \|\| (filename == NULL))
1087	return(-1);
1088
1089	xmlInitParser();
1090
1091	encoding = (const char *) htmlGetMetaEncoding(cur);
1092
1093	if (encoding != NULL) {
1094	xmlCharEncoding enc;
1095
1096	enc = xmlParseCharEncoding(encoding);
1097	if (enc != XML_CHAR_ENCODING_UTF8) {
1098	handler = xmlFindCharEncodingHandler(encoding);
1099	if (handler == NULL)
1100	htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1101	}
1102	} else {
1103	/*
1104	* Fallback to HTML or ASCII when the encoding is unspecified
1105	*/
1106	if (handler == NULL)
1107	handler = xmlFindCharEncodingHandler("HTML");
1108	if (handler == NULL)
1109	handler = xmlFindCharEncodingHandler("ascii");
1110	}
1111
1112	/*
1113	* save the content to a temp buffer.
1114	*/
1115	buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression);
1116	if (buf == NULL) return(0);
1117
1118	htmlDocContentDumpOutput(buf, cur, NULL);
1119
1120	ret = xmlOutputBufferClose(buf);
1121	return(ret);
1122	}
1123
1124	/**
1125	* htmlSaveFileFormat:
1126	* @filename: the filename
1127	* @cur: the document
1128	* @format: should formatting spaces been added
1129	* @encoding: the document encoding
1130	*
1131	* Dump an HTML document to a file using a given encoding.
1132	*
1133	* returns: the number of byte written or -1 in case of failure.
1134	*/
1135	int
1136	htmlSaveFileFormat(const char *filename, xmlDocPtr cur,
1137	const char *encoding, int format) {
1138	xmlOutputBufferPtr buf;
1139	xmlCharEncodingHandlerPtr handler = NULL;
1140	int ret;
1141
1142	if ((cur == NULL) \|\| (filename == NULL))
1143	return(-1);
1144
1145	xmlInitParser();
1146
1147	if (encoding != NULL) {
1148	xmlCharEncoding enc;
1149
1150	enc = xmlParseCharEncoding(encoding);
1151	if (enc != XML_CHAR_ENCODING_UTF8) {
1152	handler = xmlFindCharEncodingHandler(encoding);
1153	if (handler == NULL)
1154	htmlSaveErr(XML_SAVE_UNKNOWN_ENCODING, NULL, encoding);
1155	}
1156	htmlSetMetaEncoding(cur, (const xmlChar *) encoding);
1157	} else {
1158	htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8");
1159
1160	/*
1161	* Fallback to HTML or ASCII when the encoding is unspecified
1162	*/
1163	if (handler == NULL)
1164	handler = xmlFindCharEncodingHandler("HTML");
1165	if (handler == NULL)
1166	handler = xmlFindCharEncodingHandler("ascii");
1167	}
1168
1169	/*
1170	* save the content to a temp buffer.
1171	*/
1172	buf = xmlOutputBufferCreateFilename(filename, handler, 0);
1173	if (buf == NULL) return(0);
1174
1175	htmlDocContentDumpFormatOutput(buf, cur, encoding, format);
1176
1177	ret = xmlOutputBufferClose(buf);
1178	return(ret);
1179	}
1180
1181	/**
1182	* htmlSaveFileEnc:
1183	* @filename: the filename
1184	* @cur: the document
1185	* @encoding: the document encoding
1186	*
1187	* Dump an HTML document to a file using a given encoding
1188	* and formatting returns/spaces are added.
1189	*
1190	* returns: the number of byte written or -1 in case of failure.
1191	*/
1192	int
1193	htmlSaveFileEnc(const char filename, xmlDocPtr cur, const char encoding) {
1194	return(htmlSaveFileFormat(filename, cur, encoding, 1));
1195	}
1196
1197	#endif /* LIBXML_OUTPUT_ENABLED */
1198
1199	#endif /* LIBXML_HTML_ENABLED */

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/libs/libxml2-2.12.6/HTMLtree.c@ 104932

Download in other formats: