uri.cpp@ 74762

Last change on this file since 74762 was 74424, checked in by vboxsync, 6 years ago
IPRT/uri: Better handling of empty port specifiers. bugref:9249
Property svn:eol-style set to `native` Property svn:keywords set to `Author Date Id Revision`
File size: 39.6 KB

Line
1	/* $Id: uri.cpp 74424 2018-09-22 20:00:36Z vboxsync $ */
2	/** @file
3	* IPRT - Uniform Resource Identifier handling.
4	*/
5
6	/*
7	* Copyright (C) 2011-2017 Oracle Corporation
8	*
9	* This file is part of VirtualBox Open Source Edition (OSE), as
10	* available from http://www.virtualbox.org. This file is free software;
11	* you can redistribute it and/or modify it under the terms of the GNU
12	* General Public License (GPL) as published by the Free Software
13	* Foundation, in version 2 as it comes in the "COPYING" file of the
14	* VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15	* hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16	*
17	* The contents of this file may alternatively be used under the terms
18	* of the Common Development and Distribution License Version 1.0
19	* (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20	* VirtualBox OSE distribution, in which case the provisions of the
21	* CDDL are applicable instead of those of the GPL.
22	*
23	* You may elect to license modified versions of this file under the
24	* terms and conditions of either the GPL or the CDDL or both.
25	*/
26
27
28	/*********************************************************************************************************************************
29	* Header Files *
30	*********************************************************************************************************************************/
31	#include <iprt/uri.h>
32
33	#include <iprt/assert.h>
34	#include <iprt/ctype.h>
35	#include <iprt/path.h>
36	#include <iprt/string.h>
37
38
39	/*********************************************************************************************************************************
40	* Defined Constants And Macros *
41	*********************************************************************************************************************************/
42	/** Internal magic value we use to check if a RTURIPARSED structure has made it thru RTUriParse. */
43	#define RTURIPARSED_MAGIC UINT32_C(0x439e0745)
44
45
/* General URI format:

    foo://example.com:8042/over/there?name=ferret#nose
    \_/   \______________/\_________/ \_________/ \__/
     |           |            |            |        |
  scheme     authority       path        query   fragment
     |   _____________________|__
    / \ /                        \
    urn:example:animal:ferret:nose
*/
56
57
/**
 * Checks whether a character has to be % escaped in a URI.
 *
 * The following characters are considered excluded:
 *      control = 00-1F
 *      space   = ' '
 *      delims  = '<' , '>' , '#' , '%' , '"'
 *      unwise  = '{' , '}' , '|' , '\' , '^' , '[' , ']' , '`'
 *
 * @note    The argument is evaluated several times, so it must not have any
 *          side effects (no a++ and the like).
 */
#define URI_EXCLUDED(a) \
    (   ((a) >= 0x0  && (a) <= 0x20) \
     || ((a) >= 0x5B && (a) <= 0x5E) \
     || ((a) >= 0x7B && (a) <= 0x7D) \
     || (a) == '<' || (a) == '>' || (a) == '#' \
     || (a) == '%' || (a) == '"' || (a) == '`' )
71
72	static char rtUriPercentEncodeN(const char pszString, size_t cchMax)
73	{
74	if (!pszString)
75	return NULL;
76
77	int rc = VINF_SUCCESS;
78
79	size_t cbLen = RT_MIN(strlen(pszString), cchMax);
80	/* The new string can be max 3 times in size of the original string. */
81	char pszNew = RTStrAlloc(cbLen 3 + 1);
82	if (!pszNew)
83	return NULL;
84
85	char *pszRes = NULL;
86	size_t iIn = 0;
87	size_t iOut = 0;
88	while (iIn < cbLen)
89	{
90	if (URI_EXCLUDED(pszString[iIn]))
91	{
92	char szNum[3] = { 0, 0, 0 };
93	RTStrFormatU8(&szNum[0], 3, pszString[iIn++], 16, 2, 2, RTSTR_F_CAPITAL \| RTSTR_F_ZEROPAD);
94	pszNew[iOut++] = '%';
95	pszNew[iOut++] = szNum[0];
96	pszNew[iOut++] = szNum[1];
97	}
98	else
99	pszNew[iOut++] = pszString[iIn++];
100	}
101	if (RT_SUCCESS(rc))
102	{
103	pszNew[iOut] = '\0';
104	if (iOut != iIn)
105	{
106	/* If the source and target strings have different size, recreate
107	* the target string with the correct size. */
108	pszRes = RTStrDupN(pszNew, iOut);
109	RTStrFree(pszNew);
110	}
111	else
112	pszRes = pszNew;
113	}
114	else
115	RTStrFree(pszNew);
116
117	return pszRes;
118	}
119
120
121	/**
122	* Calculates the encoded string length.
123	*
124	* @returns Number of chars (excluding the terminator).
125	* @param pszString The string to encode.
126	* @param cchMax The maximum string length (e.g. RTSTR_MAX).
127	* @param fEncodeDosSlash Whether to encode DOS slashes or not.
128	*/
129	static size_t rtUriCalcEncodedLength(const char *pszString, size_t cchMax, bool fEncodeDosSlash)
130	{
131	size_t cchEncoded = 0;
132	if (pszString)
133	{
134	size_t cchSrcLeft = RTStrNLen(pszString, cchMax);
135	while (cchSrcLeft-- > 0)
136	{
137	char const ch = *pszString++;
138	if (!URI_EXCLUDED(ch) \|\| (ch == '\\' && !fEncodeDosSlash))
139	cchEncoded += 1;
140	else
141	cchEncoded += 3;
142	}
143	}
144	return cchEncoded;
145	}
146
147
148	/**
149	* Encodes an URI into a caller allocated buffer.
150	*
151	* @returns IPRT status code.
152	* @param pszString The string to encode.
153	* @param cchMax The maximum string length (e.g. RTSTR_MAX).
154	* @param fEncodeDosSlash Whether to encode DOS slashes or not.
155	* @param pszDst The destination buffer.
156	* @param cbDst The size of the destination buffer.
157	*/
158	static int rtUriEncodeIntoBuffer(const char pszString, size_t cchMax, bool fEncodeDosSlash, char pszDst, size_t cbDst)
159	{
160	AssertReturn(pszString, VERR_INVALID_POINTER);
161	AssertPtrReturn(pszDst, VERR_INVALID_POINTER);
162
163	/*
164	* We do buffer size checking up front and every time we encode a special
165	* character. That's faster than checking for each char.
166	*/
167	size_t cchSrcLeft = RTStrNLen(pszString, cchMax);
168	AssertMsgReturn(cbDst > cchSrcLeft, ("cbDst=%zu cchSrcLeft=%zu\n", cbDst, cchSrcLeft), VERR_BUFFER_OVERFLOW);
169	cbDst -= cchSrcLeft;
170
171	while (cchSrcLeft-- > 0)
172	{
173	char const ch = *pszString++;
174	if (!URI_EXCLUDED(ch) \|\| (ch == '\\' && !fEncodeDosSlash))
175	*pszDst++ = ch;
176	else
177	{
178	AssertReturn(cbDst >= 3, VERR_BUFFER_OVERFLOW); /* 2 extra bytes + zero terminator. */
179	cbDst -= 2;
180
181	*pszDst++ = '%';
182	ssize_t cchTmp = RTStrFormatU8(pszDst, 3, (unsigned char)ch, 16, 2, 2, RTSTR_F_CAPITAL \| RTSTR_F_ZEROPAD);
183	Assert(cchTmp == 2); NOREF(cchTmp);
184	pszDst += 2;
185	}
186	}
187
188	*pszDst = '\0';
189	return VINF_SUCCESS;
190	}
191
192
193	static char rtUriPercentDecodeN(const char pszString, size_t cchString)
194	{
195	AssertPtrReturn(pszString, NULL);
196	AssertReturn(memchr(pszString, '\0', cchString) == NULL, NULL);
197
198	/*
199	* The new string can only get smaller, so use the input length as a
200	* staring buffer size.
201	*/
202	char *pszDecoded = RTStrAlloc(cchString + 1);
203	if (pszDecoded)
204	{
205	/*
206	* Knowing that the pszString itself is valid UTF-8, we only have to
207	* validate the escape sequences.
208	*/
209	size_t cchLeft = cchString;
210	char const *pchSrc = pszString;
211	char *pchDst = pszDecoded;
212	while (cchLeft > 0)
213	{
214	const char pchPct = (const char )memchr(pchSrc, '%', cchLeft);
215	if (pchPct)
216	{
217	size_t cchBefore = pchPct - pchSrc;
218	if (cchBefore)
219	{
220	memcpy(pchDst, pchSrc, cchBefore);
221	pchDst += cchBefore;
222	pchSrc += cchBefore;
223	cchLeft -= cchBefore;
224	}
225
226	char chHigh, chLow;
227	if ( cchLeft >= 3
228	&& RT_C_IS_XDIGIT(chHigh = pchSrc[1])
229	&& RT_C_IS_XDIGIT(chLow = pchSrc[2]))
230	{
231	uint8_t b = RT_C_IS_DIGIT(chHigh) ? chHigh - '0' : (chHigh & ~0x20) - 'A' + 10;
232	b <<= 4;
233	b \|= RT_C_IS_DIGIT(chLow) ? chLow - '0' : (chLow & ~0x20) - 'A' + 10;
234	*pchDst++ = (char)b;
235	pchSrc += 3;
236	cchLeft -= 3;
237	}
238	else
239	{
240	AssertFailed();
241	pchDst++ = pchSrc++;
242	cchLeft--;
243	}
244	}
245	else
246	{
247	memcpy(pchDst, pchSrc, cchLeft);
248	pchDst += cchLeft;
249	pchSrc += cchLeft;
250	cchLeft = 0;
251	break;
252	}
253	}
254
255	*pchDst = '\0';
256
257	/*
258	* If we've got lof space room in the result string, reallocate it.
259	*/
260	size_t cchDecoded = pchDst - pszDecoded;
261	Assert(cchDecoded <= cchString);
262	if (cchString - cchDecoded > 64)
263	RTStrRealloc(&pszDecoded, cchDecoded + 1);
264	}
265	return pszDecoded;
266	}
267
268
269	/**
270	* Calculates the decoded string length.
271	*
272	* @returns Number of chars (excluding the terminator).
273	* @param pszString The string to decode.
274	* @param cchMax The maximum string length (e.g. RTSTR_MAX).
275	*/
276	static size_t rtUriCalcDecodedLength(const char *pszString, size_t cchMax)
277	{
278	size_t cchDecoded;
279	if (pszString)
280	{
281	size_t cchSrcLeft = cchDecoded = RTStrNLen(pszString, cchMax);
282	while (cchSrcLeft-- > 0)
283	{
284	char const ch = *pszString++;
285	if (ch != '%')
286	{ /* typical */}
287	else if ( cchSrcLeft >= 2
288	&& RT_C_IS_XDIGIT(pszString[0])
289	&& RT_C_IS_XDIGIT(pszString[1]))
290	{
291	cchDecoded -= 2;
292	pszString += 2;
293	cchSrcLeft -= 2;
294	}
295	}
296	}
297	else
298	cchDecoded = 0;
299	return cchDecoded;
300	}
301
302
303	/**
304	* Decodes a string into a buffer.
305	*
306	* @returns IPRT status code.
307	* @param pchSrc The source string.
308	* @param cchSrc The max number of bytes to decode in the source string.
309	* @param pszDst The destination buffer.
310	* @param cbDst The size of the buffer (including terminator).
311	*/
312	static int rtUriDecodeIntoBuffer(const char pchSrc, size_t cchSrc, char pszDst, size_t cbDst)
313	{
314	AssertPtrReturn(pchSrc, VERR_INVALID_POINTER);
315	AssertPtrReturn(pszDst, VERR_INVALID_POINTER);
316
317	/*
318	* Knowing that the pszString itself is valid UTF-8, we only have to
319	* validate the escape sequences.
320	*/
321	cchSrc = RTStrNLen(pchSrc, cchSrc);
322	while (cchSrc > 0)
323	{
324	const char pchPct = (const char )memchr(pchSrc, '%', cchSrc);
325	if (pchPct)
326	{
327	size_t cchBefore = pchPct - pchSrc;
328	AssertReturn(cchBefore + 1 < cbDst, VERR_BUFFER_OVERFLOW);
329	if (cchBefore)
330	{
331	memcpy(pszDst, pchSrc, cchBefore);
332	pszDst += cchBefore;
333	cbDst -= cchBefore;
334	pchSrc += cchBefore;
335	cchSrc -= cchBefore;
336	}
337
338	char chHigh, chLow;
339	if ( cchSrc >= 3
340	&& RT_C_IS_XDIGIT(chHigh = pchSrc[1])
341	&& RT_C_IS_XDIGIT(chLow = pchSrc[2]))
342	{
343	uint8_t b = RT_C_IS_DIGIT(chHigh) ? chHigh - '0' : (chHigh & ~0x20) - 'A' + 10;
344	b <<= 4;
345	b \|= RT_C_IS_DIGIT(chLow) ? chLow - '0' : (chLow & ~0x20) - 'A' + 10;
346	*pszDst++ = (char)b;
347	pchSrc += 3;
348	cchSrc -= 3;
349	}
350	else
351	{
352	AssertFailed();
353	pszDst++ = pchSrc++;
354	cchSrc--;
355	}
356	cbDst -= 1;
357	}
358	else
359	{
360	AssertReturn(cchSrc < cbDst, VERR_BUFFER_OVERFLOW);
361	memcpy(pszDst, pchSrc, cchSrc);
362	pszDst += cchSrc;
363	cbDst -= cchSrc;
364	pchSrc += cchSrc;
365	cchSrc = 0;
366	break;
367	}
368	}
369
370	AssertReturn(cbDst > 0, VERR_BUFFER_OVERFLOW);
371	*pszDst = '\0';
372	return VINF_SUCCESS;
373	}
374
375
376
377	static int rtUriParse(const char *pszUri, PRTURIPARSED pParsed)
378	{
379	/*
380	* Validate the input and clear the output.
381	*/
382	AssertPtrReturn(pParsed, VERR_INVALID_POINTER);
383	RT_ZERO(*pParsed);
384	pParsed->uAuthorityPort = UINT32_MAX;
385
386	AssertPtrReturn(pszUri, VERR_INVALID_POINTER);
387
388	size_t const cchUri = strlen(pszUri);
389	if (RT_LIKELY(cchUri >= 3)) { /* likely */ }
390	else return cchUri ? VERR_URI_TOO_SHORT : VERR_URI_EMPTY;
391
392	/*
393	* Validating escaped text sequences is much simpler if we know that
394	* that the base URI string is valid. Also, we don't necessarily trust
395	* the developer calling us to remember to do this.
396	*/
397	int rc = RTStrValidateEncoding(pszUri);
398	AssertRCReturn(rc, rc);
399
400	/*
401	* RFC-3986, section 3.1:
402	* scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
403	*
404	* The scheme ends with a ':', which we also skip here.
405	*/
406	size_t off = 0;
407	char ch = pszUri[off++];
408	if (RT_LIKELY(RT_C_IS_ALPHA(ch))) { /* likely */ }
409	else return VERR_URI_INVALID_SCHEME;
410	for (;;)
411	{
412	ch = pszUri[off];
413	if (ch == ':')
414	break;
415	if (RT_LIKELY(RT_C_IS_ALNUM(ch) \|\| ch == '.' \|\| ch == '-' \|\| ch == '+')) { /* likely */ }
416	else return VERR_URI_INVALID_SCHEME;
417	off++;
418	}
419	pParsed->cchScheme = off;
420
421	/* Require the scheme length to be at least two chars so we won't confuse
422	it with a path starting with a DOS drive letter specification. */
423	if (RT_LIKELY(off >= 2)) { /* likely */ }
424	else return VERR_URI_INVALID_SCHEME;
425
426	off++; /* (skip colon) */
427
428	/*
429	* Find the end of the path, we'll need this several times.
430	* Also, while we're potentially scanning the whole thing, check for '%'.
431	*/
432	size_t const offHash = RTStrOffCharOrTerm(&pszUri[off], '#') + off;
433	size_t const offQuestionMark = RTStrOffCharOrTerm(&pszUri[off], '?') + off;
434
435	if (memchr(pszUri, '%', cchUri) != NULL)
436	pParsed->fFlags \|= RTURIPARSED_F_CONTAINS_ESCAPED_CHARS;
437
438	/*
439	* RFC-3986, section 3.2:
440	* The authority component is preceeded by a double slash ("//")...
441	*/
442	if ( pszUri[off] == '/'
443	&& pszUri[off + 1] == '/')
444	{
445	off += 2;
446	pParsed->offAuthority = pParsed->offAuthorityUsername = pParsed->offAuthorityPassword = pParsed->offAuthorityHost = off;
447	pParsed->fFlags \|= RTURIPARSED_F_HAS_AUTHORITY;
448
449	/*
450	* RFC-3986, section 3.2:
451	* ...and is terminated by the next slash ("/"), question mark ("?"),
452	* or number sign ("#") character, or by the end of the URI.
453	*/
454	const char *pszAuthority = &pszUri[off];
455	size_t cchAuthority = RTStrOffCharOrTerm(pszAuthority, '/');
456	cchAuthority = RT_MIN(cchAuthority, offHash - off);
457	cchAuthority = RT_MIN(cchAuthority, offQuestionMark - off);
458	pParsed->cchAuthority = cchAuthority;
459
460	/* The Authority can be empty, like for: file:///usr/bin/grep */
461	if (cchAuthority > 0)
462	{
463	pParsed->cchAuthorityHost = cchAuthority;
464
465	/*
466	* If there is a userinfo part, it is ended by a '@'.
467	*/
468	const char pszAt = (const char )memchr(pszAuthority, '@', cchAuthority);
469	if (pszAt)
470	{
471	size_t cchTmp = pszAt - pszAuthority;
472	pParsed->offAuthorityHost += cchTmp + 1;
473	pParsed->cchAuthorityHost -= cchTmp + 1;
474
475	/* If there is a password part, it's separated from the username with a colon. */
476	const char pszColon = (const char )memchr(pszAuthority, ':', cchTmp);
477	if (pszColon)
478	{
479	pParsed->cchAuthorityUsername = pszColon - pszAuthority;
480	pParsed->offAuthorityPassword = &pszColon[1] - pszUri;
481	pParsed->cchAuthorityPassword = pszAt - &pszColon[1];
482	}
483	else
484	{
485	pParsed->cchAuthorityUsername = cchTmp;
486	pParsed->offAuthorityPassword = off + cchTmp;
487	}
488	}
489
490	/*
491	* If there is a port part, its after the last colon in the host part.
492	*/
493	const char pszColon = (const char )memrchr(&pszUri[pParsed->offAuthorityHost], ':', pParsed->cchAuthorityHost);
494	if (pszColon)
495	{
496	size_t cchTmp = &pszUri[pParsed->offAuthorityHost + pParsed->cchAuthorityHost] - &pszColon[1];
497	pParsed->cchAuthorityHost -= cchTmp + 1;
498	pParsed->fFlags \|= RTURIPARSED_F_HAS_PORT;
499	if (cchTmp > 0)
500	{
501	pParsed->uAuthorityPort = 0;
502	while (cchTmp-- > 0)
503	{
504	ch = *++pszColon;
505	if ( RT_C_IS_DIGIT(ch)
506	&& pParsed->uAuthorityPort < UINT32_MAX / UINT32_C(10))
507	{
508	pParsed->uAuthorityPort *= 10;
509	pParsed->uAuthorityPort += ch - '0';
510	}
511	else
512	return VERR_URI_INVALID_PORT_NUMBER;
513	}
514	}
515	}
516	}
517
518	/* Skip past the authority. */
519	off += cchAuthority;
520	}
521	else
522	pParsed->offAuthority = pParsed->offAuthorityUsername = pParsed->offAuthorityPassword = pParsed->offAuthorityHost = off;
523
524	/*
525	* RFC-3986, section 3.3: Path
526	* The path is terminated by the first question mark ("?")
527	* or number sign ("#") character, or by the end of the URI.
528	*/
529	pParsed->offPath = off;
530	pParsed->cchPath = RT_MIN(offHash, offQuestionMark) - off;
531	off += pParsed->cchPath;
532
533	/*
534	* RFC-3986, section 3.4: Query
535	* The query component is indicated by the first question mark ("?")
536	* character and terminated by a number sign ("#") character or by the
537	* end of the URI.
538	*/
539	if ( off == offQuestionMark
540	&& off < cchUri)
541	{
542	Assert(pszUri[offQuestionMark] == '?');
543	pParsed->offQuery = ++off;
544	pParsed->cchQuery = offHash - off;
545	off = offHash;
546	}
547	else
548	{
549	Assert(!pszUri[offQuestionMark]);
550	pParsed->offQuery = off;
551	}
552
553	/*
554	* RFC-3986, section 3.5: Fragment
555	* A fragment identifier component is indicated by the presence of a
556	* number sign ("#") character and terminated by the end of the URI.
557	*/
558	if ( off == offHash
559	&& off < cchUri)
560	{
561	pParsed->offFragment = ++off;
562	pParsed->cchFragment = cchUri - off;
563	}
564	else
565	{
566	Assert(!pszUri[offHash]);
567	pParsed->offFragment = off;
568	}
569
570	/*
571	* If there are any escape sequences, validate them.
572	*
573	* This is reasonably simple as we already know that the string is valid UTF-8
574	* before they get decoded. Thus we only have to validate the escaped sequences.
575	*/
576	if (pParsed->fFlags & RTURIPARSED_F_CONTAINS_ESCAPED_CHARS)
577	{
578	const char pchSrc = (const char )memchr(pszUri, '%', cchUri);
579	AssertReturn(pchSrc, VERR_INTERNAL_ERROR);
580	do
581	{
582	char szUtf8Seq[8];
583	unsigned cchUtf8Seq = 0;
584	unsigned cchNeeded = 0;
585	size_t cchLeft = &pszUri[cchUri] - pchSrc;
586	do
587	{
588	if (cchLeft >= 3)
589	{
590	char chHigh = pchSrc[1];
591	char chLow = pchSrc[2];
592	if ( RT_C_IS_XDIGIT(chHigh)
593	&& RT_C_IS_XDIGIT(chLow))
594	{
595	uint8_t b = RT_C_IS_DIGIT(chHigh) ? chHigh - '0' : (chHigh & ~0x20) - 'A' + 10;
596	b <<= 4;
597	b \|= RT_C_IS_DIGIT(chLow) ? chLow - '0' : (chLow & ~0x20) - 'A' + 10;
598
599	if (!(b & 0x80))
600	{
601	/* We don't want the string to be terminated prematurely. */
602	if (RT_LIKELY(b != 0)) { /* likely */ }
603	else return VERR_URI_ESCAPED_ZERO;
604
605	/* Check that we're not expecting more UTF-8 bytes. */
606	if (RT_LIKELY(cchNeeded == 0)) { /* likely */ }
607	else return VERR_URI_MISSING_UTF8_CONTINUATION_BYTE;
608	}
609	/* Are we waiting UTF-8 bytes? */
610	else if (cchNeeded > 0)
611	{
612	if (RT_LIKELY(!(b & 0x40))) { /* likely */ }
613	else return VERR_URI_INVALID_ESCAPED_UTF8_CONTINUATION_BYTE;
614
615	szUtf8Seq[cchUtf8Seq++] = (char)b;
616	if (--cchNeeded == 0)
617	{
618	szUtf8Seq[cchUtf8Seq] = '\0';
619	rc = RTStrValidateEncoding(szUtf8Seq);
620	if (RT_FAILURE(rc))
621	return VERR_URI_ESCAPED_CHARS_NOT_VALID_UTF8;
622	cchUtf8Seq = 0;
623	}
624	}
625	/* Start a new UTF-8 sequence. */
626	else
627	{
628	if ((b & 0xf8) == 0xf0)
629	cchNeeded = 3;
630	else if ((b & 0xf0) == 0xe0)
631	cchNeeded = 2;
632	else if ((b & 0xe0) == 0xc0)
633	cchNeeded = 1;
634	else
635	return VERR_URI_INVALID_ESCAPED_UTF8_LEAD_BYTE;
636	szUtf8Seq[0] = (char)b;
637	cchUtf8Seq = 1;
638	}
639	pchSrc += 3;
640	cchLeft -= 3;
641	}
642	else
643	return VERR_URI_INVALID_ESCAPE_SEQ;
644	}
645	else
646	return VERR_URI_INVALID_ESCAPE_SEQ;
647	} while (cchLeft > 0 && pchSrc[0] == '%');
648
649	/* Check that we're not expecting more UTF-8 bytes. */
650	if (RT_LIKELY(cchNeeded == 0)) { /* likely */ }
651	else return VERR_URI_MISSING_UTF8_CONTINUATION_BYTE;
652
653	/* next */
654	pchSrc = (const char *)memchr(pchSrc, '%', cchLeft);
655	} while (pchSrc);
656	}
657
658	pParsed->u32Magic = RTURIPARSED_MAGIC;
659	return VINF_SUCCESS;
660	}
661
662
663	RTDECL(int) RTUriParse(const char *pszUri, PRTURIPARSED pParsed)
664	{
665	return rtUriParse(pszUri, pParsed);
666	}
667
668
669	RTDECL(char ) RTUriParsedScheme(const char pszUri, PCRTURIPARSED pParsed)
670	{
671	AssertPtrReturn(pszUri, NULL);
672	AssertPtrReturn(pParsed, NULL);
673	AssertReturn(pParsed->u32Magic == RTURIPARSED_MAGIC, NULL);
674	return RTStrDupN(pszUri, pParsed->cchScheme);
675	}
676
677
678	RTDECL(char ) RTUriParsedAuthority(const char pszUri, PCRTURIPARSED pParsed)
679	{
680	AssertPtrReturn(pszUri, NULL);
681	AssertPtrReturn(pParsed, NULL);
682	AssertReturn(pParsed->u32Magic == RTURIPARSED_MAGIC, NULL);
683	if (pParsed->cchAuthority \|\| (pParsed->fFlags & RTURIPARSED_F_HAS_AUTHORITY))
684	return rtUriPercentDecodeN(&pszUri[pParsed->offAuthority], pParsed->cchAuthority);
685	return NULL;
686	}
687
688
689	RTDECL(char ) RTUriParsedAuthorityUsername(const char pszUri, PCRTURIPARSED pParsed)
690	{
691	AssertPtrReturn(pszUri, NULL);
692	AssertPtrReturn(pParsed, NULL);
693	AssertReturn(pParsed->u32Magic == RTURIPARSED_MAGIC, NULL);
694	if (pParsed->cchAuthorityUsername)
695	return rtUriPercentDecodeN(&pszUri[pParsed->offAuthorityUsername], pParsed->cchAuthorityUsername);
696	return NULL;
697	}
698
699
700	RTDECL(char ) RTUriParsedAuthorityPassword(const char pszUri, PCRTURIPARSED pParsed)
701	{
702	AssertPtrReturn(pszUri, NULL);
703	AssertPtrReturn(pParsed, NULL);
704	AssertReturn(pParsed->u32Magic == RTURIPARSED_MAGIC, NULL);
705	if (pParsed->cchAuthorityPassword)
706	return rtUriPercentDecodeN(&pszUri[pParsed->offAuthorityPassword], pParsed->cchAuthorityPassword);
707	return NULL;
708	}
709
710
711	RTDECL(char ) RTUriParsedAuthorityHost(const char pszUri, PCRTURIPARSED pParsed)
712	{
713	AssertPtrReturn(pszUri, NULL);
714	AssertPtrReturn(pParsed, NULL);
715	AssertReturn(pParsed->u32Magic == RTURIPARSED_MAGIC, NULL);
716	if (pParsed->cchAuthorityHost)
717	return rtUriPercentDecodeN(&pszUri[pParsed->offAuthorityHost], pParsed->cchAuthorityHost);
718	return NULL;
719	}
720
721
722	RTDECL(uint32_t) RTUriParsedAuthorityPort(const char *pszUri, PCRTURIPARSED pParsed)
723	{
724	AssertPtrReturn(pszUri, UINT32_MAX);
725	AssertPtrReturn(pParsed, UINT32_MAX);
726	AssertReturn(pParsed->u32Magic == RTURIPARSED_MAGIC, UINT32_MAX);
727	return pParsed->uAuthorityPort;
728	}
729
730
731	RTDECL(char ) RTUriParsedPath(const char pszUri, PCRTURIPARSED pParsed)
732	{
733	AssertPtrReturn(pszUri, NULL);
734	AssertPtrReturn(pParsed, NULL);
735	AssertReturn(pParsed->u32Magic == RTURIPARSED_MAGIC, NULL);
736	if (pParsed->cchPath)
737	return rtUriPercentDecodeN(&pszUri[pParsed->offPath], pParsed->cchPath);
738	return NULL;
739	}
740
741
742	RTDECL(char ) RTUriParsedQuery(const char pszUri, PCRTURIPARSED pParsed)
743	{
744	AssertPtrReturn(pszUri, NULL);
745	AssertPtrReturn(pParsed, NULL);
746	AssertReturn(pParsed->u32Magic == RTURIPARSED_MAGIC, NULL);
747	if (pParsed->cchQuery)
748	return rtUriPercentDecodeN(&pszUri[pParsed->offQuery], pParsed->cchQuery);
749	return NULL;
750	}
751
752
753	RTDECL(char ) RTUriParsedFragment(const char pszUri, PCRTURIPARSED pParsed)
754	{
755	AssertPtrReturn(pszUri, NULL);
756	AssertPtrReturn(pParsed, NULL);
757	AssertReturn(pParsed->u32Magic == RTURIPARSED_MAGIC, NULL);
758	if (pParsed->cchFragment)
759	return rtUriPercentDecodeN(&pszUri[pParsed->offFragment], pParsed->cchFragment);
760	return NULL;
761	}
762
763
764	RTDECL(char ) RTUriCreate(const char pszScheme, const char pszAuthority, const char pszPath, const char *pszQuery,
765	const char *pszFragment)
766	{
767	if (!pszScheme) /* Scheme is minimum requirement */
768	return NULL;
769
770	char *pszResult = 0;
771	char *pszAuthority1 = 0;
772	char *pszPath1 = 0;
773	char *pszQuery1 = 0;
774	char *pszFragment1 = 0;
775
776	do
777	{
778	/* Create the percent encoded strings and calculate the necessary uri
779	* length. */
780	size_t cbSize = strlen(pszScheme) + 1 + 1; /* plus zero byte */
781	if (pszAuthority)
782	{
783	pszAuthority1 = rtUriPercentEncodeN(pszAuthority, RTSTR_MAX);
784	if (!pszAuthority1)
785	break;
786	cbSize += strlen(pszAuthority1) + 2;
787	}
788	if (pszPath)
789	{
790	pszPath1 = rtUriPercentEncodeN(pszPath, RTSTR_MAX);
791	if (!pszPath1)
792	break;
793	cbSize += strlen(pszPath1);
794	}
795	if (pszQuery)
796	{
797	pszQuery1 = rtUriPercentEncodeN(pszQuery, RTSTR_MAX);
798	if (!pszQuery1)
799	break;
800	cbSize += strlen(pszQuery1) + 1;
801	}
802	if (pszFragment)
803	{
804	pszFragment1 = rtUriPercentEncodeN(pszFragment, RTSTR_MAX);
805	if (!pszFragment1)
806	break;
807	cbSize += strlen(pszFragment1) + 1;
808	}
809
810	char pszTmp = pszResult = (char )RTStrAlloc(cbSize);
811	if (!pszResult)
812	break;
813	RT_BZERO(pszTmp, cbSize);
814
815	/* Compose the target uri string. */
816	RTStrCatP(&pszTmp, &cbSize, pszScheme);
817	RTStrCatP(&pszTmp, &cbSize, ":");
818	if (pszAuthority1)
819	{
820	RTStrCatP(&pszTmp, &cbSize, "//");
821	RTStrCatP(&pszTmp, &cbSize, pszAuthority1);
822	}
823	if (pszPath1)
824	{
825	RTStrCatP(&pszTmp, &cbSize, pszPath1);
826	}
827	if (pszQuery1)
828	{
829	RTStrCatP(&pszTmp, &cbSize, "?");
830	RTStrCatP(&pszTmp, &cbSize, pszQuery1);
831	}
832	if (pszFragment1)
833	{
834	RTStrCatP(&pszTmp, &cbSize, "#");
835	RTStrCatP(&pszTmp, &cbSize, pszFragment1);
836	}
837	} while (0);
838
839	/* Cleanup */
840	if (pszAuthority1)
841	RTStrFree(pszAuthority1);
842	if (pszPath1)
843	RTStrFree(pszPath1);
844	if (pszQuery1)
845	RTStrFree(pszQuery1);
846	if (pszFragment1)
847	RTStrFree(pszFragment1);
848
849	return pszResult;
850	}
851
852
853	RTDECL(bool) RTUriIsSchemeMatch(const char pszUri, const char pszScheme)
854	{
855	AssertPtrReturn(pszUri, false);
856	size_t const cchScheme = strlen(pszScheme);
857	return RTStrNICmp(pszUri, pszScheme, cchScheme) == 0
858	&& pszUri[cchScheme] == ':';
859	}
860
861
862	RTDECL(int) RTUriFileCreateEx(const char pszPath, uint32_t fPathStyle, char ppszUri, size_t cbUri, size_t pcchUri)
863	{
864	/*
865	* Validate and adjust input. (RTPathParse check pszPath out for us)
866	*/
867	if (pcchUri)
868	{
869	AssertPtrReturn(pcchUri, VERR_INVALID_POINTER);
870	*pcchUri = ~(size_t)0;
871	}
872	AssertPtrReturn(ppszUri, VERR_INVALID_POINTER);
873	AssertReturn(!(fPathStyle & ~RTPATH_STR_F_STYLE_MASK) && fPathStyle != RTPATH_STR_F_STYLE_RESERVED, VERR_INVALID_FLAGS);
874	if (fPathStyle == RTPATH_STR_F_STYLE_HOST)
875	fPathStyle = RTPATH_STYLE;
876
877	/*
878	* Let the RTPath code parse the stuff (no reason to duplicate path parsing
879	* and get it slightly wrong here).
880	*/
881	RTPATHPARSED ParsedPath;
882	int rc = RTPathParse(pszPath, &ParsedPath, sizeof(ParsedPath), fPathStyle);
883	if (RT_SUCCESS(rc) \|\| rc == VERR_BUFFER_OVERFLOW)
884	{
885	/* Skip leading slashes. */
886	if (ParsedPath.fProps & RTPATH_PROP_ROOT_SLASH)
887	{
888	if (fPathStyle == RTPATH_STR_F_STYLE_DOS)
889	while (pszPath[0] == '/' \|\| pszPath[0] == '\\')
890	pszPath++;
891	else
892	while (pszPath[0] == '/')
893	pszPath++;
894	}
895	const size_t cchPath = strlen(pszPath);
896
897	/*
898	* Calculate the encoded length and figure destination buffering.
899	*/
900	static const char s_szPrefix[] = "file:///";
901	size_t const cchPrefix = sizeof(s_szPrefix) - (ParsedPath.fProps & RTPATH_PROP_UNC ? 2 : 1);
902	size_t cchEncoded = rtUriCalcEncodedLength(pszPath, cchPath, fPathStyle != RTPATH_STR_F_STYLE_DOS);
903
904	if (pcchUri)
905	*pcchUri = cchEncoded;
906
907	char *pszDst;
908	char *pszFreeMe = NULL;
909	if (!cbUri \|\| *ppszUri == NULL)
910	{
911	cbUri = RT_MAX(cbUri, cchPrefix + cchEncoded + 1);
912	*ppszUri = pszFreeMe = pszDst = RTStrAlloc(cbUri);
913	AssertReturn(pszDst, VERR_NO_STR_MEMORY);
914	}
915	else if (cchEncoded < cbUri)
916	pszDst = *ppszUri;
917	else
918	return VERR_BUFFER_OVERFLOW;
919
920	/*
921	* Construct the URI.
922	*/
923	memcpy(pszDst, s_szPrefix, cchPrefix);
924	pszDst[cchPrefix] = '\0';
925	rc = rtUriEncodeIntoBuffer(pszPath, cchPath, fPathStyle != RTPATH_STR_F_STYLE_DOS, &pszDst[cchPrefix], cbUri - cchPrefix);
926	if (RT_SUCCESS(rc))
927	{
928	Assert(strlen(pszDst) == cbUri - 1);
929	if (fPathStyle == RTPATH_STR_F_STYLE_DOS)
930	RTPathChangeToUnixSlashes(pszDst, true /fForce/);
931	return VINF_SUCCESS;
932	}
933
934	AssertRC(rc); /* Impossible! rtUriCalcEncodedLength or something above is busted! */
935	if (pszFreeMe)
936	RTStrFree(pszFreeMe);
937	}
938	return rc;
939	}
940
941
942	RTDECL(char ) RTUriFileCreate(const char pszPath)
943	{
944	char *pszUri = NULL;
945	int rc = RTUriFileCreateEx(pszPath, RTPATH_STR_F_STYLE_HOST, &pszUri, 0 /cbUri/, NULL /pcchUri/);
946	if (RT_SUCCESS(rc))
947	return pszUri;
948	return NULL;
949	}
950
951
952	RTDECL(int) RTUriFilePathEx(const char pszUri, uint32_t fPathStyle, char ppszPath, size_t cbPath, size_t pcchPath)
953	{
954	/*
955	* Validate and adjust input.
956	*/
957	if (pcchPath)
958	{
959	AssertPtrReturn(pcchPath, VERR_INVALID_POINTER);
960	*pcchPath = ~(size_t)0;
961	}
962	AssertPtrReturn(ppszPath, VERR_INVALID_POINTER);
963	AssertReturn(!(fPathStyle & ~RTPATH_STR_F_STYLE_MASK) && fPathStyle != RTPATH_STR_F_STYLE_RESERVED, VERR_INVALID_FLAGS);
964	if (fPathStyle == RTPATH_STR_F_STYLE_HOST)
965	fPathStyle = RTPATH_STYLE;
966	AssertPtrReturn(pszUri, VERR_INVALID_POINTER);
967
968	/*
969	* Check that this is a file URI.
970	*/
971	if (RTStrNICmp(pszUri, RT_STR_TUPLE("file:")) == 0)
972	{ /* likely */ }
973	else
974	return VERR_URI_NOT_FILE_SCHEME;
975
976	/*
977	* We may have a number of variations here, mostly thanks to
978	* various windows software. First the canonical variations:
979	* - file:///C:/Windows/System32/kernel32.dll
980	* - file:///C\|/Windows/System32/kernel32.dll
981	* - file:///C:%5CWindows%5CSystem32%5Ckernel32.dll
982	* - file://localhost/C:%5CWindows%5CSystem32%5Ckernel32.dll
983	* - file://cifsserver.dev/systemshare%5CWindows%5CSystem32%5Ckernel32.dll
984	* - file://cifsserver.dev:139/systemshare%5CWindows%5CSystem32%5Ckernel32.dll (not quite sure here, but whatever)
985	*
986	* Legacy variant without any slashes after the schema:
987	* - file:C:/Windows/System32/kernel32.dll
988	* - file:C\|/Windows/System32%5Ckernel32.dll
989	* - file:~/.bashrc
990	* \--path-/
991	*
992	* Legacy variant with exactly one slashes after the schema:
993	* - file:/C:/Windows/System32%5Ckernel32.dll
994	* - file:/C\|/Windows/System32/kernel32.dll
995	* - file:/usr/bin/env
996	* \---path---/
997	*
998	* Legacy variant with two slashes after the schema and an unescaped DOS path:
999	* - file://C:/Windows/System32\kernel32.dll (**)
1000	* - file://C\|/Windows/System32\kernel32.dll
1001	* \---path---------------------/
1002	* -- authority, with ':' as non-working port separator
1003	*
1004	* Legacy variant with exactly four slashes after the schema and an unescaped DOS path.
1005	* - file:////C:/Windows\System32\user32.dll
1006	*
1007	* Legacy variant with four or more slashes after the schema and an unescaped UNC path:
1008	* - file:////cifsserver.dev/systemshare/System32%\kernel32.dll
1009	* - file://///cifsserver.dev/systemshare/System32\kernel32.dll
1010	* \---path--------------------------------------------/
1011	*
1012	* The two unescaped variants shouldn't be handed to rtUriParse, which
1013	* is good as we cannot actually handle the one marked by (**). So, handle
1014	* those two special when parsing.
1015	*/
1016	RTURIPARSED Parsed;
1017	int rc;
1018	size_t cSlashes = 0;
1019	while (pszUri[5 + cSlashes] == '/')
1020	cSlashes++;
1021	if ( (cSlashes == 2 \|\| cSlashes == 4)
1022	&& RT_C_IS_ALPHA(pszUri[5 + cSlashes])
1023	&& (pszUri[5 + cSlashes + 1] == ':' \|\| pszUri[5 + cSlashes + 1] == '\|'))
1024	{
1025	RT_ZERO(Parsed); /* RTURIPARSED_F_CONTAINS_ESCAPED_CHARS is now clear. */
1026	Parsed.offPath = 5 + cSlashes;
1027	Parsed.cchPath = strlen(&pszUri[Parsed.offPath]);
1028	rc = RTStrValidateEncoding(&pszUri[Parsed.offPath]);
1029	}
1030	else if (cSlashes >= 4)
1031	{
1032	RT_ZERO(Parsed);
1033	Parsed.fFlags = cSlashes > 4 ? RTURIPARSED_F_CONTAINS_ESCAPED_CHARS : 0;
1034	Parsed.offPath = 5 + cSlashes - 2;
1035	Parsed.cchPath = strlen(&pszUri[Parsed.offPath]);
1036	rc = RTStrValidateEncoding(&pszUri[Parsed.offPath]);
1037	}
1038	else
1039	rc = rtUriParse(pszUri, &Parsed);
1040	if (RT_SUCCESS(rc))
1041	{
1042	/*
1043	* Ignore localhost as hostname (it's implicit).
1044	*/
1045	static char const s_szLocalhost[] = "localhost";
1046	if ( Parsed.cchAuthorityHost == sizeof(s_szLocalhost) - 1U
1047	&& RTStrNICmp(&pszUri[Parsed.offAuthorityHost], RT_STR_TUPLE(s_szLocalhost)) == 0)
1048	{
1049	Parsed.cchAuthorityHost = 0;
1050	Parsed.cchAuthority = 0;
1051	}
1052
1053	/*
1054	* Ignore leading path slash/separator if we detect a DOS drive letter
1055	* and we don't have a host name.
1056	*/
1057	if ( Parsed.cchPath >= 3
1058	&& Parsed.cchAuthorityHost == 0
1059	&& pszUri[Parsed.offPath] == '/' /* Leading path slash/separator. */
1060	&& ( pszUri[Parsed.offPath + 2] == ':' /* Colon after drive letter. */
1061	\|\| pszUri[Parsed.offPath + 2] == '\|') /* Colon alternative. */
1062	&& RT_C_IS_ALPHA(pszUri[Parsed.offPath + 1]) ) /* Drive letter. */
1063	{
1064	Parsed.offPath++;
1065	Parsed.cchPath--;
1066	}
1067
1068	/*
1069	* Calculate the size of the encoded result.
1070	*
1071	* Since we're happily returning "C:/Windows/System32/kernel.dll"
1072	* style paths when the caller requested UNIX style paths, we will
1073	* return straight UNC paths too ("//cifsserver/share/dir/file").
1074	*/
1075	size_t cchDecodedHost = 0;
1076	size_t cbResult;
1077	if (Parsed.fFlags & RTURIPARSED_F_CONTAINS_ESCAPED_CHARS)
1078	{
1079	cchDecodedHost = rtUriCalcDecodedLength(&pszUri[Parsed.offAuthorityHost], Parsed.cchAuthorityHost);
1080	cbResult = cchDecodedHost + rtUriCalcDecodedLength(&pszUri[Parsed.offPath], Parsed.cchPath) + 1;
1081	}
1082	else
1083	{
1084	cchDecodedHost = 0;
1085	cbResult = Parsed.cchAuthorityHost + Parsed.cchPath + 1;
1086	}
1087	if (pcchPath)
1088	*pcchPath = cbResult - 1;
1089	if (cbResult > 1)
1090	{
1091	/*
1092	* Prepare the necessary buffer space for the result.
1093	*/
1094	char *pszDst;
1095	char *pszFreeMe = NULL;
1096	if (!cbPath \|\| *ppszPath == NULL)
1097	{
1098	cbPath = RT_MAX(cbPath, cbResult);
1099	*ppszPath = pszFreeMe = pszDst = RTStrAlloc(cbPath);
1100	AssertReturn(pszDst, VERR_NO_STR_MEMORY);
1101	}
1102	else if (cbResult <= cbPath)
1103	pszDst = *ppszPath;
1104	else
1105	return VERR_BUFFER_OVERFLOW;
1106
1107	/*
1108	* Compose the result.
1109	*/
1110	if (Parsed.fFlags & RTURIPARSED_F_CONTAINS_ESCAPED_CHARS)
1111	{
1112	rc = rtUriDecodeIntoBuffer(&pszUri[Parsed.offAuthorityHost],Parsed.cchAuthorityHost,
1113	pszDst, cchDecodedHost + 1);
1114	Assert(RT_SUCCESS(rc) && strlen(pszDst) == cchDecodedHost);
1115	if (RT_SUCCESS(rc))
1116	rc = rtUriDecodeIntoBuffer(&pszUri[Parsed.offPath], Parsed.cchPath,
1117	&pszDst[cchDecodedHost], cbResult - cchDecodedHost);
1118	Assert(RT_SUCCESS(rc) && strlen(pszDst) == cbResult - 1);
1119	}
1120	else
1121	{
1122	memcpy(pszDst, &pszUri[Parsed.offAuthorityHost], Parsed.cchAuthorityHost);
1123	memcpy(&pszDst[Parsed.cchAuthorityHost], &pszUri[Parsed.offPath], Parsed.cchPath);
1124	pszDst[cbResult - 1] = '\0';
1125	}
1126	if (RT_SUCCESS(rc))
1127	{
1128	/*
1129	* Convert colon DOS driver letter colon alternative.
1130	* We do this regardless of the desired path style.
1131	*/
1132	if ( RT_C_IS_ALPHA(pszDst[0])
1133	&& pszDst[1] == '\|')
1134	pszDst[1] = ':';
1135
1136	/*
1137	* Fix slashes.
1138	*/
1139	if (fPathStyle == RTPATH_STR_F_STYLE_DOS)
1140	RTPathChangeToDosSlashes(pszDst, true);
1141	else if (fPathStyle == RTPATH_STR_F_STYLE_UNIX)
1142	RTPathChangeToUnixSlashes(pszDst, true); /** @todo not quite sure how this actually makes sense... */
1143	else
1144	AssertFailed();
1145	return rc;
1146	}
1147
1148	/* bail out */
1149	RTStrFree(pszFreeMe);
1150	}
1151	else
1152	rc = VERR_PATH_ZERO_LENGTH;
1153	}
1154	return rc;
1155	}
1156
1157
1158	RTDECL(char ) RTUriFilePath(const char pszUri)
1159	{
1160	char *pszPath = NULL;
1161	int rc = RTUriFilePathEx(pszUri, RTPATH_STR_F_STYLE_HOST, &pszPath, 0 /cbPath/, NULL /pcchPath/);
1162	if (RT_SUCCESS(rc))
1163	return pszPath;
1164	return NULL;
1165	}
1166

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/VBox/Runtime/common/misc/uri.cpp@ 74762

Download in other formats: