VirtualBox

source: vbox/trunk/src/VBox/Devices/Network/slirp/socket.c@ 52755

Last change on this file since 52755 was 52712, checked in by vboxsync, 10 years ago

NAT: G/c bogus VBOX && !NO_USE_SOCKETS code disabled in r93447.
Same object code is generated.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 49.6 KB
Line 
1/* $Id: socket.c 52712 2014-09-11 21:37:27Z vboxsync $ */
2/** @file
3 * NAT - socket handling.
4 */
5
6/*
7 * Copyright (C) 2006-2012 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 */
17
18/*
19 * This code is based on:
20 *
21 * Copyright (c) 1995 Danny Gasparovski.
22 *
23 * Please read the file COPYRIGHT for the
24 * terms and conditions of the copyright.
25 */
26
27#include <slirp.h>
28#include "ip_icmp.h"
29#include "main.h"
30#ifdef __sun__
31#include <sys/filio.h>
32#endif
33#include <VBox/vmm/pdmdrv.h>
34#if defined (RT_OS_WINDOWS)
35#include <iphlpapi.h>
36#include <icmpapi.h>
37#endif
38
39#if defined(DECLARE_IOVEC) && defined(RT_OS_WINDOWS)
40AssertCompileMembersSameSizeAndOffset(struct iovec, iov_base, WSABUF, buf);
41AssertCompileMembersSameSizeAndOffset(struct iovec, iov_len, WSABUF, len);
42#endif
43
44#ifdef VBOX_WITH_NAT_UDP_SOCKET_CLONE
45/**
46 *
47 */
48struct socket * soCloneUDPSocketWithForegnAddr(PNATState pData, bool fBindSocket, struct socket *pSo, uint32_t u32ForeignAddr)
49{
50 struct socket *pNewSocket = NULL;
51 LogFlowFunc(("Enter: fBindSocket:%RTbool, so:%R[natsock], u32ForeignAddr:%RTnaipv4\n", fBindSocket, pSo, u32ForeignAddr));
52 pNewSocket = socreate();
53 if (!pNewSocket)
54 {
55 LogFunc(("Can't create socket\n"));
56 LogFlowFunc(("Leave: NULL\n"));
57 return NULL;
58 }
59 if (fBindSocket)
60 {
61 if (udp_attach(pData, pNewSocket, 0) <= 0)
62 {
63 sofree(pData, pNewSocket);
64 LogFunc(("Can't attach fresh created socket\n"));
65 return NULL;
66 }
67 }
68 else
69 {
70 pNewSocket->so_cloneOf = (struct socket *)pSo;
71 pNewSocket->s = pSo->s;
72 insque(pData, pNewSocket, &udb);
73 }
74 pNewSocket->so_laddr = pSo->so_laddr;
75 pNewSocket->so_lport = pSo->so_lport;
76 pNewSocket->so_faddr.s_addr = u32ForeignAddr;
77 pNewSocket->so_fport = pSo->so_fport;
78 pSo->so_cCloneCounter++;
79 LogFlowFunc(("Leave: %R[natsock]\n", pNewSocket));
80 return pNewSocket;
81}
82
83struct socket *soLookUpClonedUDPSocket(PNATState pData, const struct socket *pcSo, uint32_t u32ForeignAddress)
84{
85 struct socket *pSoClone = NULL;
86 LogFlowFunc(("Enter: pcSo:%R[natsock], u32ForeignAddress:%RTnaipv4\n", pcSo, u32ForeignAddress));
87 for (pSoClone = udb.so_next; pSoClone != &udb; pSoClone = pSoClone->so_next)
88 {
89 if ( pSoClone->so_cloneOf
90 && pSoClone->so_cloneOf == pcSo
91 && pSoClone->so_lport == pcSo->so_lport
92 && pSoClone->so_fport == pcSo->so_fport
93 && pSoClone->so_laddr.s_addr == pcSo->so_laddr.s_addr
94 && pSoClone->so_faddr.s_addr == u32ForeignAddress)
95 goto done;
96 }
97 pSoClone = NULL;
98done:
99 LogFlowFunc(("Leave: pSoClone: %R[natsock]\n", pSoClone));
100 return pSoClone;
101}
102#endif
103
104#ifdef VBOX_WITH_NAT_SEND2HOME
105DECLINLINE(bool) slirpSend2Home(PNATState pData, struct socket *pSo, const void *pvBuf, uint32_t cbBuf, int iFlags)
106{
107 int idxAddr;
108 int ret = 0;
109 bool fSendDone = false;
110 LogFlowFunc(("Enter pSo:%R[natsock] pvBuf: %p, cbBuf: %d, iFlags: %d\n", pSo, pvBuf, cbBuf, iFlags));
111 for (idxAddr = 0; idxAddr < pData->cInHomeAddressSize; ++idxAddr)
112 {
113
114 struct socket *pNewSocket = soCloneUDPSocketWithForegnAddr(pData, pSo, pData->pInSockAddrHomeAddress[idxAddr].sin_addr);
115 AssertReturn((pNewSocket, false));
116 pData->pInSockAddrHomeAddress[idxAddr].sin_port = pSo->so_fport;
117 /* @todo: more verbose on errors,
118 * @note: we shouldn't care if this send fail or not (we're in broadcast).
119 */
120 LogFunc(("send %d bytes to %RTnaipv4 from %R[natsock]\n", cbBuf, pData->pInSockAddrHomeAddress[idxAddr].sin_addr.s_addr, pNewSocket));
121 ret = sendto(pNewSocket->s, pvBuf, cbBuf, iFlags, (struct sockaddr *)&pData->pInSockAddrHomeAddress[idxAddr], sizeof(struct sockaddr_in));
122 if (ret < 0)
123 LogFunc(("Failed to send %d bytes to %RTnaipv4\n", cbBuf, pData->pInSockAddrHomeAddress[idxAddr].sin_addr.s_addr));
124 fSendDone |= ret > 0;
125 }
126 LogFlowFunc(("Leave %RTbool\n", fSendDone));
127 return fSendDone;
128}
129#endif /* !VBOX_WITH_NAT_SEND2HOME */
130static void send_icmp_to_guest(PNATState, char *, size_t, const struct sockaddr_in *);
131#ifdef RT_OS_WINDOWS
132static void sorecvfrom_icmp_win(PNATState, struct socket *);
133#else /* RT_OS_WINDOWS */
134static void sorecvfrom_icmp_unix(PNATState, struct socket *);
135#endif /* !RT_OS_WINDOWS */
136
/*
 * Initialisation hook of the socket module.
 * The body is empty: there is nothing to set up here.
 */
void
so_init()
{
}
141
142struct socket *
143solookup(struct socket *head, struct in_addr laddr,
144 u_int lport, struct in_addr faddr, u_int fport)
145{
146 struct socket *so;
147
148 for (so = head->so_next; so != head; so = so->so_next)
149 {
150 if ( so->so_lport == lport
151 && so->so_laddr.s_addr == laddr.s_addr
152 && so->so_faddr.s_addr == faddr.s_addr
153 && so->so_fport == fport)
154 return so;
155 }
156
157 return (struct socket *)NULL;
158}
159
160/*
161 * Create a new socket, initialise the fields
162 * It is the responsibility of the caller to
163 * insque() it into the correct linked-list
164 */
165struct socket *
166socreate()
167{
168 struct socket *so;
169
170 so = (struct socket *)RTMemAllocZ(sizeof(struct socket));
171 if (so)
172 {
173 so->so_state = SS_NOFDREF;
174 so->s = -1;
175#if !defined(RT_OS_WINDOWS)
176 so->so_poll_index = -1;
177#endif
178 }
179 return so;
180}
181
182/*
183 * remque and free a socket, clobber cache
184 */
185void
186sofree(PNATState pData, struct socket *so)
187{
188 LogFlowFunc(("ENTER:%R[natsock]\n", so));
189 /*
190 * We should not remove socket when polling routine do the polling
191 * instead we mark it for deletion.
192 */
193 if (so->fUnderPolling)
194 {
195 so->fShouldBeRemoved = 1;
196 LogFlowFunc(("LEAVE:%R[natsock] postponed deletion\n", so));
197 return;
198 }
199 /**
200 * Check that we don't freeng socket with tcbcb
201 */
202 Assert(!sototcpcb(so));
203 /* udp checks */
204 Assert(!so->so_timeout);
205 Assert(!so->so_timeout_arg);
206 if (so == tcp_last_so)
207 tcp_last_so = &tcb;
208 else if (so == udp_last_so)
209 udp_last_so = &udb;
210
211 /* check if mbuf haven't been already freed */
212 if (so->so_m != NULL)
213 {
214 m_freem(pData, so->so_m);
215 so->so_m = NULL;
216 }
217
218 if (so->so_next && so->so_prev)
219 {
220 remque(pData, so); /* crashes if so is not in a queue */
221 NSOCK_DEC();
222 }
223
224 RTMemFree(so);
225 LogFlowFuncLeave();
226}
227
228/*
229 * Read from so's socket into sb_snd, updating all relevant sbuf fields
230 * NOTE: This will only be called if it is select()ed for reading, so
231 * a read() of 0 (or less) means it's disconnected
232 */
233#ifndef VBOX_WITH_SLIRP_BSD_SBUF
234int
235soread(PNATState pData, struct socket *so)
236{
237 int n, nn, lss, total;
238 struct sbuf *sb = &so->so_snd;
239 size_t len = sb->sb_datalen - sb->sb_cc;
240 struct iovec iov[2];
241 int mss = so->so_tcpcb->t_maxseg;
242
243 STAM_PROFILE_START(&pData->StatIOread, a);
244 STAM_COUNTER_RESET(&pData->StatIORead_in_1);
245 STAM_COUNTER_RESET(&pData->StatIORead_in_2);
246
247 QSOCKET_LOCK(tcb);
248 SOCKET_LOCK(so);
249 QSOCKET_UNLOCK(tcb);
250
251 LogFlow(("soread: so = %R[natsock]\n", so));
252 Log2(("%s: so = %R[natsock] so->so_snd = %R[sbuf]\n", __PRETTY_FUNCTION__, so, sb));
253
254 /*
255 * No need to check if there's enough room to read.
256 * soread wouldn't have been called if there weren't
257 */
258
259 len = sb->sb_datalen - sb->sb_cc;
260
261 iov[0].iov_base = sb->sb_wptr;
262 iov[1].iov_base = 0;
263 iov[1].iov_len = 0;
264 if (sb->sb_wptr < sb->sb_rptr)
265 {
266 iov[0].iov_len = sb->sb_rptr - sb->sb_wptr;
267 /* Should never succeed, but... */
268 if (iov[0].iov_len > len)
269 iov[0].iov_len = len;
270 if (iov[0].iov_len > mss)
271 iov[0].iov_len -= iov[0].iov_len%mss;
272 n = 1;
273 }
274 else
275 {
276 iov[0].iov_len = (sb->sb_data + sb->sb_datalen) - sb->sb_wptr;
277 /* Should never succeed, but... */
278 if (iov[0].iov_len > len)
279 iov[0].iov_len = len;
280 len -= iov[0].iov_len;
281 if (len)
282 {
283 iov[1].iov_base = sb->sb_data;
284 iov[1].iov_len = sb->sb_rptr - sb->sb_data;
285 if (iov[1].iov_len > len)
286 iov[1].iov_len = len;
287 total = iov[0].iov_len + iov[1].iov_len;
288 if (total > mss)
289 {
290 lss = total % mss;
291 if (iov[1].iov_len > lss)
292 {
293 iov[1].iov_len -= lss;
294 n = 2;
295 }
296 else
297 {
298 lss -= iov[1].iov_len;
299 iov[0].iov_len -= lss;
300 n = 1;
301 }
302 }
303 else
304 n = 2;
305 }
306 else
307 {
308 if (iov[0].iov_len > mss)
309 iov[0].iov_len -= iov[0].iov_len%mss;
310 n = 1;
311 }
312 }
313
314#ifdef HAVE_READV
315 nn = readv(so->s, (struct iovec *)iov, n);
316#else
317 nn = recv(so->s, iov[0].iov_base, iov[0].iov_len, (so->so_tcpcb->t_force? MSG_OOB:0));
318#endif
319 Log2(("%s: read(1) nn = %d bytes\n", __PRETTY_FUNCTION__, nn));
320 Log2(("%s: so = %R[natsock] so->so_snd = %R[sbuf]\n", __PRETTY_FUNCTION__, so, sb));
321 if (nn <= 0)
322 {
323 /*
324 * Special case for WSAEnumNetworkEvents: If we receive 0 bytes that
325 * _could_ mean that the connection is closed. But we will receive an
326 * FD_CLOSE event later if the connection was _really_ closed. With
327 * www.youtube.com I see this very often. Closing the socket too early
328 * would be dangerous.
329 */
330 int status;
331 unsigned long pending = 0;
332 status = ioctlsocket(so->s, FIONREAD, &pending);
333 if (status < 0)
334 Log(("NAT:%s: error in WSAIoctl: %d\n", __PRETTY_FUNCTION__, errno));
335 if (nn == 0 && (pending != 0))
336 {
337 SOCKET_UNLOCK(so);
338 STAM_PROFILE_STOP(&pData->StatIOread, a);
339 return 0;
340 }
341 if ( nn < 0
342 && soIgnorableErrorCode(errno))
343 {
344 SOCKET_UNLOCK(so);
345 STAM_PROFILE_STOP(&pData->StatIOread, a);
346 return 0;
347 }
348 else
349 {
350 int fUninitiolizedTemplate = 0;
351 fUninitiolizedTemplate = RT_BOOL(( sototcpcb(so)
352 && ( sototcpcb(so)->t_template.ti_src.s_addr == INADDR_ANY
353 || sototcpcb(so)->t_template.ti_dst.s_addr == INADDR_ANY)));
354 /* nn == 0 means peer has performed an orderly shutdown */
355 Log2(("%s: disconnected, nn = %d, errno = %d (%s)\n",
356 __PRETTY_FUNCTION__, nn, errno, strerror(errno)));
357 sofcantrcvmore(so);
358 if (!fUninitiolizedTemplate)
359 tcp_sockclosed(pData, sototcpcb(so));
360 else
361 tcp_drop(pData, sototcpcb(so), errno);
362 SOCKET_UNLOCK(so);
363 STAM_PROFILE_STOP(&pData->StatIOread, a);
364 return -1;
365 }
366 }
367 STAM_STATS(
368 if (n == 1)
369 {
370 STAM_COUNTER_INC(&pData->StatIORead_in_1);
371 STAM_COUNTER_ADD(&pData->StatIORead_in_1_bytes, nn);
372 }
373 else
374 {
375 STAM_COUNTER_INC(&pData->StatIORead_in_2);
376 STAM_COUNTER_ADD(&pData->StatIORead_in_2_1st_bytes, nn);
377 }
378 );
379
380#ifndef HAVE_READV
381 /*
382 * If there was no error, try and read the second time round
383 * We read again if n = 2 (ie, there's another part of the buffer)
384 * and we read as much as we could in the first read
385 * We don't test for <= 0 this time, because there legitimately
386 * might not be any more data (since the socket is non-blocking),
387 * a close will be detected on next iteration.
388 * A return of -1 wont (shouldn't) happen, since it didn't happen above
389 */
390 if (n == 2 && nn == iov[0].iov_len)
391 {
392 int ret;
393 ret = recv(so->s, iov[1].iov_base, iov[1].iov_len, 0);
394 if (ret > 0)
395 nn += ret;
396 STAM_STATS(
397 if (ret > 0)
398 {
399 STAM_COUNTER_INC(&pData->StatIORead_in_2);
400 STAM_COUNTER_ADD(&pData->StatIORead_in_2_2nd_bytes, ret);
401 }
402 );
403 }
404
405 Log2(("%s: read(2) nn = %d bytes\n", __PRETTY_FUNCTION__, nn));
406#endif
407
408 /* Update fields */
409 sb->sb_cc += nn;
410 sb->sb_wptr += nn;
411 Log2(("%s: update so_snd (readed nn = %d) %R[sbuf]\n", __PRETTY_FUNCTION__, nn, sb));
412 if (sb->sb_wptr >= (sb->sb_data + sb->sb_datalen))
413 {
414 sb->sb_wptr -= sb->sb_datalen;
415 Log2(("%s: alter sb_wptr so_snd = %R[sbuf]\n", __PRETTY_FUNCTION__, sb));
416 }
417 STAM_PROFILE_STOP(&pData->StatIOread, a);
418 SOCKET_UNLOCK(so);
419 return nn;
420}
421#else /* VBOX_WITH_SLIRP_BSD_SBUF */
422int
423soread(PNATState pData, struct socket *so)
424{
425 int n;
426 char *buf;
427 struct sbuf *sb = &so->so_snd;
428 size_t len = sbspace(sb);
429 int mss = so->so_tcpcb->t_maxseg;
430
431 STAM_PROFILE_START(&pData->StatIOread, a);
432 STAM_COUNTER_RESET(&pData->StatIORead_in_1);
433 STAM_COUNTER_RESET(&pData->StatIORead_in_2);
434
435 QSOCKET_LOCK(tcb);
436 SOCKET_LOCK(so);
437 QSOCKET_UNLOCK(tcb);
438
439 LogFlowFunc(("soread: so = %lx\n", (long)so));
440
441 if (len > mss)
442 len -= len % mss;
443 buf = RTMemAlloc(len);
444 if (buf == NULL)
445 {
446 Log(("NAT: can't alloc enough memory\n"));
447 return -1;
448 }
449
450 n = recv(so->s, buf, len, (so->so_tcpcb->t_force? MSG_OOB:0));
451 if (n <= 0)
452 {
453 /*
454 * Special case for WSAEnumNetworkEvents: If we receive 0 bytes that
455 * _could_ mean that the connection is closed. But we will receive an
456 * FD_CLOSE event later if the connection was _really_ closed. With
457 * www.youtube.com I see this very often. Closing the socket too early
458 * would be dangerous.
459 */
460 int status;
461 unsigned long pending = 0;
462 status = ioctlsocket(so->s, FIONREAD, &pending);
463 if (status < 0)
464 Log(("NAT:error in WSAIoctl: %d\n", errno));
465 if (n == 0 && (pending != 0))
466 {
467 SOCKET_UNLOCK(so);
468 STAM_PROFILE_STOP(&pData->StatIOread, a);
469 RTMemFree(buf);
470 return 0;
471 }
472 if ( n < 0
473 && soIgnorableErrorCode(errno))
474 {
475 SOCKET_UNLOCK(so);
476 STAM_PROFILE_STOP(&pData->StatIOread, a);
477 RTMemFree(buf);
478 return 0;
479 }
480 else
481 {
482 Log2((" --- soread() disconnected, n = %d, errno = %d (%s)\n",
483 n, errno, strerror(errno)));
484 sofcantrcvmore(so);
485 tcp_sockclosed(pData, sototcpcb(so));
486 SOCKET_UNLOCK(so);
487 STAM_PROFILE_STOP(&pData->StatIOread, a);
488 RTMemFree(buf);
489 return -1;
490 }
491 }
492
493 sbuf_bcat(sb, buf, n);
494 RTMemFree(buf);
495 return n;
496}
497#endif
498
499/*
500 * Get urgent data
501 *
502 * When the socket is created, we set it SO_OOBINLINE,
503 * so when OOB data arrives, we soread() it and everything
504 * in the send buffer is sent as urgent data
505 */
506void
507sorecvoob(PNATState pData, struct socket *so)
508{
509 struct tcpcb *tp = sototcpcb(so);
510 ssize_t ret;
511
512 LogFlowFunc(("sorecvoob: so = %R[natsock]\n", so));
513
514 /*
515 * We take a guess at how much urgent data has arrived.
516 * In most situations, when urgent data arrives, the next
517 * read() should get all the urgent data. This guess will
518 * be wrong however if more data arrives just after the
519 * urgent data, or the read() doesn't return all the
520 * urgent data.
521 */
522 ret = soread(pData, so);
523 if (RT_LIKELY(ret > 0))
524 {
525 tp->snd_up = tp->snd_una + SBUF_LEN(&so->so_snd);
526 tp->t_force = 1;
527 tcp_output(pData, tp);
528 tp->t_force = 0;
529 }
530}
531#ifndef VBOX_WITH_SLIRP_BSD_SBUF
532/*
533 * Send urgent data
534 * There's a lot duplicated code here, but...
535 */
536int
537sosendoob(struct socket *so)
538{
539 struct sbuf *sb = &so->so_rcv;
540 char buff[2048]; /* XXX Shouldn't be sending more oob data than this */
541
542 int n, len;
543
544 LogFlowFunc(("sosendoob so = %R[natsock]\n", so));
545
546 if (so->so_urgc > sizeof(buff))
547 so->so_urgc = sizeof(buff); /* XXX */
548
549 if (sb->sb_rptr < sb->sb_wptr)
550 {
551 /* We can send it directly */
552 n = send(so->s, sb->sb_rptr, so->so_urgc, (MSG_OOB)); /* |MSG_DONTWAIT)); */
553 so->so_urgc -= n;
554
555 Log2((" --- sent %d bytes urgent data, %d urgent bytes left\n",
556 n, so->so_urgc));
557 }
558 else
559 {
560 /*
561 * Since there's no sendv or sendtov like writev,
562 * we must copy all data to a linear buffer then
563 * send it all
564 */
565 len = (sb->sb_data + sb->sb_datalen) - sb->sb_rptr;
566 if (len > so->so_urgc)
567 len = so->so_urgc;
568 memcpy(buff, sb->sb_rptr, len);
569 so->so_urgc -= len;
570 if (so->so_urgc)
571 {
572 n = sb->sb_wptr - sb->sb_data;
573 if (n > so->so_urgc)
574 n = so->so_urgc;
575 memcpy(buff + len, sb->sb_data, n);
576 so->so_urgc -= n;
577 len += n;
578 }
579 n = send(so->s, buff, len, (MSG_OOB)); /* |MSG_DONTWAIT)); */
580#ifdef DEBUG
581 if (n != len)
582 Log(("Didn't send all data urgently XXXXX\n"));
583#endif
584 Log2((" ---2 sent %d bytes urgent data, %d urgent bytes left\n",
585 n, so->so_urgc));
586 }
587
588 sb->sb_cc -= n;
589 sb->sb_rptr += n;
590 if (sb->sb_rptr >= (sb->sb_data + sb->sb_datalen))
591 sb->sb_rptr -= sb->sb_datalen;
592
593 return n;
594}
595
/*
 * Write data from so_rcv to so's socket,
 * updating all sbuf field as necessary
 *
 * Pending urgent data (so_urgc != 0) is handed to sosendoob() first.  The
 * remaining buffered bytes are described by up to two iovec entries (the
 * second one covers the part that wrapped to the start of the ring buffer)
 * and written with writev() or, without HAVE_READV, with one or two send()
 * calls.
 *
 * Returns the number of bytes written; 0 if the buffer was empty after the
 * urgent data went out or the write failed with an error that
 * soIgnorableErrorCode() accepts; -1 after the socket was marked unable to
 * send and tcp_sockclosed() was called.
 */
int
sowrite(PNATState pData, struct socket *so)
{
    int n, nn;
    struct sbuf *sb = &so->so_rcv;
    size_t len = sb->sb_cc;
    struct iovec iov[2];

    STAM_PROFILE_START(&pData->StatIOwrite, a);
    STAM_COUNTER_RESET(&pData->StatIOWrite_in_1);
    STAM_COUNTER_RESET(&pData->StatIOWrite_in_1_bytes);
    STAM_COUNTER_RESET(&pData->StatIOWrite_in_2);
    STAM_COUNTER_RESET(&pData->StatIOWrite_in_2_1st_bytes);
    STAM_COUNTER_RESET(&pData->StatIOWrite_in_2_2nd_bytes);
    STAM_COUNTER_RESET(&pData->StatIOWrite_no_w);
    STAM_COUNTER_RESET(&pData->StatIOWrite_rest);
    STAM_COUNTER_RESET(&pData->StatIOWrite_rest_bytes);
    LogFlowFunc(("so = %R[natsock]\n", so));
    Log2(("%s: so = %R[natsock] so->so_rcv = %R[sbuf]\n", __PRETTY_FUNCTION__, so, sb));
    QSOCKET_LOCK(tcb);
    SOCKET_LOCK(so);
    QSOCKET_UNLOCK(tcb);
    /* Urgent data goes out first; its return value is not examined here. */
    if (so->so_urgc)
    {
        sosendoob(so);
        if (sb->sb_cc == 0)
        {
            /* The urgent send drained the buffer - nothing left to write. */
            SOCKET_UNLOCK(so);
            STAM_PROFILE_STOP(&pData->StatIOwrite, a);
            return 0;
        }
    }

    /*
     * No need to check if there's something to write,
     * sowrite wouldn't have been called otherwise
     */

    len = sb->sb_cc;

    iov[0].iov_base = sb->sb_rptr;
    iov[1].iov_base = 0;
    iov[1].iov_len = 0;
    if (sb->sb_rptr < sb->sb_wptr)
    {
        /* Buffered data is one contiguous run between read and write pointer. */
        iov[0].iov_len = sb->sb_wptr - sb->sb_rptr;
        /* Should never succeed, but... */
        if (iov[0].iov_len > len)
            iov[0].iov_len = len;
        n = 1;
    }
    else
    {
        /* Data runs to the end of the buffer and may continue at its start. */
        iov[0].iov_len = (sb->sb_data + sb->sb_datalen) - sb->sb_rptr;
        if (iov[0].iov_len > len)
            iov[0].iov_len = len;
        len -= iov[0].iov_len;
        if (len)
        {
            iov[1].iov_base = sb->sb_data;
            iov[1].iov_len = sb->sb_wptr - sb->sb_data;
            if (iov[1].iov_len > len)
                iov[1].iov_len = len;
            n = 2;
        }
        else
            n = 1;
    }
    STAM_STATS({
        if (n == 1)
        {
            STAM_COUNTER_INC(&pData->StatIOWrite_in_1);
            STAM_COUNTER_ADD(&pData->StatIOWrite_in_1_bytes, iov[0].iov_len);
        }
        else
        {
            STAM_COUNTER_INC(&pData->StatIOWrite_in_2);
            STAM_COUNTER_ADD(&pData->StatIOWrite_in_2_1st_bytes, iov[0].iov_len);
            STAM_COUNTER_ADD(&pData->StatIOWrite_in_2_2nd_bytes, iov[1].iov_len);
        }
    });
    /* Write the first part (or, with writev, both parts at once). */
#ifdef HAVE_READV
    nn = writev(so->s, (const struct iovec *)iov, n);
#else
    nn = send(so->s, iov[0].iov_base, iov[0].iov_len, 0);
#endif
    Log2(("%s: wrote(1) nn = %d bytes\n", __PRETTY_FUNCTION__, nn));
    /* This should never happen, but people tell me it does *shrug* */
    if (   nn < 0
        && soIgnorableErrorCode(errno))
    {
        SOCKET_UNLOCK(so);
        STAM_PROFILE_STOP(&pData->StatIOwrite, a);
        return 0;
    }

    /* Any other error, or a zero-byte write although data was offered, ends the sending side. */
    if (nn < 0 || (nn == 0 && iov[0].iov_len > 0))
    {
        Log2(("%s: disconnected, so->so_state = %x, errno = %d\n",
              __PRETTY_FUNCTION__, so->so_state, errno));
        sofcantsendmore(so);
        tcp_sockclosed(pData, sototcpcb(so));
        SOCKET_UNLOCK(so);
        STAM_PROFILE_STOP(&pData->StatIOwrite, a);
        return -1;
    }

#ifndef HAVE_READV
    /* Second part is attempted only if the first part went out completely. */
    if (n == 2 && nn == iov[0].iov_len)
    {
        int ret;
        ret = send(so->s, iov[1].iov_base, iov[1].iov_len, 0);
        if (ret > 0)
            nn += ret;
        STAM_STATS({
            if (ret > 0 && ret != iov[1].iov_len)
            {
                STAM_COUNTER_INC(&pData->StatIOWrite_rest);
                STAM_COUNTER_ADD(&pData->StatIOWrite_rest_bytes, (iov[1].iov_len - ret));
            }
        });
    }
    Log2(("%s: wrote(2) nn = %d bytes\n", __PRETTY_FUNCTION__, nn));
#endif

    /* Update sbuf */
    sb->sb_cc -= nn;
    sb->sb_rptr += nn;
    Log2(("%s: update so_rcv (written nn = %d) %R[sbuf]\n", __PRETTY_FUNCTION__, nn, sb));
    if (sb->sb_rptr >= (sb->sb_data + sb->sb_datalen))
    {
        /* The read pointer ran off the end of the ring - wrap it around. */
        sb->sb_rptr -= sb->sb_datalen;
        Log2(("%s: alter sb_rptr of so_rcv %R[sbuf]\n", __PRETTY_FUNCTION__, sb));
    }

    /*
     * If in DRAIN mode, and there's no more data, set
     * it CANTSENDMORE
     */
    if ((so->so_state & SS_FWDRAIN) && sb->sb_cc == 0)
        sofcantsendmore(so);

    SOCKET_UNLOCK(so);
    STAM_PROFILE_STOP(&pData->StatIOwrite, a);
    return nn;
}
747#else /* VBOX_WITH_SLIRP_BSD_SBUF */
748static int
749do_sosend(struct socket *so, int fUrg)
750{
751 struct sbuf *sb = &so->so_rcv;
752
753 int n, len;
754
755 LogFlowFunc(("sosendoob: so = %R[natsock]\n", so));
756
757 len = sbuf_len(sb);
758
759 n = send(so->s, sbuf_data(sb), len, (fUrg ? MSG_OOB : 0));
760 if (n < 0)
761 Log(("NAT: Can't sent sbuf via socket.\n"));
762 if (fUrg)
763 so->so_urgc -= n;
764 if (n > 0 && n < len)
765 {
766 char *ptr;
767 char *buff;
768 buff = RTMemAlloc(len);
769 if (buff == NULL)
770 {
771 Log(("NAT: No space to allocate temporal buffer\n"));
772 return -1;
773 }
774 ptr = sbuf_data(sb);
775 memcpy(buff, &ptr[n], len - n);
776 sbuf_bcpy(sb, buff, len - n);
777 RTMemFree(buff);
778 return n;
779 }
780 sbuf_clear(sb);
781 return n;
782}
783int
784sosendoob(struct socket *so)
785{
786 return do_sosend(so, 1);
787}
788
789/*
790 * Write data from so_rcv to so's socket,
791 * updating all sbuf field as necessary
792 */
793int
794sowrite(PNATState pData, struct socket *so)
795{
796 return do_sosend(so, 0);
797}
798#endif
799
800/*
801 * recvfrom() a UDP socket
802 */
803void
804sorecvfrom(PNATState pData, struct socket *so)
805{
806 LogFlowFunc(("sorecvfrom: so = %lx\n", (long)so));
807
808 if (so->so_type == IPPROTO_ICMP)
809 {
810 /* This is a "ping" reply */
811#ifdef RT_OS_WINDOWS
812 sorecvfrom_icmp_win(pData, so);
813#else /* RT_OS_WINDOWS */
814 sorecvfrom_icmp_unix(pData, so);
815#endif /* !RT_OS_WINDOWS */
816 udp_detach(pData, so);
817 }
818 else
819 {
820 static uint8_t au8Buf[64 * 1024];
821
822 /* A "normal" UDP packet */
823 struct sockaddr_in addr;
824 socklen_t addrlen = sizeof(struct sockaddr_in);
825 struct iovec iov[2];
826 ssize_t nread;
827 struct mbuf *m;
828
829 QSOCKET_LOCK(udb);
830 SOCKET_LOCK(so);
831 QSOCKET_UNLOCK(udb);
832
833 m = m_getjcl(pData, M_NOWAIT, MT_HEADER, M_PKTHDR, slirp_size(pData));
834 if (m == NULL)
835 {
836 SOCKET_UNLOCK(so);
837 return;
838 }
839
840 m->m_data += ETH_HLEN;
841 m->m_pkthdr.header = mtod(m, void *);
842
843 m->m_data += sizeof(struct udpiphdr);
844
845 /* small packets will fit without copying */
846 iov[0].iov_base = mtod(m, char *);
847 iov[0].iov_len = M_TRAILINGSPACE(m);
848
849 /* large packets will spill into a temp buffer */
850 iov[1].iov_base = au8Buf;
851 iov[1].iov_len = sizeof(au8Buf);
852
853#if !defined(RT_OS_WINDOWS)
854 {
855 struct msghdr mh;
856 memset(&mh, 0, sizeof(mh));
857
858 mh.msg_iov = iov;
859 mh.msg_iovlen = 2;
860 mh.msg_name = &addr;
861 mh.msg_namelen = addrlen;
862
863 nread = recvmsg(so->s, &mh, 0);
864 }
865#else /* RT_OS_WINDOWS */
866 {
867 DWORD nbytes; /* NB: can't use nread b/c of different size */
868 DWORD flags;
869 int status;
870
871 flags = 0;
872 status = WSARecvFrom(so->s, iov, 2, &nbytes, &flags,
873 (struct sockaddr *)&addr, &addrlen,
874 NULL, NULL);
875 nread = (status != SOCKET_ERROR) ? nbytes : -1;
876 }
877#endif
878 if (nread >= 0)
879 {
880 if (nread <= iov[0].iov_len)
881 m->m_len = nread;
882 else
883 {
884 m->m_len = iov[0].iov_len;
885 m_append(pData, m, nread - iov[0].iov_len, iov[1].iov_base);
886 }
887 Assert((m_length(m, NULL) == nread));
888
889 /*
890 * Hack: domain name lookup will be used the most for UDP,
891 * and since they'll only be used once there's no need
892 * for the 4 minute (or whatever) timeout... So we time them
893 * out much quicker (10 seconds for now...)
894 */
895 if (so->so_expire)
896 {
897 if (so->so_fport != RT_H2N_U16_C(53))
898 so->so_expire = curtime + SO_EXPIRE;
899 }
900
901 /*
902 * last argument should be changed if Slirp will inject IP attributes
903 * Note: Here we can't check if dnsproxy's sent initial request
904 */
905 if ( pData->fUseDnsProxy
906 && so->so_fport == RT_H2N_U16_C(53))
907 dnsproxy_answer(pData, so, m);
908
909 /* packets definetly will be fragmented, could confuse receiver peer. */
910 if (nread > if_mtu)
911 m->m_flags |= M_SKIP_FIREWALL;
912
913 /*
914 * If this packet was destined for CTL_ADDR,
915 * make it look like that's where it came from, done by udp_output
916 */
917 udp_output(pData, so, m, &addr);
918 }
919 else
920 {
921 m_freem(pData, m);
922 so->so_m = NULL;
923
924 if (!soIgnorableErrorCode(errno))
925 {
926 u_char code;
927 if (errno == EHOSTUNREACH)
928 code = ICMP_UNREACH_HOST;
929 else if (errno == ENETUNREACH)
930 code = ICMP_UNREACH_NET;
931 else
932 code = ICMP_UNREACH_PORT;
933
934 Log2((" rx error, tx icmp ICMP_UNREACH:%i\n", code));
935 icmp_error(pData, so->so_m, ICMP_UNREACH, code, 0, strerror(errno));
936 }
937 }
938
939 SOCKET_UNLOCK(so);
940 }
941}
942
943/*
944 * sendto() a socket
945 */
946int
947sosendto(PNATState pData, struct socket *so, struct mbuf *m)
948{
949 int ret;
950 struct sockaddr_in *paddr;
951 struct sockaddr addr;
952#if 0
953 struct sockaddr_in host_addr;
954#endif
955 caddr_t buf = 0;
956 int mlen;
957
958 LogFlowFunc(("sosendto: so = %R[natsock], m = %lx\n", so, (long)m));
959
960 memset(&addr, 0, sizeof(struct sockaddr));
961#ifdef RT_OS_DARWIN
962 addr.sa_len = sizeof(struct sockaddr_in);
963#endif
964 paddr = (struct sockaddr_in *)&addr;
965 paddr->sin_family = AF_INET;
966 if ((so->so_faddr.s_addr & RT_H2N_U32(pData->netmask)) == pData->special_addr.s_addr)
967 {
968 /* It's an alias */
969 uint32_t last_byte = RT_N2H_U32(so->so_faddr.s_addr) & ~pData->netmask;
970 switch(last_byte)
971 {
972#if 0
973 /* handle this case at 'default:' */
974 case CTL_BROADCAST:
975 addr.sin_addr.s_addr = INADDR_BROADCAST;
976 /* Send the packet to host to fully emulate broadcast */
977 /** @todo r=klaus: on Linux host this causes the host to receive
978 * the packet twice for some reason. And I cannot find any place
979 * in the man pages which states that sending a broadcast does not
980 * reach the host itself. */
981 host_addr.sin_family = AF_INET;
982 host_addr.sin_port = so->so_fport;
983 host_addr.sin_addr = our_addr;
984 sendto(so->s, m->m_data, m->m_len, 0,
985 (struct sockaddr *)&host_addr, sizeof (struct sockaddr));
986 break;
987#endif
988 case CTL_DNS:
989 case CTL_ALIAS:
990 default:
991 if (last_byte == ~pData->netmask)
992 paddr->sin_addr.s_addr = INADDR_BROADCAST;
993 else
994 paddr->sin_addr = loopback_addr;
995 break;
996 }
997 }
998 else
999 paddr->sin_addr = so->so_faddr;
1000 paddr->sin_port = so->so_fport;
1001
1002 Log2((" sendto()ing, addr.sin_port=%d, addr.sin_addr.s_addr=%.16s\n",
1003 RT_N2H_U16(paddr->sin_port), inet_ntoa(paddr->sin_addr)));
1004
1005 /* Don't care what port we get */
1006 /*
1007 * > nmap -sV -T4 -O -A -v -PU3483 255.255.255.255
1008 * generates bodyless messages, annoying memmory management system.
1009 */
1010 mlen = m_length(m, NULL);
1011 if (mlen > 0)
1012 {
1013 buf = RTMemAlloc(mlen);
1014 if (buf == NULL)
1015 {
1016 return -1;
1017 }
1018 m_copydata(m, 0, mlen, buf);
1019 }
1020 ret = sendto(so->s, buf, mlen, 0,
1021 (struct sockaddr *)&addr, sizeof (struct sockaddr));
1022#ifdef VBOX_WITH_NAT_SEND2HOME
1023 if (slirpIsWideCasting(pData, so->so_faddr.s_addr))
1024 {
1025 slirpSend2Home(pData, so, buf, mlen, 0);
1026 }
1027#endif
1028 if (buf)
1029 RTMemFree(buf);
1030 if (ret < 0)
1031 {
1032 Log2(("UDP: sendto fails (%s)\n", strerror(errno)));
1033 return -1;
1034 }
1035
1036 /*
1037 * Kill the socket if there's no reply in 4 minutes,
1038 * but only if it's an expirable socket
1039 */
1040 if (so->so_expire)
1041 so->so_expire = curtime + SO_EXPIRE;
1042 so->so_state = SS_ISFCONNECTED; /* So that it gets select()ed */
1043 return 0;
1044}
1045
/*
 * XXX This should really be tcp_listen
 *
 * Creates a listening TCP socket on the host, bound to bind_addr:port, and a
 * matching slirp socket queued on tcb.
 *
 * The slirp socket gets so_lport/so_laddr from lport/laddr, so_fport from
 * the port reported by getsockname(), and so_faddr set to alias_addr when
 * the bound address is 0 or the loopback address (otherwise to the bound
 * address).  Its state is SS_FACCEPTCONN combined with flags.
 *
 * Returns the new socket, or NULL on failure.  When creating, binding or
 * listening on the host socket fails, the error code of that failure is
 * restored (errno / WSASetLastError) before returning.
 */
struct socket *
solisten(PNATState pData, u_int32_t bind_addr, u_int port, u_int32_t laddr, u_int lport, int flags)
{
    struct sockaddr_in addr;
    struct socket *so;
    socklen_t addrlen = sizeof(addr);
    int s, opt = 1;
    int status;

    LogFlowFunc(("solisten: port = %d, laddr = %x, lport = %d, flags = %x\n", port, laddr, lport, flags));

    if ((so = socreate()) == NULL)
    {
        /* RTMemFree(so); Not sofree() ??? free(NULL) == NOP */
        return NULL;
    }

    /* Don't tcp_attach... we don't need so_snd nor so_rcv */
    if ((so->so_tcpcb = tcp_newtcpcb(pData, so)) == NULL)
    {
        /* The socket is not queued yet, so a plain free is sufficient. */
        RTMemFree(so);
        return NULL;
    }

    SOCKET_LOCK_CREATE(so);
    SOCKET_LOCK(so);
    QSOCKET_LOCK(tcb);
    insque(pData, so,&tcb);
    NSOCK_INC();
    QSOCKET_UNLOCK(tcb);

    /*
     * SS_FACCEPTONCE sockets must time out.
     */
    if (flags & SS_FACCEPTONCE)
        so->so_tcpcb->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT*2;

    so->so_state = (SS_FACCEPTCONN|flags);
    so->so_lport = lport; /* Kept in network format */
    so->so_laddr.s_addr = laddr; /* Ditto */

    memset(&addr, 0, sizeof(addr));
#ifdef RT_OS_DARWIN
    addr.sin_len = sizeof(addr);
#endif
    addr.sin_family = AF_INET;
    addr.sin_addr.s_addr = bind_addr;
    addr.sin_port = port;

    /**
     * changing listen(,1->SOMAXCONN) shouldn't be harmful for NAT's TCP/IP stack,
     * kernel will choose the optimal value for requests queue length.
     * @note: MSDN recommends low (2-4) values for bluetooth networking devices.
     */
    if (   ((s = socket(AF_INET, SOCK_STREAM, 0)) < 0)
        || (setsockopt(s, SOL_SOCKET, SO_REUSEADDR,(char *)&opt, sizeof(int)) < 0)
        || (bind(s,(struct sockaddr *)&addr, sizeof(addr)) < 0)
        || (listen(s, pData->soMaxConn) < 0))
    {
        /*
         * NOTE(review): the two platform branches clean up differently - the
         * Windows one calls sofree() although a tcpcb was attached above
         * (sofree() asserts there is none), the other one goes through
         * tcp_close().  Also, if socket() itself failed, s is negative when
         * it is closed here.  Confirm whether this is intended.
         */
#ifdef RT_OS_WINDOWS
        int tmperrno = WSAGetLastError(); /* Don't clobber the real reason we failed */
        closesocket(s);
        QSOCKET_LOCK(tcb);
        sofree(pData, so);
        QSOCKET_UNLOCK(tcb);
        /* Restore the real errno */
        WSASetLastError(tmperrno);
#else
        int tmperrno = errno; /* Don't clobber the real reason we failed */
        close(s);
        if (sototcpcb(so))
            tcp_close(pData, sototcpcb(so));
        else
            sofree(pData, so);
        /* Restore the real errno */
        errno = tmperrno;
#endif
        return NULL;
    }
    fd_nonblock(s);
    /* opt is still 1 here: enable in-line delivery of out-of-band data. */
    setsockopt(s, SOL_SOCKET, SO_OOBINLINE,(char *)&opt, sizeof(int));

    /* Fetch the address/port actually bound; the result of getsockname() is not checked. */
    getsockname(s,(struct sockaddr *)&addr,&addrlen);
    so->so_fport = addr.sin_port;
    /* set socket buffers */
    opt = pData->socket_rcv;
    status = setsockopt(s, SOL_SOCKET, SO_RCVBUF, (char *)&opt, sizeof(int));
    if (status < 0)
    {
        /* Not fatal: log it and skip the remaining buffer tuning. */
        LogRel(("NAT: Error(%d) while setting RCV capacity to (%d)\n", errno, opt));
        goto no_sockopt;
    }
    opt = pData->socket_snd;
    status = setsockopt(s, SOL_SOCKET, SO_SNDBUF, (char *)&opt, sizeof(int));
    if (status < 0)
    {
        LogRel(("NAT: Error(%d) while setting SND capacity to (%d)\n", errno, opt));
        goto no_sockopt;
    }
no_sockopt:
    /* A wildcard or loopback binding is presented under the alias address. */
    if (addr.sin_addr.s_addr == 0 || addr.sin_addr.s_addr == loopback_addr.s_addr)
        so->so_faddr = alias_addr;
    else
        so->so_faddr = addr.sin_addr;

    so->s = s;
    SOCKET_UNLOCK(so);
    return so;
}
1158
/**
 * Notification hook: data is available in so_rcv.
 *
 * Currently does nothing. The disabled code below is the originally intended
 * action, i.e. write() the buffered data to the socket right away.
 *
 * @param   so      The socket the notification is about (unused).
 * @todo do we really need this function, what is it intended to do?
 */
void sorwakeup(struct socket *so)
{
#if 0 /* XXX not yet... */
    sowrite(so);
    FD_CLR(so->s,&writefds);
#endif
    NOREF(so);
}
1174
/**
 * Notification hook: data has been freed in so_snd.
 *
 * There would be room for a read() now, but nothing is read here;
 * reading is left to the main loop.
 *
 * @param   so      The socket the notification is about (unused).
 */
void sowwakeup(struct socket *so)
{
    NOREF(so);
}
1185
1186/*
1187 * Various session state calls
1188 * XXX Should be #define's
1189 * The socket state stuff needs work, these often get call 2 or 3
1190 * times each when only 1 was needed
1191 */
1192void
1193soisfconnecting(struct socket *so)
1194{
1195 so->so_state &= ~(SS_NOFDREF|SS_ISFCONNECTED|SS_FCANTRCVMORE|
1196 SS_FCANTSENDMORE|SS_FWDRAIN);
1197 so->so_state |= SS_ISFCONNECTING; /* Clobber other states */
1198}
1199
1200void
1201soisfconnected(struct socket *so)
1202{
1203 LogFlowFunc(("ENTER: so:%R[natsock]\n", so));
1204 so->so_state &= ~(SS_ISFCONNECTING|SS_FWDRAIN|SS_NOFDREF);
1205 so->so_state |= SS_ISFCONNECTED; /* Clobber other states */
1206 LogFlowFunc(("LEAVE: so:%R[natsock]\n", so));
1207}
1208
1209void
1210sofcantrcvmore(struct socket *so)
1211{
1212 LogFlowFunc(("ENTER: so:%R[natsock]\n", so));
1213 if ((so->so_state & SS_NOFDREF) == 0)
1214 {
1215 shutdown(so->s, 0);
1216 }
1217 so->so_state &= ~(SS_ISFCONNECTING);
1218 if (so->so_state & SS_FCANTSENDMORE)
1219 so->so_state = SS_NOFDREF; /* Don't select it */
1220 /* XXX close() here as well? */
1221 else
1222 so->so_state |= SS_FCANTRCVMORE;
1223 LogFlowFuncLeave();
1224}
1225
1226void
1227sofcantsendmore(struct socket *so)
1228{
1229 LogFlowFunc(("ENTER: so:%R[natsock]\n", so));
1230 if ((so->so_state & SS_NOFDREF) == 0)
1231 shutdown(so->s, 1); /* send FIN to fhost */
1232
1233 so->so_state &= ~(SS_ISFCONNECTING);
1234 if (so->so_state & SS_FCANTRCVMORE)
1235 so->so_state = SS_NOFDREF; /* as above */
1236 else
1237 so->so_state |= SS_FCANTSENDMORE;
1238 LogFlowFuncLeave();
1239}
1240
/**
 * Session state call for a disconnected foreign side.
 *
 * Currently does nothing; the historical implementation is kept disabled below.
 *
 * @param   so      The socket the call is about (unused).
 */
void soisfdisconnected(struct socket *so)
{
#if 0
    so->so_state &= ~(SS_ISFCONNECTING|SS_ISFCONNECTED);
    close(so->s);
    so->so_state = SS_ISFDISCONNECTED;
    /*
     * XXX Do nothing ... ?
     */
#endif
    NOREF(so);
}
1254
1255/*
1256 * Set write drain mode
1257 * Set CANTSENDMORE once all data has been write()n
1258 */
1259void
1260sofwdrain(struct socket *so)
1261{
1262 if (SBUF_LEN(&so->so_rcv))
1263 so->so_state |= SS_FWDRAIN;
1264 else
1265 sofcantsendmore(so);
1266}
1267
1268static void
1269send_icmp_to_guest(PNATState pData, char *buff, size_t len, const struct sockaddr_in *addr)
1270{
1271 struct ip *ip;
1272 uint32_t dst, src;
1273 char ip_copy[256];
1274 struct icmp *icp;
1275 int old_ip_len = 0;
1276 int hlen, original_hlen = 0;
1277 struct mbuf *m;
1278 struct icmp_msg *icm;
1279 uint8_t proto;
1280 int type = 0;
1281
1282 ip = (struct ip *)buff;
1283 /* Fix ip->ip_len to contain the total packet length including the header
1284 * in _host_ byte order for all OSes. On Darwin, that value already is in
1285 * host byte order. Solaris and Darwin report only the payload. */
1286#ifndef RT_OS_DARWIN
1287 ip->ip_len = RT_N2H_U16(ip->ip_len);
1288#endif
1289 hlen = (ip->ip_hl << 2);
1290#if defined(RT_OS_SOLARIS) || defined(RT_OS_DARWIN)
1291 ip->ip_len += hlen;
1292#endif
1293 if (ip->ip_len < hlen + ICMP_MINLEN)
1294 {
1295 Log(("send_icmp_to_guest: ICMP header is too small to understand which type/subtype of the datagram\n"));
1296 return;
1297 }
1298 icp = (struct icmp *)((char *)ip + hlen);
1299
1300 Log(("ICMP:received msg(t:%d, c:%d)\n", icp->icmp_type, icp->icmp_code));
1301 if ( icp->icmp_type != ICMP_ECHOREPLY
1302 && icp->icmp_type != ICMP_TIMXCEED
1303 && icp->icmp_type != ICMP_UNREACH)
1304 {
1305 return;
1306 }
1307
1308 /*
1309 * ICMP_ECHOREPLY, ICMP_TIMXCEED, ICMP_UNREACH minimal header size is
1310 * ICMP_ECHOREPLY assuming data 0
1311 * icmp_{type(8), code(8), cksum(16),identifier(16),seqnum(16)}
1312 */
1313 if (ip->ip_len < hlen + 8)
1314 {
1315 Log(("send_icmp_to_guest: NAT accept ICMP_{ECHOREPLY, TIMXCEED, UNREACH} the minimum size is 64 (see rfc792)\n"));
1316 return;
1317 }
1318
1319 type = icp->icmp_type;
1320 if ( type == ICMP_TIMXCEED
1321 || type == ICMP_UNREACH)
1322 {
1323 /*
1324 * ICMP_TIMXCEED, ICMP_UNREACH minimal header size is
1325 * icmp_{type(8), code(8), cksum(16),unused(32)} + IP header + 64 bit of original datagram
1326 */
1327 if (ip->ip_len < hlen + 2*8 + sizeof(struct ip))
1328 {
1329 Log(("send_icmp_to_guest: NAT accept ICMP_{TIMXCEED, UNREACH} the minimum size of ipheader + 64 bit of data (see rfc792)\n"));
1330 return;
1331 }
1332 ip = &icp->icmp_ip;
1333 }
1334
1335 icm = icmp_find_original_mbuf(pData, ip);
1336 if (icm == NULL)
1337 {
1338 Log(("NAT: Can't find the corresponding packet for the received ICMP\n"));
1339 return;
1340 }
1341
1342 m = icm->im_m;
1343 if (!m)
1344 {
1345 LogFunc(("%R[natsock] hasn't stored it's mbuf on sent\n", icm->im_so));
1346 LIST_REMOVE(icm, im_list);
1347 RTMemFree(icm);
1348 return;
1349 }
1350
1351 src = addr->sin_addr.s_addr;
1352 if (type == ICMP_ECHOREPLY)
1353 {
1354 struct ip *ip0 = mtod(m, struct ip *);
1355 struct icmp *icp0 = (struct icmp *)((char *)ip0 + (ip0->ip_hl << 2));
1356 if (icp0->icmp_type != ICMP_ECHO)
1357 {
1358 Log(("NAT: we haven't found echo for this reply\n"));
1359 return;
1360 }
1361 /*
1362 * while combining buffer to send (see ip_icmp.c) we control ICMP header only,
1363 * IP header combined by OS network stack, our local copy of IP header contians values
1364 * in host byte order so no byte order conversion is required. IP headers fields are converting
1365 * in ip_output0 routine only.
1366 */
1367 if ( (ip->ip_len - hlen)
1368 != (ip0->ip_len - (ip0->ip_hl << 2)))
1369 {
1370 Log(("NAT: ECHO(%d) lenght doesn't match ECHOREPLY(%d)\n",
1371 (ip->ip_len - hlen), (ip0->ip_len - (ip0->ip_hl << 2))));
1372 return;
1373 }
1374 }
1375
1376 /* ip points on origianal ip header */
1377 ip = mtod(m, struct ip *);
1378 proto = ip->ip_p;
1379 /* Now ip is pointing on header we've sent from guest */
1380 if ( icp->icmp_type == ICMP_TIMXCEED
1381 || icp->icmp_type == ICMP_UNREACH)
1382 {
1383 old_ip_len = (ip->ip_hl << 2) + 64;
1384 if (old_ip_len > sizeof(ip_copy))
1385 old_ip_len = sizeof(ip_copy);
1386 memcpy(ip_copy, ip, old_ip_len);
1387 }
1388
1389 /* source address from original IP packet*/
1390 dst = ip->ip_src.s_addr;
1391
1392 /* overide ther tail of old packet */
1393 ip = mtod(m, struct ip *); /* ip is from mbuf we've overrided */
1394 original_hlen = ip->ip_hl << 2;
1395 /* saves original ip header and options */
1396 m_copyback(pData, m, original_hlen, len - hlen, buff + hlen);
1397 ip->ip_len = m_length(m, NULL);
1398 ip->ip_p = IPPROTO_ICMP; /* the original package could be whatever, but we're response via ICMP*/
1399
1400 icp = (struct icmp *)((char *)ip + (ip->ip_hl << 2));
1401 type = icp->icmp_type;
1402 if ( type == ICMP_TIMXCEED
1403 || type == ICMP_UNREACH)
1404 {
1405 /* according RFC 793 error messages required copy of initial IP header + 64 bit */
1406 memcpy(&icp->icmp_ip, ip_copy, old_ip_len);
1407 ip->ip_tos = ((ip->ip_tos & 0x1E) | 0xC0); /* high priority for errors */
1408 }
1409
1410 ip->ip_src.s_addr = src;
1411 ip->ip_dst.s_addr = dst;
1412 icmp_reflect(pData, m);
1413 LIST_REMOVE(icm, im_list);
1414 pData->cIcmpCacheSize--;
1415 /* Don't call m_free here*/
1416
1417 if ( type == ICMP_TIMXCEED
1418 || type == ICMP_UNREACH)
1419 {
1420 icm->im_so->so_m = NULL;
1421 switch (proto)
1422 {
1423 case IPPROTO_UDP:
1424 /*XXX: so->so_m already freed so we shouldn't call sofree */
1425 udp_detach(pData, icm->im_so);
1426 break;
1427 case IPPROTO_TCP:
1428 /*close tcp should be here */
1429 break;
1430 default:
1431 /* do nothing */
1432 break;
1433 }
1434 }
1435 RTMemFree(icm);
1436}
1437
1438#ifdef RT_OS_WINDOWS
1439static void
1440sorecvfrom_icmp_win(PNATState pData, struct socket *so)
1441{
1442 int len;
1443 int i;
1444 struct ip *ip;
1445 struct mbuf *m;
1446 struct icmp *icp;
1447 struct icmp_msg *icm;
1448 struct ip *ip_broken; /* ICMP returns header + 64 bit of packet */
1449 uint32_t src;
1450 ICMP_ECHO_REPLY *icr;
1451 int hlen = 0;
1452 int nbytes = 0;
1453 u_char code = ~0;
1454 int out_len;
1455 int size;
1456
1457 len = pData->pfIcmpParseReplies(pData->pvIcmpBuffer, pData->cbIcmpBuffer);
1458 if (len < 0)
1459 {
1460 LogRel(("NAT: Error (%d) occurred on ICMP receiving\n", GetLastError()));
1461 return;
1462 }
1463 if (len == 0)
1464 return; /* no error */
1465
1466 icr = (ICMP_ECHO_REPLY *)pData->pvIcmpBuffer;
1467 for (i = 0; i < len; ++i)
1468 {
1469 LogFunc(("icr[%d] Data:%p, DataSize:%d\n",
1470 i, icr[i].Data, icr[i].DataSize));
1471 switch(icr[i].Status)
1472 {
1473 case IP_DEST_HOST_UNREACHABLE:
1474 code = (code != ~0 ? code : ICMP_UNREACH_HOST);
1475 case IP_DEST_NET_UNREACHABLE:
1476 code = (code != ~0 ? code : ICMP_UNREACH_NET);
1477 case IP_DEST_PROT_UNREACHABLE:
1478 code = (code != ~0 ? code : ICMP_UNREACH_PROTOCOL);
1479 /* UNREACH error inject here */
1480 case IP_DEST_PORT_UNREACHABLE:
1481 code = (code != ~0 ? code : ICMP_UNREACH_PORT);
1482 icmp_error(pData, so->so_m, ICMP_UNREACH, code, 0, "Error occurred!!!");
1483 so->so_m = NULL;
1484 break;
1485 case IP_SUCCESS: /* echo replied */
1486 out_len = ETH_HLEN + sizeof(struct ip) + 8;
1487 size;
1488 size = MCLBYTES;
1489 if (out_len < MSIZE)
1490 size = MCLBYTES;
1491 else if (out_len < MCLBYTES)
1492 size = MCLBYTES;
1493 else if (out_len < MJUM9BYTES)
1494 size = MJUM9BYTES;
1495 else if (out_len < MJUM16BYTES)
1496 size = MJUM16BYTES;
1497 else
1498 AssertMsgFailed(("Unsupported size"));
1499
1500 m = m_getjcl(pData, M_NOWAIT, MT_HEADER, M_PKTHDR, size);
1501 LogFunc(("m_getjcl returns m: %p\n", m));
1502 if (m == NULL)
1503 return;
1504 m->m_len = 0;
1505 m->m_data += if_maxlinkhdr;
1506 m->m_pkthdr.header = mtod(m, void *);
1507
1508 ip = mtod(m, struct ip *);
1509 ip->ip_src.s_addr = icr[i].Address;
1510 ip->ip_p = IPPROTO_ICMP;
1511 ip->ip_dst.s_addr = so->so_laddr.s_addr; /*XXX: still the hack*/
1512 ip->ip_hl = sizeof(struct ip) >> 2; /* requiered for icmp_reflect, no IP options */
1513 ip->ip_ttl = icr[i].Options.Ttl;
1514
1515 icp = (struct icmp *)&ip[1]; /* no options */
1516 icp->icmp_type = ICMP_ECHOREPLY;
1517 icp->icmp_code = 0;
1518 icp->icmp_id = so->so_icmp_id;
1519 icp->icmp_seq = so->so_icmp_seq;
1520
1521 icm = icmp_find_original_mbuf(pData, ip);
1522 if (icm)
1523 {
1524 /* on this branch we don't need stored variant */
1525 m_freem(pData, icm->im_m);
1526 LIST_REMOVE(icm, im_list);
1527 pData->cIcmpCacheSize--;
1528 RTMemFree(icm);
1529 }
1530
1531
1532 hlen = (ip->ip_hl << 2);
1533 Assert((hlen >= sizeof(struct ip)));
1534
1535 m->m_data += hlen + ICMP_MINLEN;
1536 if (!RT_VALID_PTR(icr[i].Data))
1537 {
1538 m_freem(pData, m);
1539 break;
1540 }
1541 m_copyback(pData, m, 0, icr[i].DataSize, icr[i].Data);
1542 m->m_data -= hlen + ICMP_MINLEN;
1543 m->m_len += hlen + ICMP_MINLEN;
1544
1545
1546 ip->ip_len = m_length(m, NULL);
1547 Assert((ip->ip_len == hlen + ICMP_MINLEN + icr[i].DataSize));
1548
1549 icmp_reflect(pData, m);
1550 break;
1551 case IP_TTL_EXPIRED_TRANSIT: /* TTL expired */
1552
1553 ip_broken = icr[i].Data;
1554 icm = icmp_find_original_mbuf(pData, ip_broken);
1555 if (icm == NULL) {
1556 Log(("ICMP: can't find original package (first double word %x)\n", *(uint32_t *)ip_broken));
1557 return;
1558 }
1559 m = icm->im_m;
1560 ip = mtod(m, struct ip *);
1561 Assert(((ip_broken->ip_hl >> 2) >= sizeof(struct ip)));
1562 ip->ip_ttl = icr[i].Options.Ttl;
1563 src = ip->ip_src.s_addr;
1564 ip->ip_dst.s_addr = src;
1565 ip->ip_dst.s_addr = icr[i].Address;
1566
1567 hlen = (ip->ip_hl << 2);
1568 icp = (struct icmp *)((char *)ip + hlen);
1569 ip_broken->ip_src.s_addr = src; /*it packet sent from host not from guest*/
1570
1571 m->m_len = (ip_broken->ip_hl << 2) + 64;
1572 m->m_pkthdr.header = mtod(m, void *);
1573 m_copyback(pData, m, ip->ip_hl >> 2, icr[i].DataSize, icr[i].Data);
1574 icmp_reflect(pData, m);
1575 /* Here is different situation from Unix world, where we can receive icmp in response on TCP/UDP */
1576 LIST_REMOVE(icm, im_list);
1577 pData->cIcmpCacheSize--;
1578 RTMemFree(icm);
1579 break;
1580 default:
1581 Log(("ICMP(default): message with Status: %x was received from %x\n", icr[i].Status, icr[i].Address));
1582 break;
1583 }
1584 }
1585}
1586#else /* !RT_OS_WINDOWS */
1587static void sorecvfrom_icmp_unix(PNATState pData, struct socket *so)
1588{
1589 struct sockaddr_in addr;
1590 socklen_t addrlen = sizeof(struct sockaddr_in);
1591 struct ip ip;
1592 char *buff;
1593 int len = 0;
1594
1595 /* 1- step: read the ip header */
1596 len = recvfrom(so->s, &ip, sizeof(struct ip), MSG_PEEK,
1597 (struct sockaddr *)&addr, &addrlen);
1598 if ( len < 0
1599 && ( soIgnorableErrorCode(errno)
1600 || errno == ENOTCONN))
1601 {
1602 Log(("sorecvfrom_icmp_unix: 1 - step can't read IP datagramm (would block)\n"));
1603 return;
1604 }
1605
1606 if ( len < sizeof(struct ip)
1607 || len < 0
1608 || len == 0)
1609 {
1610 u_char code;
1611 code = ICMP_UNREACH_PORT;
1612
1613 if (errno == EHOSTUNREACH)
1614 code = ICMP_UNREACH_HOST;
1615 else if (errno == ENETUNREACH)
1616 code = ICMP_UNREACH_NET;
1617
1618 LogRel((" udp icmp rx errno = %d (%s)\n", errno, strerror(errno)));
1619 icmp_error(pData, so->so_m, ICMP_UNREACH, code, 0, strerror(errno));
1620 so->so_m = NULL;
1621 Log(("sorecvfrom_icmp_unix: 1 - step can't read IP datagramm\n"));
1622 return;
1623 }
1624 /* basic check of IP header */
1625 if ( ip.ip_v != IPVERSION
1626# ifndef RT_OS_DARWIN
1627 || ip.ip_p != IPPROTO_ICMP
1628# endif
1629 )
1630 {
1631 Log(("sorecvfrom_icmp_unix: 1 - step IP isn't IPv4\n"));
1632 return;
1633 }
1634# ifndef RT_OS_DARWIN
1635 /* Darwin reports the IP length already in host byte order. */
1636 ip.ip_len = RT_N2H_U16(ip.ip_len);
1637# endif
1638# if defined(RT_OS_SOLARIS) || defined(RT_OS_DARWIN)
1639 /* Solaris and Darwin report the payload only */
1640 ip.ip_len += (ip.ip_hl << 2);
1641# endif
1642 /* Note: ip->ip_len in host byte order (all OS) */
1643 len = ip.ip_len;
1644 buff = RTMemAlloc(len);
1645 if (buff == NULL)
1646 {
1647 Log(("sorecvfrom_icmp_unix: 1 - step can't allocate enought room for datagram\n"));
1648 return;
1649 }
1650 /* 2 - step: we're reading rest of the datagramm to the buffer */
1651 addrlen = sizeof(struct sockaddr_in);
1652 memset(&addr, 0, addrlen);
1653 len = recvfrom(so->s, buff, len, 0,
1654 (struct sockaddr *)&addr, &addrlen);
1655 if ( len < 0
1656 && ( soIgnorableErrorCode(errno)
1657 || errno == ENOTCONN))
1658 {
1659 Log(("sorecvfrom_icmp_unix: 2 - step can't read IP body (would block expected:%d)\n",
1660 ip.ip_len));
1661 RTMemFree(buff);
1662 return;
1663 }
1664 if ( len < 0
1665 || len == 0)
1666 {
1667 Log(("sorecvfrom_icmp_unix: 2 - step read of the rest of datagramm is fallen (errno:%d, len:%d expected: %d)\n",
1668 errno, len, (ip.ip_len - sizeof(struct ip))));
1669 RTMemFree(buff);
1670 return;
1671 }
1672 /* len is modified in 2nd read, when the rest of the datagramm was read */
1673 send_icmp_to_guest(pData, buff, len, &addr);
1674 RTMemFree(buff);
1675}
1676#endif /* !RT_OS_WINDOWS */
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette