VirtualBox

source: vbox/trunk/src/VBox/Devices/Network/slirp/socket.c@ 41228

Last change on this file since 41228 was 41227, checked in by vboxsync, 13 years ago

NAT: don't touch anything, except fShouldBeRemoved, when sofree called with fUnderPolling.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 50.7 KB
1/* $Id: socket.c 41227 2012-05-10 06:10:17Z vboxsync $ */
2/** @file
3 * NAT - socket handling.
4 */
5
6/*
7 * Copyright (C) 2006-2010 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 */
17
18/*
19 * This code is based on:
20 *
21 * Copyright (c) 1995 Danny Gasparovski.
22 *
23 * Please read the file COPYRIGHT for the
24 * terms and conditions of the copyright.
25 */
26
27#include <slirp.h>
28#include "ip_icmp.h"
29#include "main.h"
30#ifdef __sun__
31#include <sys/filio.h>
32#endif
33#include <VBox/vmm/pdmdrv.h>
34#if defined (RT_OS_WINDOWS)
35#include <iphlpapi.h>
36#include <icmpapi.h>
37#endif
38
39#ifdef VBOX_WITH_NAT_UDP_SOCKET_CLONE
40/**
41 * Clone the UDP socket pSo for the foreign address u32ForeignAddr; if fBindSocket is true a new host socket is attached, otherwise the clone shares pSo's host socket.
42 */
43struct socket * soCloneUDPSocketWithForegnAddr(PNATState pData, bool fBindSocket, struct socket *pSo, uint32_t u32ForeignAddr)
44{
45 struct socket *pNewSocket = NULL;
46 LogFlowFunc(("Enter: fBindSocket:%RTbool, so:%R[natsock], u32ForeignAddr:%RTnaipv4\n", fBindSocket, pSo, u32ForeignAddr));
47 pNewSocket = socreate();
48 if (!pNewSocket)
49 {
50 LogFunc(("Can't create socket\n"));
51 LogFlowFunc(("Leave: NULL\n"));
52 return NULL;
53 }
54 if (fBindSocket)
55 {
56 if (udp_attach(pData, pNewSocket, 0) <= 0)
57 {
58 sofree(pData, pNewSocket);
59 LogFunc(("Can't attach fresh created socket\n"));
60 return NULL;
61 }
62 }
63 else
64 {
65 pNewSocket->so_cloneOf = (struct socket *)pSo;
66 pNewSocket->s = pSo->s;
67 insque(pData, pNewSocket, &udb);
68 }
69 pNewSocket->so_laddr = pSo->so_laddr;
70 pNewSocket->so_lport = pSo->so_lport;
71 pNewSocket->so_faddr.s_addr = u32ForeignAddr;
72 pNewSocket->so_fport = pSo->so_fport;
73 pSo->so_cCloneCounter++;
74 LogFlowFunc(("Leave: %R[natsock]\n", pNewSocket));
75 return pNewSocket;
76}
77
78struct socket *soLookUpClonedUDPSocket(PNATState pData, const struct socket *pcSo, uint32_t u32ForeignAddress)
79{
80 struct socket *pSoClone = NULL;
81 LogFlowFunc(("Enter: pcSo:%R[natsock], u32ForeignAddress:%RTnaipv4\n", pcSo, u32ForeignAddress));
82 for (pSoClone = udb.so_next; pSoClone != &udb; pSoClone = pSoClone->so_next)
83 {
84 if ( pSoClone->so_cloneOf
85 && pSoClone->so_cloneOf == pcSo
86 && pSoClone->so_lport == pcSo->so_lport
87 && pSoClone->so_fport == pcSo->so_fport
88 && pSoClone->so_laddr.s_addr == pcSo->so_laddr.s_addr
89 && pSoClone->so_faddr.s_addr == u32ForeignAddress)
90 goto done;
91 }
92 pSoClone = NULL;
93done:
94 LogFlowFunc(("Leave: pSoClone: %R[natsock]\n", pSoClone));
95 return pSoClone;
96}
97#endif
98
99#ifdef VBOX_WITH_NAT_SEND2HOME
100DECLINLINE(bool) slirpSend2Home(PNATState pData, struct socket *pSo, const void *pvBuf, uint32_t cbBuf, int iFlags)
101{
102 int idxAddr;
103 int ret = 0;
104 bool fSendDone = false;
105 LogFlowFunc(("Enter pSo:%R[natsock] pvBuf: %p, cbBuf: %d, iFlags: %d\n", pSo, pvBuf, cbBuf, iFlags));
106 for (idxAddr = 0; idxAddr < pData->cInHomeAddressSize; ++idxAddr)
107 {
108
109 struct socket *pNewSocket = soCloneUDPSocketWithForegnAddr(pData, pSo, pData->pInSockAddrHomeAddress[idxAddr].sin_addr);
110 AssertReturn((pNewSocket, false));
111 pData->pInSockAddrHomeAddress[idxAddr].sin_port = pSo->so_fport;
112 /* @todo be more verbose on errors,
113 * @note we shouldn't care whether this send fails or not (we're broadcasting).
114 */
115 LogFunc(("send %d bytes to %RTnaipv4 from %R[natsock]\n", cbBuf, pData->pInSockAddrHomeAddress[idxAddr].sin_addr.s_addr, pNewSocket));
116 ret = sendto(pNewSocket->s, pvBuf, cbBuf, iFlags, (struct sockaddr *)&pData->pInSockAddrHomeAddress[idxAddr], sizeof(struct sockaddr_in));
117 if (ret < 0)
118 LogFunc(("Failed to send %d bytes to %RTnaipv4\n", cbBuf, pData->pInSockAddrHomeAddress[idxAddr].sin_addr.s_addr));
119 fSendDone |= ret > 0;
120 }
121 LogFlowFunc(("Leave %RTbool\n", fSendDone));
122 return fSendDone;
123}
124#endif /* !VBOX_WITH_NAT_SEND2HOME */
125static void send_icmp_to_guest(PNATState, char *, size_t, const struct sockaddr_in *);
126#ifdef RT_OS_WINDOWS
127static void sorecvfrom_icmp_win(PNATState, struct socket *);
128#else /* RT_OS_WINDOWS */
129static void sorecvfrom_icmp_unix(PNATState, struct socket *);
130#endif /* !RT_OS_WINDOWS */
131
132void
133so_init()
134{
135}
136
137struct socket *
138solookup(struct socket *head, struct in_addr laddr,
139 u_int lport, struct in_addr faddr, u_int fport)
140{
141 struct socket *so;
142
143 for (so = head->so_next; so != head; so = so->so_next)
144 {
145 if ( so->so_lport == lport
146 && so->so_laddr.s_addr == laddr.s_addr
147 && so->so_faddr.s_addr == faddr.s_addr
148 && so->so_fport == fport)
149 return so;
150 }
151
152 return (struct socket *)NULL;
153}
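#if 0
/*
 * Illustrative sketch only (not compiled): looking up the connection for a
 * segment received from the guest, where laddr/lport is the guest side and
 * faddr/fport the remote peer.  'ti' (the parsed TCP/IP header) is an
 * assumption made for this example; the real caller is tcp_input().
 */
struct socket *so = solookup(&tcb, ti->ti_src, ti->ti_sport, ti->ti_dst, ti->ti_dport);
if (so == NULL)
    Log(("NAT: no matching connection; tcp_input() would create one for a SYN or drop the segment\n"));
#endif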
154
155/*
156 * Create a new socket, initialise the fields
157 * It is the responsibility of the caller to
158 * insque() it into the correct linked-list
159 */
160struct socket *
161socreate()
162{
163 struct socket *so;
164
165 so = (struct socket *)RTMemAllocZ(sizeof(struct socket));
166 if (so)
167 {
168 so->so_state = SS_NOFDREF;
169 so->s = -1;
170#if !defined(RT_OS_WINDOWS)
171 so->so_poll_index = -1;
172#endif
173 }
174 return so;
175}
176
177/*
178 * remque and free a socket, clobber cache
179 * VBOX_WITH_SLIRP_MT: the queue must be locked before calling sofree, because
180 * inside sofree we don't know which queue the item is being removed from.
181 */
182void
183sofree(PNATState pData, struct socket *so)
184{
185 LogFlowFunc(("ENTER:%R[natsock]\n", so));
186 /*
187 * We should not remove the socket while the polling routine is polling it;
188 * instead we mark it for deletion.
189 */
190 if (so->fUnderPolling)
191 {
192 so->fShouldBeRemoved = 1;
193 LogFlowFunc(("LEAVE:%R[natsock] postponed deletion\n", so));
194 return;
195 }
196 if (so == tcp_last_so)
197 tcp_last_so = &tcb;
198 else if (so == udp_last_so)
199 udp_last_so = &udb;
200
201#ifndef VBOX_WITH_SLIRP_MT
202
203 /* libalias notification */
204 if (so->so_pvLnk)
205 slirpDeleteLinkSocket(so->so_pvLnk);
206 /* check that the mbuf hasn't already been freed */
207 if (so->so_m != NULL)
208 {
209 m_freem(pData, so->so_m);
210 so->so_m = NULL;
211 }
212
213 if (so->so_next && so->so_prev)
214 {
215 remque(pData, so); /* crashes if so is not in a queue */
216 NSOCK_DEC();
217 }
218
219 RTMemFree(so);
220#else
221 so->so_deleted = 1;
222#endif
223 LogFlowFuncLeave();
224}
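#if 0
/*
 * Illustrative sketch only (not compiled): how the polling code is expected to
 * cooperate with the deferred deletion above.  While fUnderPolling is set,
 * sofree() merely sets fShouldBeRemoved; whoever did the polling performs the
 * actual free afterwards.  The loop below is a simplification of the real one
 * in slirp.c.
 */
struct socket *so, *so_next;
for (so = tcb.so_next; so != &tcb; so = so_next)
{
    so_next = so->so_next;
    so->fUnderPolling = 1;
    /* ... process pending read/write events; handlers may call sofree(pData, so) ... */
    so->fUnderPolling = 0;
    if (so->fShouldBeRemoved)
        sofree(pData, so); /* really frees now, since fUnderPolling is clear */
}
#endif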
225
226#ifdef VBOX_WITH_SLIRP_MT
227void
228soread_queue(PNATState pData, struct socket *so, int *ret)
229{
230 *ret = soread(pData, so);
231}
232#endif
233
234/*
235 * Read from so's socket into sb_snd, updating all relevant sbuf fields
236 * NOTE: This will only be called if it is select()ed for reading, so
237 * a read() of 0 (or less) means it's disconnected
238 */
239#ifndef VBOX_WITH_SLIRP_BSD_SBUF
240int
241soread(PNATState pData, struct socket *so)
242{
243 int n, nn, lss, total;
244 struct sbuf *sb = &so->so_snd;
245 size_t len = sb->sb_datalen - sb->sb_cc;
246 struct iovec iov[2];
247 int mss = so->so_tcpcb->t_maxseg;
248
249 STAM_PROFILE_START(&pData->StatIOread, a);
250 STAM_COUNTER_RESET(&pData->StatIORead_in_1);
251 STAM_COUNTER_RESET(&pData->StatIORead_in_2);
252
253 QSOCKET_LOCK(tcb);
254 SOCKET_LOCK(so);
255 QSOCKET_UNLOCK(tcb);
256
257 LogFlow(("soread: so = %R[natsock]\n", so));
258 Log2(("%s: so = %R[natsock] so->so_snd = %R[sbuf]\n", __PRETTY_FUNCTION__, so, sb));
259
260 /*
261 * No need to check if there's enough room to read.
262 * soread wouldn't have been called if there weren't
263 */
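 /*
  * Worked example of the iovec split below (hypothetical numbers): with
  * sb_datalen = 16, sb_rptr at offset 10 and sb_wptr at offset 14, the free
  * space wraps around the end of the ring buffer, so iov[0] covers offsets
  * 14..15 and iov[1] covers offsets 0..9; when the total exceeds t_maxseg it
  * is trimmed down to a multiple of the segment size before reading.
  */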
264
265 len = sb->sb_datalen - sb->sb_cc;
266
267 iov[0].iov_base = sb->sb_wptr;
268 iov[1].iov_base = 0;
269 iov[1].iov_len = 0;
270 if (sb->sb_wptr < sb->sb_rptr)
271 {
272 iov[0].iov_len = sb->sb_rptr - sb->sb_wptr;
273 /* Should never succeed, but... */
274 if (iov[0].iov_len > len)
275 iov[0].iov_len = len;
276 if (iov[0].iov_len > mss)
277 iov[0].iov_len -= iov[0].iov_len%mss;
278 n = 1;
279 }
280 else
281 {
282 iov[0].iov_len = (sb->sb_data + sb->sb_datalen) - sb->sb_wptr;
283 /* Should never succeed, but... */
284 if (iov[0].iov_len > len)
285 iov[0].iov_len = len;
286 len -= iov[0].iov_len;
287 if (len)
288 {
289 iov[1].iov_base = sb->sb_data;
290 iov[1].iov_len = sb->sb_rptr - sb->sb_data;
291 if (iov[1].iov_len > len)
292 iov[1].iov_len = len;
293 total = iov[0].iov_len + iov[1].iov_len;
294 if (total > mss)
295 {
296 lss = total % mss;
297 if (iov[1].iov_len > lss)
298 {
299 iov[1].iov_len -= lss;
300 n = 2;
301 }
302 else
303 {
304 lss -= iov[1].iov_len;
305 iov[0].iov_len -= lss;
306 n = 1;
307 }
308 }
309 else
310 n = 2;
311 }
312 else
313 {
314 if (iov[0].iov_len > mss)
315 iov[0].iov_len -= iov[0].iov_len%mss;
316 n = 1;
317 }
318 }
319
320#ifdef HAVE_READV
321 nn = readv(so->s, (struct iovec *)iov, n);
322#else
323 nn = recv(so->s, iov[0].iov_base, iov[0].iov_len, (so->so_tcpcb->t_force? MSG_OOB:0));
324#endif
325 Log2(("%s: read(1) nn = %d bytes\n", __PRETTY_FUNCTION__, nn));
326 Log2(("%s: so = %R[natsock] so->so_snd = %R[sbuf]\n", __PRETTY_FUNCTION__, so, sb));
327 if (nn <= 0)
328 {
329 /*
330 * Special case for WSAEnumNetworkEvents: If we receive 0 bytes that
331 * _could_ mean that the connection is closed. But we will receive an
332 * FD_CLOSE event later if the connection was _really_ closed. With
333 * www.youtube.com I see this very often. Closing the socket too early
334 * would be dangerous.
335 */
336 int status;
337 unsigned long pending = 0;
338 status = ioctlsocket(so->s, FIONREAD, &pending);
339 if (status < 0)
340 Log(("NAT:%s: error in WSAIoctl: %d\n", __PRETTY_FUNCTION__, errno));
341 if (nn == 0 && (pending != 0))
342 {
343 SOCKET_UNLOCK(so);
344 STAM_PROFILE_STOP(&pData->StatIOread, a);
345 return 0;
346 }
347 if ( nn < 0
348 && ( errno == EINTR
349 || errno == EAGAIN
350 || errno == EWOULDBLOCK))
351 {
352 SOCKET_UNLOCK(so);
353 STAM_PROFILE_STOP(&pData->StatIOread, a);
354 return 0;
355 }
356 else
357 {
358 /* nn == 0 means peer has performed an orderly shutdown */
359 Log2(("%s: disconnected, nn = %d, errno = %d (%s)\n",
360 __PRETTY_FUNCTION__, nn, errno, strerror(errno)));
361 sofcantrcvmore(so);
362 tcp_sockclosed(pData, sototcpcb(so));
363 SOCKET_UNLOCK(so);
364 STAM_PROFILE_STOP(&pData->StatIOread, a);
365 return -1;
366 }
367 }
368 STAM_STATS(
369 if (n == 1)
370 {
371 STAM_COUNTER_INC(&pData->StatIORead_in_1);
372 STAM_COUNTER_ADD(&pData->StatIORead_in_1_bytes, nn);
373 }
374 else
375 {
376 STAM_COUNTER_INC(&pData->StatIORead_in_2);
377 STAM_COUNTER_ADD(&pData->StatIORead_in_2_1st_bytes, nn);
378 }
379 );
380
381#ifndef HAVE_READV
382 /*
383 * If there was no error, try to read a second time.
384 * We read again if n == 2 (i.e. there's another part of the buffer)
385 * and we read as much as we could in the first read.
386 * We don't test for <= 0 this time, because there legitimately
387 * might not be any more data (since the socket is non-blocking);
388 * a close will be detected on the next iteration.
389 * A return of -1 won't (shouldn't) happen, since it didn't happen above.
390 */
391 if (n == 2 && nn == iov[0].iov_len)
392 {
393 int ret;
394 ret = recv(so->s, iov[1].iov_base, iov[1].iov_len, 0);
395 if (ret > 0)
396 nn += ret;
397 STAM_STATS(
398 if (ret > 0)
399 {
400 STAM_COUNTER_INC(&pData->StatIORead_in_2);
401 STAM_COUNTER_ADD(&pData->StatIORead_in_2_2nd_bytes, ret);
402 }
403 );
404 }
405
406 Log2(("%s: read(2) nn = %d bytes\n", __PRETTY_FUNCTION__, nn));
407#endif
408
409 /* Update fields */
410 sb->sb_cc += nn;
411 sb->sb_wptr += nn;
412 Log2(("%s: update so_snd (readed nn = %d) %R[sbuf]\n", __PRETTY_FUNCTION__, nn, sb));
413 if (sb->sb_wptr >= (sb->sb_data + sb->sb_datalen))
414 {
415 sb->sb_wptr -= sb->sb_datalen;
416 Log2(("%s: alter sb_wptr so_snd = %R[sbuf]\n", __PRETTY_FUNCTION__, sb));
417 }
418 STAM_PROFILE_STOP(&pData->StatIOread, a);
419 SOCKET_UNLOCK(so);
420 return nn;
421}
422#else /* VBOX_WITH_SLIRP_BSD_SBUF */
423int
424soread(PNATState pData, struct socket *so)
425{
426 int n;
427 char *buf;
428 struct sbuf *sb = &so->so_snd;
429 size_t len = sbspace(sb);
430 int mss = so->so_tcpcb->t_maxseg;
431
432 STAM_PROFILE_START(&pData->StatIOread, a);
433 STAM_COUNTER_RESET(&pData->StatIORead_in_1);
434 STAM_COUNTER_RESET(&pData->StatIORead_in_2);
435
436 QSOCKET_LOCK(tcb);
437 SOCKET_LOCK(so);
438 QSOCKET_UNLOCK(tcb);
439
440 LogFlowFunc(("soread: so = %lx\n", (long)so));
441
442 if (len > mss)
443 len -= len % mss;
444 buf = RTMemAlloc(len);
445 if (buf == NULL)
446 {
447 Log(("NAT: can't alloc enough memory\n"));
448 SOCKET_UNLOCK(so); STAM_PROFILE_STOP(&pData->StatIOread, a); return -1; /* keep lock/stat balance with the other exit paths */
449 }
450
451 n = recv(so->s, buf, len, (so->so_tcpcb->t_force? MSG_OOB:0));
452 if (n <= 0)
453 {
454 /*
455 * Special case for WSAEnumNetworkEvents: If we receive 0 bytes that
456 * _could_ mean that the connection is closed. But we will receive an
457 * FD_CLOSE event later if the connection was _really_ closed. With
458 * www.youtube.com I see this very often. Closing the socket too early
459 * would be dangerous.
460 */
461 int status;
462 unsigned long pending = 0;
463 status = ioctlsocket(so->s, FIONREAD, &pending);
464 if (status < 0)
465 Log(("NAT:error in WSAIoctl: %d\n", errno));
466 if (n == 0 && (pending != 0))
467 {
468 SOCKET_UNLOCK(so);
469 STAM_PROFILE_STOP(&pData->StatIOread, a);
470 RTMemFree(buf);
471 return 0;
472 }
473 if ( n < 0
474 && ( errno == EINTR
475 || errno == EAGAIN
476 || errno == EWOULDBLOCK))
477 {
478 SOCKET_UNLOCK(so);
479 STAM_PROFILE_STOP(&pData->StatIOread, a);
480 RTMemFree(buf);
481 return 0;
482 }
483 else
484 {
485 Log2((" --- soread() disconnected, n = %d, errno = %d (%s)\n",
486 n, errno, strerror(errno)));
487 sofcantrcvmore(so);
488 tcp_sockclosed(pData, sototcpcb(so));
489 SOCKET_UNLOCK(so);
490 STAM_PROFILE_STOP(&pData->StatIOread, a);
491 RTMemFree(buf);
492 return -1;
493 }
494 }
495
496 sbuf_bcat(sb, buf, n);
497 RTMemFree(buf);
498 SOCKET_UNLOCK(so); STAM_PROFILE_STOP(&pData->StatIOread, a); return n; /* keep lock/stat balance with the error paths above */
499}
500#endif
501
502/*
503 * Get urgent data
504 *
505 * When the socket is created, we set it SO_OOBINLINE,
506 * so when OOB data arrives, we soread() it and everything
507 * in the send buffer is sent as urgent data
508 */
509void
510sorecvoob(PNATState pData, struct socket *so)
511{
512 struct tcpcb *tp = sototcpcb(so);
513 ssize_t ret;
514
515 LogFlowFunc(("sorecvoob: so = %R[natsock]\n", so));
516
517 /*
518 * We take a guess at how much urgent data has arrived.
519 * In most situations, when urgent data arrives, the next
520 * read() should get all the urgent data. This guess will
521 * be wrong however if more data arrives just after the
522 * urgent data, or the read() doesn't return all the
523 * urgent data.
524 */
525 ret = soread(pData, so);
526 if (RT_LIKELY(ret > 0))
527 {
528 tp->snd_up = tp->snd_una + SBUF_LEN(&so->so_snd);
529 tp->t_force = 1;
530 tcp_output(pData, tp);
531 tp->t_force = 0;
532 }
533}
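#if 0
/*
 * Illustrative sketch only (not compiled): the OOB handling above relies on
 * the host socket having been created with SO_OOBINLINE (done on the listen/
 * connect paths, see e.g. solisten() further down), along these lines:
 */
int opt = 1;
setsockopt(so->s, SOL_SOCKET, SO_OOBINLINE, (char *)&opt, sizeof(int));
#endif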
534#ifndef VBOX_WITH_SLIRP_BSD_SBUF
535/*
536 * Send urgent data
537 * There's a lot of duplicated code here, but...
538 */
539int
540sosendoob(struct socket *so)
541{
542 struct sbuf *sb = &so->so_rcv;
543 char buff[2048]; /* XXX Shouldn't be sending more oob data than this */
544
545 int n, len;
546
547 LogFlowFunc(("sosendoob so = %R[natsock]\n", so));
548
549 if (so->so_urgc > sizeof(buff))
550 so->so_urgc = sizeof(buff); /* XXX */
551
552 if (sb->sb_rptr < sb->sb_wptr)
553 {
554 /* We can send it directly */
555 n = send(so->s, sb->sb_rptr, so->so_urgc, (MSG_OOB)); /* |MSG_DONTWAIT)); */
556 so->so_urgc -= n;
557
558 Log2((" --- sent %d bytes urgent data, %d urgent bytes left\n",
559 n, so->so_urgc));
560 }
561 else
562 {
563 /*
564 * Since there's no sendv or sendtov like writev,
565 * we must copy all data to a linear buffer then
566 * send it all
567 */
568 len = (sb->sb_data + sb->sb_datalen) - sb->sb_rptr;
569 if (len > so->so_urgc)
570 len = so->so_urgc;
571 memcpy(buff, sb->sb_rptr, len);
572 so->so_urgc -= len;
573 if (so->so_urgc)
574 {
575 n = sb->sb_wptr - sb->sb_data;
576 if (n > so->so_urgc)
577 n = so->so_urgc;
578 memcpy(buff + len, sb->sb_data, n);
579 so->so_urgc -= n;
580 len += n;
581 }
582 n = send(so->s, buff, len, (MSG_OOB)); /* |MSG_DONTWAIT)); */
583#ifdef DEBUG
584 if (n != len)
585 Log(("Didn't send all data urgently XXXXX\n"));
586#endif
587 Log2((" ---2 sent %d bytes urgent data, %d urgent bytes left\n",
588 n, so->so_urgc));
589 }
590
591 sb->sb_cc -= n;
592 sb->sb_rptr += n;
593 if (sb->sb_rptr >= (sb->sb_data + sb->sb_datalen))
594 sb->sb_rptr -= sb->sb_datalen;
595
596 return n;
597}
598
599/*
600 * Write data from so_rcv to so's socket,
601 * updating all sbuf fields as necessary
602 */
603int
604sowrite(PNATState pData, struct socket *so)
605{
606 int n, nn;
607 struct sbuf *sb = &so->so_rcv;
608 size_t len = sb->sb_cc;
609 struct iovec iov[2];
610
611 STAM_PROFILE_START(&pData->StatIOwrite, a);
612 STAM_COUNTER_RESET(&pData->StatIOWrite_in_1);
613 STAM_COUNTER_RESET(&pData->StatIOWrite_in_1_bytes);
614 STAM_COUNTER_RESET(&pData->StatIOWrite_in_2);
615 STAM_COUNTER_RESET(&pData->StatIOWrite_in_2_1st_bytes);
616 STAM_COUNTER_RESET(&pData->StatIOWrite_in_2_2nd_bytes);
617 STAM_COUNTER_RESET(&pData->StatIOWrite_no_w);
618 STAM_COUNTER_RESET(&pData->StatIOWrite_rest);
619 STAM_COUNTER_RESET(&pData->StatIOWrite_rest_bytes);
620 LogFlowFunc(("so = %R[natsock]\n", so));
621 Log2(("%s: so = %R[natsock] so->so_rcv = %R[sbuf]\n", __PRETTY_FUNCTION__, so, sb));
622 QSOCKET_LOCK(tcb);
623 SOCKET_LOCK(so);
624 QSOCKET_UNLOCK(tcb);
625 if (so->so_urgc)
626 {
627 sosendoob(so);
628 if (sb->sb_cc == 0)
629 {
630 SOCKET_UNLOCK(so);
631 STAM_PROFILE_STOP(&pData->StatIOwrite, a);
632 return 0;
633 }
634 }
635
636 /*
637 * No need to check if there's something to write,
638 * sowrite wouldn't have been called otherwise
639 */
640
641 len = sb->sb_cc;
642
643 iov[0].iov_base = sb->sb_rptr;
644 iov[1].iov_base = 0;
645 iov[1].iov_len = 0;
646 if (sb->sb_rptr < sb->sb_wptr)
647 {
648 iov[0].iov_len = sb->sb_wptr - sb->sb_rptr;
649 /* Should never succeed, but... */
650 if (iov[0].iov_len > len)
651 iov[0].iov_len = len;
652 n = 1;
653 }
654 else
655 {
656 iov[0].iov_len = (sb->sb_data + sb->sb_datalen) - sb->sb_rptr;
657 if (iov[0].iov_len > len)
658 iov[0].iov_len = len;
659 len -= iov[0].iov_len;
660 if (len)
661 {
662 iov[1].iov_base = sb->sb_data;
663 iov[1].iov_len = sb->sb_wptr - sb->sb_data;
664 if (iov[1].iov_len > len)
665 iov[1].iov_len = len;
666 n = 2;
667 }
668 else
669 n = 1;
670 }
671 STAM_STATS({
672 if (n == 1)
673 {
674 STAM_COUNTER_INC(&pData->StatIOWrite_in_1);
675 STAM_COUNTER_ADD(&pData->StatIOWrite_in_1_bytes, iov[0].iov_len);
676 }
677 else
678 {
679 STAM_COUNTER_INC(&pData->StatIOWrite_in_2);
680 STAM_COUNTER_ADD(&pData->StatIOWrite_in_2_1st_bytes, iov[0].iov_len);
681 STAM_COUNTER_ADD(&pData->StatIOWrite_in_2_2nd_bytes, iov[1].iov_len);
682 }
683 });
684 /* Check if there's urgent data to send, and if so, send it */
685#ifdef HAVE_READV
686 nn = writev(so->s, (const struct iovec *)iov, n);
687#else
688 nn = send(so->s, iov[0].iov_base, iov[0].iov_len, 0);
689#endif
690 Log2(("%s: wrote(1) nn = %d bytes\n", __PRETTY_FUNCTION__, nn));
691 /* This should never happen, but people tell me it does *shrug* */
692 if ( nn < 0
693 && ( errno == EAGAIN
694 || errno == EINTR
695 || errno == EWOULDBLOCK))
696 {
697 SOCKET_UNLOCK(so);
698 STAM_PROFILE_STOP(&pData->StatIOwrite, a);
699 return 0;
700 }
701
702 if (nn < 0 || (nn == 0 && iov[0].iov_len > 0))
703 {
704 Log2(("%s: disconnected, so->so_state = %x, errno = %d\n",
705 __PRETTY_FUNCTION__, so->so_state, errno));
706 sofcantsendmore(so);
707 tcp_sockclosed(pData, sototcpcb(so));
708 SOCKET_UNLOCK(so);
709 STAM_PROFILE_STOP(&pData->StatIOwrite, a);
710 return -1;
711 }
712
713#ifndef HAVE_READV
714 if (n == 2 && nn == iov[0].iov_len)
715 {
716 int ret;
717 ret = send(so->s, iov[1].iov_base, iov[1].iov_len, 0);
718 if (ret > 0)
719 nn += ret;
720 STAM_STATS({
721 if (ret > 0 && ret != iov[1].iov_len)
722 {
723 STAM_COUNTER_INC(&pData->StatIOWrite_rest);
724 STAM_COUNTER_ADD(&pData->StatIOWrite_rest_bytes, (iov[1].iov_len - ret));
725 }
726 });
727 }
728 Log2(("%s: wrote(2) nn = %d bytes\n", __PRETTY_FUNCTION__, nn));
729#endif
730
731 /* Update sbuf */
732 sb->sb_cc -= nn;
733 sb->sb_rptr += nn;
734 Log2(("%s: update so_rcv (written nn = %d) %R[sbuf]\n", __PRETTY_FUNCTION__, nn, sb));
735 if (sb->sb_rptr >= (sb->sb_data + sb->sb_datalen))
736 {
737 sb->sb_rptr -= sb->sb_datalen;
738 Log2(("%s: alter sb_rptr of so_rcv %R[sbuf]\n", __PRETTY_FUNCTION__, sb));
739 }
740
741 /*
742 * If in DRAIN mode, and there's no more data, set
743 * it CANTSENDMORE
744 */
745 if ((so->so_state & SS_FWDRAIN) && sb->sb_cc == 0)
746 sofcantsendmore(so);
747
748 SOCKET_UNLOCK(so);
749 STAM_PROFILE_STOP(&pData->StatIOwrite, a);
750 return nn;
751}
752#else /* VBOX_WITH_SLIRP_BSD_SBUF */
753static int
754do_sosend(struct socket *so, int fUrg)
755{
756 struct sbuf *sb = &so->so_rcv;
757
758 int n, len;
759
760 LogFlowFunc(("sosendoob: so = %R[natsock]\n", so));
761
762 len = sbuf_len(sb);
763
764 n = send(so->s, sbuf_data(sb), len, (fUrg ? MSG_OOB : 0));
765 if (n < 0)
766 Log(("NAT: Can't send sbuf via socket.\n"));
767 if (fUrg)
768 so->so_urgc -= n;
769 if (n > 0 && n < len)
770 {
771 char *ptr;
772 char *buff;
773 buff = RTMemAlloc(len);
774 if (buff == NULL)
775 {
776 Log(("NAT: No space to allocate temporal buffer\n"));
777 return -1;
778 }
779 ptr = sbuf_data(sb);
780 memcpy(buff, &ptr[n], len - n);
781 sbuf_bcpy(sb, buff, len - n);
782 RTMemFree(buff);
783 return n;
784 }
785 sbuf_clear(sb);
786 return n;
787}
788int
789sosendoob(struct socket *so)
790{
791 return do_sosend(so, 1);
792}
793
794/*
795 * Write data from so_rcv to so's socket,
796 * updating all sbuf fields as necessary
797 */
798int
799sowrite(PNATState pData, struct socket *so)
800{
801 return do_sosend(so, 0);
802}
803#endif
804
805/*
806 * recvfrom() a UDP socket
807 */
808void
809sorecvfrom(PNATState pData, struct socket *so)
810{
811 ssize_t ret = 0;
812 struct sockaddr_in addr;
813 socklen_t addrlen = sizeof(struct sockaddr_in);
814
815 LogFlowFunc(("sorecvfrom: so = %lx\n", (long)so));
816
817 if (so->so_type == IPPROTO_ICMP)
818 {
819 /* This is a "ping" reply */
820#ifdef RT_OS_WINDOWS
821 sorecvfrom_icmp_win(pData, so);
822#else /* RT_OS_WINDOWS */
823 sorecvfrom_icmp_unix(pData, so);
824#endif /* !RT_OS_WINDOWS */
825 udp_detach(pData, so);
826 }
827 else
828 {
829 /* A "normal" UDP packet */
830 struct mbuf *m;
831 ssize_t len;
832 u_long n = 0;
833 int rc = 0;
834 static int signalled = 0;
835 char *pchBuffer = NULL;
836 bool fWithTemporalBuffer = false;
837
838 QSOCKET_LOCK(udb);
839 SOCKET_LOCK(so);
840 QSOCKET_UNLOCK(udb);
841
842 /* How much data has been received? */
843 /*
844 * 1. calculate how much we can read
845 * 2. read as much as possible
846 * 3. attach buffer to allocated header mbuf
847 */
848 rc = ioctlsocket(so->s, FIONREAD, &n);
849 if (rc == -1)
850 {
851 if ( errno == EAGAIN
852 || errno == EWOULDBLOCK
853 || errno == EINPROGRESS
854 || errno == ENOTCONN)
855 return;
856 else if (signalled == 0)
857 {
858 LogRel(("NAT: can't fetch amount of bytes on socket %R[natsock], so message will be truncated.\n", so));
859 signalled = 1;
860 }
861 return;
862 }
863
864 len = sizeof(struct udpiphdr);
865 m = m_getjcl(pData, M_NOWAIT, MT_HEADER, M_PKTHDR, slirp_size(pData));
866 if (m == NULL)
867 return;
868
869 len += n;
870 m->m_data += ETH_HLEN;
871 m->m_pkthdr.header = mtod(m, void *);
872 m->m_data += sizeof(struct udpiphdr);
873
874 pchBuffer = mtod(m, char *);
875 fWithTemporalBuffer = false;
876 /*
877 * Even if the number of bytes pending on the socket is greater than the MTU,
878 * slirp is able to fragment it; a temporary buffer is only allocated below
879 * when the data doesn't fit into the mbuf.
880 */
881 if (n > (slirp_size(pData) - sizeof(struct udpiphdr)))
882 {
883 pchBuffer = RTMemAlloc((n) * sizeof(char));
884 if (!pchBuffer)
885 {
886 m_freem(pData, m);
887 return;
888 }
889 fWithTemporalBuffer = true;
890 }
891 ret = recvfrom(so->s, pchBuffer, n, 0,
892 (struct sockaddr *)&addr, &addrlen);
893 if (fWithTemporalBuffer)
894 {
895 if (ret > 0)
896 {
897 m_copyback(pData, m, 0, ret, pchBuffer);
898 /*
899 * If the comparison below is true, our size prediction failed;
900 * that's not fatal, we've just allocated for nothing. (@todo add a counter
901 * here to measure how rarely this happens)
902 */
903 if(ret < slirp_size(pData) && !m->m_next)
904 Log(("NAT:udp: Expected size(%d) lesser than real(%d) and less minimal mbuf size(%d)\n",
905 n, ret, slirp_size(pData)));
906 }
907 /* we're freeing buffer anyway */
908 RTMemFree(pchBuffer);
909 }
910 else
911 m->m_len = ret;
912
913 if (ret < 0)
914 {
915 u_char code = ICMP_UNREACH_PORT;
916
917 if (errno == EHOSTUNREACH)
918 code = ICMP_UNREACH_HOST;
919 else if (errno == ENETUNREACH)
920 code = ICMP_UNREACH_NET;
921
922 m_freem(pData, m);
923 if ( errno == EAGAIN
924 || errno == EWOULDBLOCK
925 || errno == EINPROGRESS
926 || errno == ENOTCONN)
927 {
928 return;
929 }
930
931 Log2((" rx error, tx icmp ICMP_UNREACH:%i\n", code));
932 icmp_error(pData, so->so_m, ICMP_UNREACH, code, 0, strerror(errno));
933 so->so_m = NULL;
934 }
935 else
936 {
937 Assert((m_length(m,NULL) == ret));
938 /*
939 * Hack: domain name lookup will be used the most for UDP,
940 * and since they'll only be used once there's no need
941 * for the 4 minute (or whatever) timeout... So we time them
942 * out much quicker (10 seconds for now...)
943 */
944 if (so->so_expire)
945 {
946 if (so->so_fport != RT_H2N_U16_C(53))
947 so->so_expire = curtime + SO_EXPIRE;
948 }
949 /*
950 * The last argument should be changed if slirp is to inject IP attributes.
951 * Note: here we can't check whether dnsproxy sent the initial request.
952 */
953 if ( pData->fUseDnsProxy
954 && so->so_fport == RT_H2N_U16_C(53))
955 dnsproxy_answer(pData, so, m);
956
957#if 0
958 if (m->m_len == len)
959 {
960 m_inc(m, MINCSIZE);
961 m->m_len = 0;
962 }
963#endif
964
965 /* the packet will definitely be fragmented and could confuse the receiving peer. */
966 if (m_length(m, NULL) > if_mtu)
967 m->m_flags |= M_SKIP_FIREWALL;
968 /*
969 * If this packet was destined for CTL_ADDR,
970 * make it look like that's where it came from, done by udp_output
971 */
972 udp_output(pData, so, m, &addr);
973 SOCKET_UNLOCK(so);
974 } /* rx error */
975 } /* if ping packet */
976}
977
978/*
979 * sendto() a socket
980 */
981int
982sosendto(PNATState pData, struct socket *so, struct mbuf *m)
983{
984 int ret;
985 struct sockaddr_in *paddr;
986 struct sockaddr addr;
987#if 0
988 struct sockaddr_in host_addr;
989#endif
990 caddr_t buf = 0;
991 int mlen;
992
993 LogFlowFunc(("sosendto: so = %R[natsock], m = %lx\n", so, (long)m));
994
995 memset(&addr, 0, sizeof(struct sockaddr));
996#ifdef RT_OS_DARWIN
997 addr.sa_len = sizeof(struct sockaddr_in);
998#endif
999 paddr = (struct sockaddr_in *)&addr;
1000 paddr->sin_family = AF_INET;
1001 if ((so->so_faddr.s_addr & RT_H2N_U32(pData->netmask)) == pData->special_addr.s_addr)
1002 {
1003 /* It's an alias */
1004 uint32_t last_byte = RT_N2H_U32(so->so_faddr.s_addr) & ~pData->netmask;
1005 switch(last_byte)
1006 {
1007#if 0
1008 /* handle this case at 'default:' */
1009 case CTL_BROADCAST:
1010 addr.sin_addr.s_addr = INADDR_BROADCAST;
1011 /* Send the packet to host to fully emulate broadcast */
1012 /** @todo r=klaus: on Linux host this causes the host to receive
1013 * the packet twice for some reason. And I cannot find any place
1014 * in the man pages which states that sending a broadcast does not
1015 * reach the host itself. */
1016 host_addr.sin_family = AF_INET;
1017 host_addr.sin_port = so->so_fport;
1018 host_addr.sin_addr = our_addr;
1019 sendto(so->s, m->m_data, m->m_len, 0,
1020 (struct sockaddr *)&host_addr, sizeof (struct sockaddr));
1021 break;
1022#endif
1023 case CTL_DNS:
1024 case CTL_ALIAS:
1025 default:
1026 if (last_byte == ~pData->netmask)
1027 paddr->sin_addr.s_addr = INADDR_BROADCAST;
1028 else
1029 paddr->sin_addr = loopback_addr;
1030 break;
1031 }
1032 }
1033 else
1034 paddr->sin_addr = so->so_faddr;
1035 paddr->sin_port = so->so_fport;
1036
1037 Log2((" sendto()ing, addr.sin_port=%d, addr.sin_addr.s_addr=%.16s\n",
1038 RT_N2H_U16(paddr->sin_port), inet_ntoa(paddr->sin_addr)));
1039
1040 /* Don't care what port we get */
1041 /*
1042 * > nmap -sV -T4 -O -A -v -PU3483 255.255.255.255
1043 * generates bodyless messages, annoying the memory management system.
1044 */
1045 mlen = m_length(m, NULL);
1046 if (mlen > 0)
1047 {
1048 buf = RTMemAlloc(mlen);
1049 if (buf == NULL)
1050 {
1051 return -1;
1052 }
1053 m_copydata(m, 0, mlen, buf);
1054 }
1055 ret = sendto(so->s, buf, mlen, 0,
1056 (struct sockaddr *)&addr, sizeof (struct sockaddr));
1057#ifdef VBOX_WITH_NAT_SEND2HOME
1058 if (slirpIsWideCasting(pData, so->so_faddr.s_addr))
1059 {
1060 slirpSend2Home(pData, so, buf, mlen, 0);
1061 }
1062#endif
1063 if (buf)
1064 RTMemFree(buf);
1065 if (ret < 0)
1066 {
1067 Log2(("UDP: sendto fails (%s)\n", strerror(errno)));
1068 return -1;
1069 }
1070
1071 /*
1072 * Kill the socket if there's no reply in 4 minutes,
1073 * but only if it's an expirable socket
1074 */
1075 if (so->so_expire)
1076 so->so_expire = curtime + SO_EXPIRE;
1077 so->so_state = SS_ISFCONNECTED; /* So that it gets select()ed */
1078 return 0;
1079}
1080
1081/*
1082 * XXX This should really be tcp_listen
1083 */
1084struct socket *
1085solisten(PNATState pData, u_int32_t bind_addr, u_int port, u_int32_t laddr, u_int lport, int flags)
1086{
1087 struct sockaddr_in addr;
1088 struct socket *so;
1089 socklen_t addrlen = sizeof(addr);
1090 int s, opt = 1;
1091 int status;
1092
1093 LogFlowFunc(("solisten: port = %d, laddr = %x, lport = %d, flags = %x\n", port, laddr, lport, flags));
1094
1095 if ((so = socreate()) == NULL)
1096 {
1097 /* RTMemFree(so); Not sofree() ??? free(NULL) == NOP */
1098 return NULL;
1099 }
1100
1101 /* Don't tcp_attach... we don't need so_snd nor so_rcv */
1102 if ((so->so_tcpcb = tcp_newtcpcb(pData, so)) == NULL)
1103 {
1104 RTMemFree(so);
1105 return NULL;
1106 }
1107
1108 SOCKET_LOCK_CREATE(so);
1109 SOCKET_LOCK(so);
1110 QSOCKET_LOCK(tcb);
1111 insque(pData, so,&tcb);
1112 NSOCK_INC();
1113 QSOCKET_UNLOCK(tcb);
1114
1115 /*
1116 * SS_FACCEPTONCE sockets must time out.
1117 */
1118 if (flags & SS_FACCEPTONCE)
1119 so->so_tcpcb->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT*2;
1120
1121 so->so_state = (SS_FACCEPTCONN|flags);
1122 so->so_lport = lport; /* Kept in network format */
1123 so->so_laddr.s_addr = laddr; /* Ditto */
1124
1125 memset(&addr, 0, sizeof(addr));
1126#ifdef RT_OS_DARWIN
1127 addr.sin_len = sizeof(addr);
1128#endif
1129 addr.sin_family = AF_INET;
1130 addr.sin_addr.s_addr = bind_addr;
1131 addr.sin_port = port;
1132
1133 /**
1134 * Changing listen(..., 1) to listen(..., SOMAXCONN) shouldn't be harmful for NAT's TCP/IP stack;
1135 * the kernel will choose the optimal length for the request queue.
1136 * @note MSDN recommends low (2-4) values for Bluetooth networking devices.
1137 */
1138 if ( ((s = socket(AF_INET, SOCK_STREAM, 0)) < 0)
1139 || (setsockopt(s, SOL_SOCKET, SO_REUSEADDR,(char *)&opt, sizeof(int)) < 0)
1140 || (bind(s,(struct sockaddr *)&addr, sizeof(addr)) < 0)
1141 || (listen(s, pData->soMaxConn) < 0))
1142 {
1143#ifdef RT_OS_WINDOWS
1144 int tmperrno = WSAGetLastError(); /* Don't clobber the real reason we failed */
1145 closesocket(s);
1146 QSOCKET_LOCK(tcb);
1147 sofree(pData, so);
1148 QSOCKET_UNLOCK(tcb);
1149 /* Restore the real errno */
1150 WSASetLastError(tmperrno);
1151#else
1152 int tmperrno = errno; /* Don't clobber the real reason we failed */
1153 close(s);
1154 QSOCKET_LOCK(tcb);
1155 sofree(pData, so);
1156 QSOCKET_UNLOCK(tcb);
1157 /* Restore the real errno */
1158 errno = tmperrno;
1159#endif
1160 return NULL;
1161 }
1162 fd_nonblock(s);
1163 setsockopt(s, SOL_SOCKET, SO_OOBINLINE,(char *)&opt, sizeof(int));
1164
1165 getsockname(s,(struct sockaddr *)&addr,&addrlen);
1166 so->so_fport = addr.sin_port;
1167 /* set socket buffers */
1168 opt = pData->socket_rcv;
1169 status = setsockopt(s, SOL_SOCKET, SO_RCVBUF, (char *)&opt, sizeof(int));
1170 if (status < 0)
1171 {
1172 LogRel(("NAT: Error(%d) while setting RCV capacity to (%d)\n", errno, opt));
1173 goto no_sockopt;
1174 }
1175 opt = pData->socket_snd;
1176 status = setsockopt(s, SOL_SOCKET, SO_SNDBUF, (char *)&opt, sizeof(int));
1177 if (status < 0)
1178 {
1179 LogRel(("NAT: Error(%d) while setting SND capacity to (%d)\n", errno, opt));
1180 goto no_sockopt;
1181 }
1182no_sockopt:
1183 if (addr.sin_addr.s_addr == 0 || addr.sin_addr.s_addr == loopback_addr.s_addr)
1184 so->so_faddr = alias_addr;
1185 else
1186 so->so_faddr = addr.sin_addr;
1187
1188 so->s = s;
1189 SOCKET_UNLOCK(so);
1190 return so;
1191}
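#if 0
/*
 * Illustrative sketch only (not compiled): roughly how a host-to-guest port
 * forwarding rule maps onto solisten().  The concrete addresses, ports and the
 * caller shown here are made-up examples, not the actual redirect setup code.
 */
struct socket *pSoListen;
pSoListen = solisten(pData,
                     RT_H2N_U32(INADDR_ANY),  /* host address to bind */
                     RT_H2N_U16_C(2222),      /* host port to listen on (network order) */
                     RT_H2N_U32(0x0a00020f),  /* guest address 10.0.2.15 (network order) */
                     RT_H2N_U16_C(22),        /* guest port (network order) */
                     0 /* flags */);
if (pSoListen == NULL)
    LogRel(("NAT: failed to set up port forwarding\n"));
#endif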
1192
1193/*
1194 * Data is available in so_rcv
1195 * Just write() the data to the socket
1196 * XXX not yet...
1197 * @todo do we really need this function? What is it intended to do?
1198 */
1199void
1200sorwakeup(struct socket *so)
1201{
1202 NOREF(so);
1203#if 0
1204 sowrite(so);
1205 FD_CLR(so->s,&writefds);
1206#endif
1207}
1208
1209/*
1210 * Data has been freed in so_snd
1211 * We have room for a read() if we want to
1212 * For now, don't read, it'll be done in the main loop
1213 */
1214void
1215sowwakeup(struct socket *so)
1216{
1217 NOREF(so);
1218}
1219
1220/*
1221 * Various session state calls
1222 * XXX Should be #define's
1223 * The socket state stuff needs work; these often get called 2 or 3
1224 * times each when only 1 call was needed
1225 */
1226void
1227soisfconnecting(struct socket *so)
1228{
1229 so->so_state &= ~(SS_NOFDREF|SS_ISFCONNECTED|SS_FCANTRCVMORE|
1230 SS_FCANTSENDMORE|SS_FWDRAIN);
1231 so->so_state |= SS_ISFCONNECTING; /* Clobber other states */
1232}
1233
1234void
1235soisfconnected(struct socket *so)
1236{
1237 LogFlowFunc(("ENTER: so:%R[natsock]\n", so));
1238 so->so_state &= ~(SS_ISFCONNECTING|SS_FWDRAIN|SS_NOFDREF);
1239 so->so_state |= SS_ISFCONNECTED; /* Clobber other states */
1240 LogFlowFunc(("LEAVE: so:%R[natsock]\n", so));
1241}
1242
1243void
1244sofcantrcvmore(struct socket *so)
1245{
1246 LogFlowFunc(("ENTER: so:%R[natsock]\n", so));
1247 if ((so->so_state & SS_NOFDREF) == 0)
1248 {
1249 shutdown(so->s, 0);
1250 }
1251 so->so_state &= ~(SS_ISFCONNECTING);
1252 if (so->so_state & SS_FCANTSENDMORE)
1253 so->so_state = SS_NOFDREF; /* Don't select it */
1254 /* XXX close() here as well? */
1255 else
1256 so->so_state |= SS_FCANTRCVMORE;
1257 LogFlowFuncLeave();
1258}
1259
1260void
1261sofcantsendmore(struct socket *so)
1262{
1263 LogFlowFunc(("ENTER: so:%R[natsock]\n", so));
1264 if ((so->so_state & SS_NOFDREF) == 0)
1265 shutdown(so->s, 1); /* send FIN to fhost */
1266
1267 so->so_state &= ~(SS_ISFCONNECTING);
1268 if (so->so_state & SS_FCANTRCVMORE)
1269 so->so_state = SS_NOFDREF; /* as above */
1270 else
1271 so->so_state |= SS_FCANTSENDMORE;
1272 LogFlowFuncLeave();
1273}
1274
1275void
1276soisfdisconnected(struct socket *so)
1277{
1278 NOREF(so);
1279#if 0
1280 so->so_state &= ~(SS_ISFCONNECTING|SS_ISFCONNECTED);
1281 close(so->s);
1282 so->so_state = SS_ISFDISCONNECTED;
1283 /*
1284 * XXX Do nothing ... ?
1285 */
1286#endif
1287}
1288
1289/*
1290 * Set write drain mode
1291 * Set CANTSENDMORE once all data has been write()n
1292 */
1293void
1294sofwdrain(struct socket *so)
1295{
1296 if (SBUF_LEN(&so->so_rcv))
1297 so->so_state |= SS_FWDRAIN;
1298 else
1299 sofcantsendmore(so);
1300}
1301
1302static void
1303send_icmp_to_guest(PNATState pData, char *buff, size_t len, const struct sockaddr_in *addr)
1304{
1305 struct ip *ip;
1306 uint32_t dst, src;
1307 char ip_copy[256];
1308 struct icmp *icp;
1309 int old_ip_len = 0;
1310 int hlen, original_hlen = 0;
1311 struct mbuf *m;
1312 struct icmp_msg *icm;
1313 uint8_t proto;
1314 int type = 0;
1315
1316 ip = (struct ip *)buff;
1317 /* Fix ip->ip_len to contain the total packet length including the header
1318 * in _host_ byte order for all OSes. On Darwin, that value already is in
1319 * host byte order. Solaris and Darwin report only the payload. */
1320#ifndef RT_OS_DARWIN
1321 ip->ip_len = RT_N2H_U16(ip->ip_len);
1322#endif
1323 hlen = (ip->ip_hl << 2);
1324#if defined(RT_OS_SOLARIS) || defined(RT_OS_DARWIN)
1325 ip->ip_len += hlen;
1326#endif
1327 if (ip->ip_len < hlen + ICMP_MINLEN)
1328 {
1329 Log(("send_icmp_to_guest: ICMP header is too small to understand which type/subtype of the datagram\n"));
1330 return;
1331 }
1332 icp = (struct icmp *)((char *)ip + hlen);
1333
1334 Log(("ICMP:received msg(t:%d, c:%d)\n", icp->icmp_type, icp->icmp_code));
1335 if ( icp->icmp_type != ICMP_ECHOREPLY
1336 && icp->icmp_type != ICMP_TIMXCEED
1337 && icp->icmp_type != ICMP_UNREACH)
1338 {
1339 return;
1340 }
1341
1342 /*
1343 * The minimal header size for ICMP_ECHOREPLY, ICMP_TIMXCEED and ICMP_UNREACH
1344 * is that of an ICMP_ECHOREPLY with no data:
1345 * icmp_{type(8), code(8), cksum(16), identifier(16), seqnum(16)}
1346 */
1347 if (ip->ip_len < hlen + 8)
1348 {
1349 Log(("send_icmp_to_guest: NAT accept ICMP_{ECHOREPLY, TIMXCEED, UNREACH} the minimum size is 64 (see rfc792)\n"));
1350 return;
1351 }
1352
1353 type = icp->icmp_type;
1354 if ( type == ICMP_TIMXCEED
1355 || type == ICMP_UNREACH)
1356 {
1357 /*
1358 * ICMP_TIMXCEED, ICMP_UNREACH minimal header size is
1359 * icmp_{type(8), code(8), cksum(16),unused(32)} + IP header + 64 bit of original datagram
1360 */
1361 if (ip->ip_len < hlen + 2*8 + sizeof(struct ip))
1362 {
1363 Log(("send_icmp_to_guest: NAT accept ICMP_{TIMXCEED, UNREACH} the minimum size of ipheader + 64 bit of data (see rfc792)\n"));
1364 return;
1365 }
1366 ip = &icp->icmp_ip;
1367 }
1368
1369 icm = icmp_find_original_mbuf(pData, ip);
1370 if (icm == NULL)
1371 {
1372 Log(("NAT: Can't find the corresponding packet for the received ICMP\n"));
1373 return;
1374 }
1375
1376 m = icm->im_m;
1377 if (!m)
1378 {
1379 LogFunc(("%R[natsock] hasn't stored it's mbuf on sent\n", icm->im_so));
1380 LIST_REMOVE(icm, im_list);
1381 RTMemFree(icm);
1382 return;
1383 }
1384
1385 src = addr->sin_addr.s_addr;
1386 if (type == ICMP_ECHOREPLY)
1387 {
1388 struct ip *ip0 = mtod(m, struct ip *);
1389 struct icmp *icp0 = (struct icmp *)((char *)ip0 + (ip0->ip_hl << 2));
1390 if (icp0->icmp_type != ICMP_ECHO)
1391 {
1392 Log(("NAT: we haven't found echo for this reply\n"));
1393 return;
1394 }
1395 /*
1396 * While combining the buffer to send (see ip_icmp.c) we control the ICMP header only;
1397 * the IP header is assembled by the host OS network stack.  Our local copy of the IP header contains values
1398 * in host byte order, so no byte order conversion is required.  IP header fields are converted
1399 * in the ip_output0 routine only.
1400 */
1401 if ( (ip->ip_len - hlen)
1402 != (ip0->ip_len - (ip0->ip_hl << 2)))
1403 {
1404 Log(("NAT: ECHO(%d) length doesn't match ECHOREPLY(%d)\n",
1405 (ip->ip_len - hlen), (ip0->ip_len - (ip0->ip_hl << 2))));
1406 return;
1407 }
1408 }
1409
1410 /* ip points to the original IP header */
1411 ip = mtod(m, struct ip *);
1412 proto = ip->ip_p;
1413 /* Now ip points to the header we sent from the guest */
1414 if ( icp->icmp_type == ICMP_TIMXCEED
1415 || icp->icmp_type == ICMP_UNREACH)
1416 {
1417 old_ip_len = (ip->ip_hl << 2) + 64;
1418 if (old_ip_len > sizeof(ip_copy))
1419 old_ip_len = sizeof(ip_copy);
1420 memcpy(ip_copy, ip, old_ip_len);
1421 }
1422
1423 /* source address from original IP packet*/
1424 dst = ip->ip_src.s_addr;
1425
1426 /* override the tail of the old packet */
1427 ip = mtod(m, struct ip *); /* ip is from the mbuf we've overridden */
1428 original_hlen = ip->ip_hl << 2;
1429 /* saves original ip header and options */
1430 m_copyback(pData, m, original_hlen, len - hlen, buff + hlen);
1431 ip->ip_len = m_length(m, NULL);
1432 ip->ip_p = IPPROTO_ICMP; /* the original packet could be anything, but we respond via ICMP */
1433
1434 icp = (struct icmp *)((char *)ip + (ip->ip_hl << 2));
1435 type = icp->icmp_type;
1436 if ( type == ICMP_TIMXCEED
1437 || type == ICMP_UNREACH)
1438 {
1439 /* according to RFC 792, error messages require a copy of the initial IP header + 64 bits */
1440 memcpy(&icp->icmp_ip, ip_copy, old_ip_len);
1441 ip->ip_tos = ((ip->ip_tos & 0x1E) | 0xC0); /* high priority for errors */
1442 }
1443
1444 ip->ip_src.s_addr = src;
1445 ip->ip_dst.s_addr = dst;
1446 icmp_reflect(pData, m);
1447 LIST_REMOVE(icm, im_list);
1448 pData->cIcmpCacheSize--;
1449 /* Don't call m_free here*/
1450
1451 if ( type == ICMP_TIMXCEED
1452 || type == ICMP_UNREACH)
1453 {
1454 icm->im_so->so_m = NULL;
1455 switch (proto)
1456 {
1457 case IPPROTO_UDP:
1458 /*XXX: so->so_m already freed so we shouldn't call sofree */
1459 udp_detach(pData, icm->im_so);
1460 break;
1461 case IPPROTO_TCP:
1462 /* closing the TCP connection should happen here */
1463 break;
1464 default:
1465 /* do nothing */
1466 break;
1467 }
1468 }
1469 RTMemFree(icm);
1470}
1471
1472#ifdef RT_OS_WINDOWS
1473static void
1474sorecvfrom_icmp_win(PNATState pData, struct socket *so)
1475{
1476 int len;
1477 int i;
1478 struct ip *ip;
1479 struct mbuf *m;
1480 struct icmp *icp;
1481 struct icmp_msg *icm;
1482 struct ip *ip_broken; /* ICMP returns header + 64 bit of packet */
1483 uint32_t src;
1484 ICMP_ECHO_REPLY *icr;
1485 int hlen = 0;
1486 int nbytes = 0;
1487 u_char code = ~0;
1488 int out_len;
1489 int size;
1490
1491 len = pData->pfIcmpParseReplies(pData->pvIcmpBuffer, pData->szIcmpBuffer);
1492 if (len < 0)
1493 {
1494 LogRel(("NAT: Error (%d) occurred on ICMP receiving\n", GetLastError()));
1495 return;
1496 }
1497 if (len == 0)
1498 return; /* no error */
1499
1500 icr = (ICMP_ECHO_REPLY *)pData->pvIcmpBuffer;
1501 for (i = 0; i < len; ++i)
1502 {
1503 LogFunc(("icr[%d] Data:%p, DataSize:%d\n",
1504 i, icr[i].Data, icr[i].DataSize));
1505 switch(icr[i].Status)
1506 {
1507 case IP_DEST_HOST_UNREACHABLE:
1508 code = (code != ~0 ? code : ICMP_UNREACH_HOST);
1509 case IP_DEST_NET_UNREACHABLE:
1510 code = (code != ~0 ? code : ICMP_UNREACH_NET);
1511 case IP_DEST_PROT_UNREACHABLE:
1512 code = (code != ~0 ? code : ICMP_UNREACH_PROTOCOL);
1513 /* UNREACH error inject here */
1514 case IP_DEST_PORT_UNREACHABLE:
1515 code = (code != ~0 ? code : ICMP_UNREACH_PORT);
1516 icmp_error(pData, so->so_m, ICMP_UNREACH, code, 0, "Error occurred!!!");
1517 so->so_m = NULL;
1518 break;
1519 case IP_SUCCESS: /* echo replied */
1520 out_len = ETH_HLEN + sizeof(struct ip) + 8;
1521 size;
1522 size = MCLBYTES;
1523 if (out_len < MSIZE)
1524 size = MCLBYTES;
1525 else if (out_len < MCLBYTES)
1526 size = MCLBYTES;
1527 else if (out_len < MJUM9BYTES)
1528 size = MJUM9BYTES;
1529 else if (out_len < MJUM16BYTES)
1530 size = MJUM16BYTES;
1531 else
1532 AssertMsgFailed(("Unsupported size"));
1533
1534 m = m_getjcl(pData, M_NOWAIT, MT_HEADER, M_PKTHDR, size);
1535 LogFunc(("m_getjcl returns m: %p\n", m));
1536 if (m == NULL)
1537 return;
1538 m->m_len = 0;
1539 m->m_data += if_maxlinkhdr;
1540 m->m_pkthdr.header = mtod(m, void *);
1541
1542 ip = mtod(m, struct ip *);
1543 ip->ip_src.s_addr = icr[i].Address;
1544 ip->ip_p = IPPROTO_ICMP;
1545 ip->ip_dst.s_addr = so->so_laddr.s_addr; /*XXX: still the hack*/
1546 ip->ip_hl = sizeof(struct ip) >> 2; /* required for icmp_reflect, no IP options */
1547 ip->ip_ttl = icr[i].Options.Ttl;
1548
1549 icp = (struct icmp *)&ip[1]; /* no options */
1550 icp->icmp_type = ICMP_ECHOREPLY;
1551 icp->icmp_code = 0;
1552 icp->icmp_id = so->so_icmp_id;
1553 icp->icmp_seq = so->so_icmp_seq;
1554
1555 icm = icmp_find_original_mbuf(pData, ip);
1556 if (icm)
1557 {
1558 /* on this branch we don't need stored variant */
1559 m_freem(pData, icm->im_m);
1560 LIST_REMOVE(icm, im_list);
1561 pData->cIcmpCacheSize--;
1562 RTMemFree(icm);
1563 }
1564
1565
1566 hlen = (ip->ip_hl << 2);
1567 Assert((hlen >= sizeof(struct ip)));
1568
1569 m->m_data += hlen + ICMP_MINLEN;
1570 if (!RT_VALID_PTR(icr[i].Data))
1571 {
1572 m_freem(pData, m);
1573 break;
1574 }
1575 m_copyback(pData, m, 0, icr[i].DataSize, icr[i].Data);
1576 m->m_data -= hlen + ICMP_MINLEN;
1577 m->m_len += hlen + ICMP_MINLEN;
1578
1579
1580 ip->ip_len = m_length(m, NULL);
1581 Assert((ip->ip_len == hlen + ICMP_MINLEN + icr[i].DataSize));
1582
1583 icmp_reflect(pData, m);
1584 break;
1585 case IP_TTL_EXPIRED_TRANSIT: /* TTL expired */
1586
1587 ip_broken = icr[i].Data;
1588 icm = icmp_find_original_mbuf(pData, ip_broken);
1589 if (icm == NULL) {
1590 Log(("ICMP: can't find original package (first double word %x)\n", *(uint32_t *)ip_broken));
1591 return;
1592 }
1593 m = icm->im_m;
1594 ip = mtod(m, struct ip *);
1595 Assert(((ip_broken->ip_hl >> 2) >= sizeof(struct ip)));
1596 ip->ip_ttl = icr[i].Options.Ttl;
1597 src = ip->ip_src.s_addr;
1598 ip->ip_dst.s_addr = src;
1599 ip->ip_dst.s_addr = icr[i].Address;
1600
1601 hlen = (ip->ip_hl << 2);
1602 icp = (struct icmp *)((char *)ip + hlen);
1603 ip_broken->ip_src.s_addr = src; /* the packet was sent from the host, not from the guest */
1604
1605 m->m_len = (ip_broken->ip_hl << 2) + 64;
1606 m->m_pkthdr.header = mtod(m, void *);
1607 m_copyback(pData, m, ip->ip_hl >> 2, icr[i].DataSize, icr[i].Data);
1608 icmp_reflect(pData, m);
1609 /* This differs from the Unix world, where we can receive ICMP in response to TCP/UDP */
1610 LIST_REMOVE(icm, im_list);
1611 pData->cIcmpCacheSize--;
1612 RTMemFree(icm);
1613 break;
1614 default:
1615 Log(("ICMP(default): message with Status: %x was received from %x\n", icr[i].Status, icr[i].Address));
1616 break;
1617 }
1618 }
1619}
1620#else /* !RT_OS_WINDOWS */
1621static void sorecvfrom_icmp_unix(PNATState pData, struct socket *so)
1622{
1623 struct sockaddr_in addr;
1624 socklen_t addrlen = sizeof(struct sockaddr_in);
1625 struct ip ip;
1626 char *buff;
1627 int len = 0;
1628
1629 /* step 1: read the IP header */
1630 len = recvfrom(so->s, &ip, sizeof(struct ip), MSG_PEEK,
1631 (struct sockaddr *)&addr, &addrlen);
1632 if ( len < 0
1633 && ( errno == EAGAIN
1634 || errno == EWOULDBLOCK
1635 || errno == EINPROGRESS
1636 || errno == ENOTCONN))
1637 {
1638 Log(("sorecvfrom_icmp_unix: 1 - step can't read IP datagramm (would block)\n"));
1639 return;
1640 }
1641
1642 if ( len < sizeof(struct ip)
1643 || len < 0
1644 || len == 0)
1645 {
1646 u_char code;
1647 code = ICMP_UNREACH_PORT;
1648
1649 if (errno == EHOSTUNREACH)
1650 code = ICMP_UNREACH_HOST;
1651 else if (errno == ENETUNREACH)
1652 code = ICMP_UNREACH_NET;
1653
1654 LogRel((" udp icmp rx errno = %d (%s)\n", errno, strerror(errno)));
1655 icmp_error(pData, so->so_m, ICMP_UNREACH, code, 0, strerror(errno));
1656 so->so_m = NULL;
1657 Log(("sorecvfrom_icmp_unix: 1 - step can't read IP datagramm\n"));
1658 return;
1659 }
1660 /* basic check of IP header */
1661 if ( ip.ip_v != IPVERSION
1662# ifndef RT_OS_DARWIN
1663 || ip.ip_p != IPPROTO_ICMP
1664# endif
1665 )
1666 {
1667 Log(("sorecvfrom_icmp_unix: 1 - step IP isn't IPv4\n"));
1668 return;
1669 }
1670# ifndef RT_OS_DARWIN
1671 /* Darwin reports the IP length already in host byte order. */
1672 ip.ip_len = RT_N2H_U16(ip.ip_len);
1673# endif
1674# if defined(RT_OS_SOLARIS) || defined(RT_OS_DARWIN)
1675 /* Solaris and Darwin report the payload only */
1676 ip.ip_len += (ip.ip_hl << 2);
1677# endif
1678 /* Note: ip->ip_len in host byte order (all OS) */
1679 len = ip.ip_len;
1680 buff = RTMemAlloc(len);
1681 if (buff == NULL)
1682 {
1683 Log(("sorecvfrom_icmp_unix: 1 - step can't allocate enought room for datagram\n"));
1684 return;
1685 }
1686 /* step 2: read the rest of the datagram into the buffer */
1687 addrlen = sizeof(struct sockaddr_in);
1688 memset(&addr, 0, addrlen);
1689 len = recvfrom(so->s, buff, len, 0,
1690 (struct sockaddr *)&addr, &addrlen);
1691 if ( len < 0
1692 && ( errno == EAGAIN
1693 || errno == EWOULDBLOCK
1694 || errno == EINPROGRESS
1695 || errno == ENOTCONN))
1696 {
1697 Log(("sorecvfrom_icmp_unix: 2 - step can't read IP body (would block expected:%d)\n",
1698 ip.ip_len));
1699 RTMemFree(buff);
1700 return;
1701 }
1702 if ( len < 0
1703 || len == 0)
1704 {
1705 Log(("sorecvfrom_icmp_unix: 2 - step read of the rest of datagramm is fallen (errno:%d, len:%d expected: %d)\n",
1706 errno, len, (ip.ip_len - sizeof(struct ip))));
1707 RTMemFree(buff);
1708 return;
1709 }
1710 /* len is modified by the 2nd read, when the rest of the datagram was read */
1711 send_icmp_to_guest(pData, buff, len, &addr);
1712 RTMemFree(buff);
1713}
1714#endif /* !RT_OS_WINDOWS */