VirtualBox

source: vbox/trunk/src/VBox/Devices/Network/slirp/socket.c@ 91069

Last change on this file since 91069 was 82968, checked in by vboxsync, 5 years ago

Copyright year updates by scm.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Author Date Id Revision
File size: 43.3 KB
Line 
1/* $Id: socket.c 82968 2020-02-04 10:35:17Z vboxsync $ */
2/** @file
3 * NAT - socket handling.
4 */
5
6/*
7 * Copyright (C) 2006-2020 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 */
17
18/*
19 * This code is based on:
20 *
21 * Copyright (c) 1995 Danny Gasparovski.
22 *
23 * Please read the file COPYRIGHT for the
24 * terms and conditions of the copyright.
25 */
26
27#include <slirp.h>
28#include "ip_icmp.h"
29#include "main.h"
30#ifdef __sun__
31#include <sys/filio.h>
32#endif
33#include <VBox/vmm/pdmdrv.h>
34#if defined (RT_OS_WINDOWS)
35#include <iprt/win/iphlpapi.h>
36#include <icmpapi.h>
37#endif
38#include <alias.h>
39
40#if defined(DECLARE_IOVEC) && defined(RT_OS_WINDOWS)
41AssertCompileMembersSameSizeAndOffset(struct iovec, iov_base, WSABUF, buf);
42AssertCompileMembersSameSizeAndOffset(struct iovec, iov_len, WSABUF, len);
43#endif
44
#ifdef VBOX_WITH_NAT_SEND2HOME
/**
 * Sends one datagram to every configured "home" address.
 *
 * For each entry of pData->pInSockAddrHomeAddress a socket is obtained from
 * soCloneUDPSocketWithForegnAddr() and the buffer is sent through it to that
 * entry, using the foreign port of @a pSo as destination port.  Individual
 * send failures are only logged (the data is being broadcast, so a single
 * failure is not treated as fatal).
 *
 * @returns true if at least one sendto() call reported more than zero bytes
 *          sent; false otherwise, including when a socket could not be
 *          cloned (the loop is abandoned at that point).
 * @param   pData   NAT state; supplies the home address table and its size.
 * @param   pSo     Socket the datagram originates from.
 * @param   pvBuf   Payload to send.
 * @param   cbBuf   Payload size in bytes.
 * @param   iFlags  Flags handed to sendto() unchanged.
 */
DECLINLINE(bool) slirpSend2Home(PNATState pData, struct socket *pSo, const void *pvBuf, uint32_t cbBuf, int iFlags)
{
    int idxAddr;
    int ret = 0;
    bool fSendDone = false;

    LogFlowFunc(("Enter pSo:%R[natsock] pvBuf: %p, cbBuf: %d, iFlags: %d\n", pSo, pvBuf, cbBuf, iFlags));
    for (idxAddr = 0; idxAddr < pData->cInHomeAddressSize; ++idxAddr)
    {
        struct socket *pNewSocket = soCloneUDPSocketWithForegnAddr(pData, pSo, pData->pInSockAddrHomeAddress[idxAddr].sin_addr);

        /* AssertReturn takes the condition and the return value as two
           separate macro arguments. */
        AssertReturn(pNewSocket, false);

        /* The table entry doubles as the sendto() destination, so stamp the
           destination port into it. */
        pData->pInSockAddrHomeAddress[idxAddr].sin_port = pSo->so_fport;

        /** @todo more verbose on errors,
         * @note: we shouldn't care if this send fail or not (we're in broadcast).
         */
        LogFunc(("send %d bytes to %RTnaipv4 from %R[natsock]\n", cbBuf, pData->pInSockAddrHomeAddress[idxAddr].sin_addr.s_addr, pNewSocket));
        ret = sendto(pNewSocket->s, pvBuf, cbBuf, iFlags, (struct sockaddr *)&pData->pInSockAddrHomeAddress[idxAddr], sizeof(struct sockaddr_in));
        if (ret < 0)
            LogFunc(("Failed to send %d bytes to %RTnaipv4\n", cbBuf, pData->pInSockAddrHomeAddress[idxAddr].sin_addr.s_addr));
        fSendDone |= ret > 0;
    }
    LogFlowFunc(("Leave %RTbool\n", fSendDone));
    return fSendDone;
}
#endif /* VBOX_WITH_NAT_SEND2HOME */
71
72#if !defined(RT_OS_WINDOWS)
73static void send_icmp_to_guest(PNATState, char *, size_t, const struct sockaddr_in *);
74static void sorecvfrom_icmp_unix(PNATState, struct socket *);
75#endif /* !RT_OS_WINDOWS */
76
/**
 * Socket module initialisation hook.
 *
 * Intentionally empty: nothing in this file needs set-up work.
 */
void so_init(void)
{
    /* nothing to do */
}
81
82struct socket *
83solookup(struct socket *head, struct in_addr laddr,
84 u_int lport, struct in_addr faddr, u_int fport)
85{
86 struct socket *so;
87
88 for (so = head->so_next; so != head; so = so->so_next)
89 {
90 if ( so->so_lport == lport
91 && so->so_laddr.s_addr == laddr.s_addr
92 && so->so_faddr.s_addr == faddr.s_addr
93 && so->so_fport == fport)
94 return so;
95 }
96
97 return (struct socket *)NULL;
98}
99
100/*
101 * Create a new socket, initialise the fields
102 * It is the responsibility of the caller to
103 * insque() it into the correct linked-list
104 */
105struct socket *
106socreate(void)
107{
108 struct socket *so;
109
110 so = (struct socket *)RTMemAllocZ(sizeof(struct socket));
111 if (so)
112 {
113 so->so_state = SS_NOFDREF;
114 so->s = -1;
115#if !defined(RT_OS_WINDOWS)
116 so->so_poll_index = -1;
117#endif
118 }
119 return so;
120}
121
122/*
123 * remque and free a socket, clobber cache
124 */
125void
126sofree(PNATState pData, struct socket *so)
127{
128 LogFlowFunc(("ENTER:%R[natsock]\n", so));
129 /*
130 * We should not remove socket when polling routine do the polling
131 * instead we mark it for deletion.
132 */
133 if (so->fUnderPolling)
134 {
135 so->fShouldBeRemoved = 1;
136 LogFlowFunc(("LEAVE:%R[natsock] postponed deletion\n", so));
137 return;
138 }
139 /**
140 * Check that we don't freeng socket with tcbcb
141 */
142 Assert(!sototcpcb(so));
143 /* udp checks */
144 Assert(!so->so_timeout);
145 Assert(!so->so_timeout_arg);
146 if (so == tcp_last_so)
147 tcp_last_so = &tcb;
148 else if (so == udp_last_so)
149 udp_last_so = &udb;
150
151 /* check if mbuf haven't been already freed */
152 if (so->so_m != NULL)
153 {
154 m_freem(pData, so->so_m);
155 so->so_m = NULL;
156 }
157
158 if (so->so_ohdr != NULL)
159 {
160 RTMemFree(so->so_ohdr);
161 so->so_ohdr = NULL;
162 }
163
164 if (so->so_next && so->so_prev)
165 {
166 remque(pData, so); /* crashes if so is not in a queue */
167 NSOCK_DEC();
168 }
169
170 RTMemFree(so);
171 LogFlowFuncLeave();
172}
173
174
175/*
176 * Worker for sobind() below.
177 */
178static int
179sobindto(struct socket *so, uint32_t addr, uint16_t port)
180{
181 struct sockaddr_in self;
182 int status;
183
184 if (addr == INADDR_ANY && port == 0 && so->so_type != IPPROTO_UDP)
185 {
186 /* TCP sockets without constraints don't need to be bound */
187 Log2(("NAT: sobind: %s guest %RTnaipv4:%d - nothing to do\n",
188 so->so_type == IPPROTO_UDP ? "udp" : "tcp",
189 so->so_laddr.s_addr, ntohs(so->so_lport)));
190 return 0;
191 }
192
193 RT_ZERO(self);
194#ifdef RT_OS_DARWIN
195 self.sin_len = sizeof(self);
196#endif
197 self.sin_family = AF_INET;
198 self.sin_addr.s_addr = addr;
199 self.sin_port = port;
200
201 status = bind(so->s, (struct sockaddr *)&self, sizeof(self));
202 if (status == 0)
203 {
204 Log2(("NAT: sobind: %s guest %RTnaipv4:%d to host %RTnaipv4:%d\n",
205 so->so_type == IPPROTO_UDP ? "udp" : "tcp",
206 so->so_laddr.s_addr, ntohs(so->so_lport), addr, ntohs(port)));
207 return 0;
208 }
209
210 Log2(("NAT: sobind: %s guest %RTnaipv4:%d to host %RTnaipv4:%d error %d%s\n",
211 so->so_type == IPPROTO_UDP ? "udp" : "tcp",
212 so->so_laddr.s_addr, ntohs(so->so_lport),
213 addr, ntohs(port),
214 errno, port ? " (will retry with random port)" : ""));
215
216 if (port) /* retry without */
217 status = sobindto(so, addr, 0);
218
219 if (addr)
220 return status;
221 else
222 return 0;
223}
224
225
226/*
227 * Bind the socket to specific host address and/or port if necessary.
228 * We also always bind udp sockets to force the local port to be
229 * allocated and known in advance.
230 */
231int
232sobind(PNATState pData, struct socket *so)
233{
234 uint32_t addr = pData->bindIP.s_addr; /* may be INADDR_ANY */
235 bool fSamePorts = !!(pData->i32AliasMode & PKT_ALIAS_SAME_PORTS);
236 uint16_t port;
237 int status;
238
239 if (fSamePorts)
240 {
241 int opt = 1;
242 setsockopt(so->s, SOL_SOCKET, SO_REUSEADDR, (char *)&opt, sizeof(opt));
243 port = so->so_lport;
244 }
245 else
246 {
247 port = 0;
248 }
249
250 status = sobindto(so, addr, port);
251 return status;
252}
253
254
255/*
256 * Read from so's socket into sb_snd, updating all relevant sbuf fields
257 * NOTE: This will only be called if it is select()ed for reading, so
258 * a read() of 0 (or less) means it's disconnected
259 */
260int
261soread(PNATState pData, struct socket *so)
262{
263 int n, nn, lss, total;
264 struct sbuf *sb = &so->so_snd;
265 u_int len = sb->sb_datalen - sb->sb_cc;
266 struct iovec iov[2];
267 int mss = so->so_tcpcb->t_maxseg;
268 int sockerr;
269
270 STAM_PROFILE_START(&pData->StatIOread, a);
271 STAM_COUNTER_RESET(&pData->StatIORead_in_1);
272 STAM_COUNTER_RESET(&pData->StatIORead_in_2);
273
274 QSOCKET_LOCK(tcb);
275 SOCKET_LOCK(so);
276 QSOCKET_UNLOCK(tcb);
277
278 LogFlow(("soread: so = %R[natsock]\n", so));
279 Log2(("%s: so = %R[natsock] so->so_snd = %R[sbuf]\n", RT_GCC_EXTENSION __PRETTY_FUNCTION__, so, sb));
280
281 /*
282 * No need to check if there's enough room to read.
283 * soread wouldn't have been called if there weren't
284 */
285
286 len = sb->sb_datalen - sb->sb_cc;
287
288 iov[0].iov_base = sb->sb_wptr;
289 iov[1].iov_base = 0;
290 iov[1].iov_len = 0;
291 if (sb->sb_wptr < sb->sb_rptr)
292 {
293 iov[0].iov_len = sb->sb_rptr - sb->sb_wptr;
294 /* Should never succeed, but... */
295 if (iov[0].iov_len > len)
296 iov[0].iov_len = len;
297 if (iov[0].iov_len > mss)
298 iov[0].iov_len -= iov[0].iov_len%mss;
299 n = 1;
300 }
301 else
302 {
303 iov[0].iov_len = (sb->sb_data + sb->sb_datalen) - sb->sb_wptr;
304 /* Should never succeed, but... */
305 if (iov[0].iov_len > len)
306 iov[0].iov_len = len;
307 len -= iov[0].iov_len;
308 if (len)
309 {
310 iov[1].iov_base = sb->sb_data;
311 iov[1].iov_len = sb->sb_rptr - sb->sb_data;
312 if (iov[1].iov_len > len)
313 iov[1].iov_len = len;
314 total = iov[0].iov_len + iov[1].iov_len;
315 if (total > mss)
316 {
317 lss = total % mss;
318 if (iov[1].iov_len > lss)
319 {
320 iov[1].iov_len -= lss;
321 n = 2;
322 }
323 else
324 {
325 lss -= iov[1].iov_len;
326 iov[0].iov_len -= lss;
327 n = 1;
328 }
329 }
330 else
331 n = 2;
332 }
333 else
334 {
335 if (iov[0].iov_len > mss)
336 iov[0].iov_len -= iov[0].iov_len%mss;
337 n = 1;
338 }
339 }
340
341#ifdef HAVE_READV
342 nn = readv(so->s, (struct iovec *)iov, n);
343#else
344 nn = recv(so->s, iov[0].iov_base, iov[0].iov_len, (so->so_tcpcb->t_force? MSG_OOB:0));
345#endif
346 if (nn < 0)
347 sockerr = errno; /* save it, as it may be clobbered by logging */
348 else
349 sockerr = 0;
350
351 Log2(("%s: read(1) nn = %d bytes\n", RT_GCC_EXTENSION __PRETTY_FUNCTION__, nn));
352 Log2(("%s: so = %R[natsock] so->so_snd = %R[sbuf]\n", RT_GCC_EXTENSION __PRETTY_FUNCTION__, so, sb));
353 if (nn <= 0)
354 {
355#ifdef RT_OS_WINDOWS
356 /*
357 * Windows reports ESHUTDOWN after SHUT_RD (SD_RECEIVE)
358 * instead of just returning EOF indication.
359 */
360 if (nn < 0 && sockerr == ESHUTDOWN)
361 {
362 nn = 0;
363 sockerr = 0;
364 }
365#endif
366
367 if (nn == 0) /* XXX: should this be inside #if defined(RT_OS_WINDOWS)? */
368 {
369 /*
370 * Special case for WSAEnumNetworkEvents: If we receive 0 bytes that
371 * _could_ mean that the connection is closed. But we will receive an
372 * FD_CLOSE event later if the connection was _really_ closed. With
373 * www.youtube.com I see this very often. Closing the socket too early
374 * would be dangerous.
375 */
376 int status;
377 unsigned long pending = 0;
378 status = ioctlsocket(so->s, FIONREAD, &pending);
379 if (status < 0)
380 Log(("NAT:%s: error in WSAIoctl: %d\n", RT_GCC_EXTENSION __PRETTY_FUNCTION__, errno));
381 if (pending != 0)
382 {
383 SOCKET_UNLOCK(so);
384 STAM_PROFILE_STOP(&pData->StatIOread, a);
385 return 0;
386 }
387 }
388
389 if ( nn < 0
390 && soIgnorableErrorCode(sockerr))
391 {
392 SOCKET_UNLOCK(so);
393 STAM_PROFILE_STOP(&pData->StatIOread, a);
394 return 0;
395 }
396 else
397 {
398 int fUninitializedTemplate = 0;
399 int shuterr;
400
401 fUninitializedTemplate = RT_BOOL(( sototcpcb(so)
402 && ( sototcpcb(so)->t_template.ti_src.s_addr == INADDR_ANY
403 || sototcpcb(so)->t_template.ti_dst.s_addr == INADDR_ANY)));
404 /* nn == 0 means peer has performed an orderly shutdown */
405 Log2(("%s: disconnected, nn = %d, errno = %d (%s)\n",
406 RT_GCC_EXTENSION __PRETTY_FUNCTION__, nn, sockerr, strerror(sockerr)));
407
408 shuterr = sofcantrcvmore(so);
409 if (!sockerr && !shuterr && !fUninitializedTemplate)
410 tcp_sockclosed(pData, sototcpcb(so));
411 else
412 {
413 LogRel2(("NAT: sockerr %d, shuterr %d - %R[natsock]\n", sockerr, shuterr, so));
414 tcp_drop(pData, sototcpcb(so), sockerr);
415 }
416 SOCKET_UNLOCK(so);
417 STAM_PROFILE_STOP(&pData->StatIOread, a);
418 return -1;
419 }
420 }
421 STAM_STATS(
422 if (n == 1)
423 {
424 STAM_COUNTER_INC(&pData->StatIORead_in_1);
425 STAM_COUNTER_ADD(&pData->StatIORead_in_1_bytes, nn);
426 }
427 else
428 {
429 STAM_COUNTER_INC(&pData->StatIORead_in_2);
430 STAM_COUNTER_ADD(&pData->StatIORead_in_2_1st_bytes, nn);
431 }
432 );
433
434#ifndef HAVE_READV
435 /*
436 * If there was no error, try and read the second time round
437 * We read again if n = 2 (ie, there's another part of the buffer)
438 * and we read as much as we could in the first read
439 * We don't test for <= 0 this time, because there legitimately
440 * might not be any more data (since the socket is non-blocking),
441 * a close will be detected on next iteration.
442 * A return of -1 wont (shouldn't) happen, since it didn't happen above
443 */
444 if (n == 2 && (unsigned)nn == iov[0].iov_len)
445 {
446 int ret;
447 ret = recv(so->s, iov[1].iov_base, iov[1].iov_len, 0);
448 if (ret > 0)
449 nn += ret;
450 STAM_STATS(
451 if (ret > 0)
452 {
453 STAM_COUNTER_INC(&pData->StatIORead_in_2);
454 STAM_COUNTER_ADD(&pData->StatIORead_in_2_2nd_bytes, ret);
455 }
456 );
457 }
458
459 Log2(("%s: read(2) nn = %d bytes\n", RT_GCC_EXTENSION __PRETTY_FUNCTION__, nn));
460#endif
461
462 /* Update fields */
463 sb->sb_cc += nn;
464 sb->sb_wptr += nn;
465 Log2(("%s: update so_snd (readed nn = %d) %R[sbuf]\n", RT_GCC_EXTENSION __PRETTY_FUNCTION__, nn, sb));
466 if (sb->sb_wptr >= (sb->sb_data + sb->sb_datalen))
467 {
468 sb->sb_wptr -= sb->sb_datalen;
469 Log2(("%s: alter sb_wptr so_snd = %R[sbuf]\n", RT_GCC_EXTENSION __PRETTY_FUNCTION__, sb));
470 }
471 STAM_PROFILE_STOP(&pData->StatIOread, a);
472 SOCKET_UNLOCK(so);
473 return nn;
474}
475
476/*
477 * Get urgent data
478 *
479 * When the socket is created, we set it SO_OOBINLINE,
480 * so when OOB data arrives, we soread() it and everything
481 * in the send buffer is sent as urgent data
482 */
483void
484sorecvoob(PNATState pData, struct socket *so)
485{
486 struct tcpcb *tp = sototcpcb(so);
487 ssize_t ret;
488
489 LogFlowFunc(("sorecvoob: so = %R[natsock]\n", so));
490
491 /*
492 * We take a guess at how much urgent data has arrived.
493 * In most situations, when urgent data arrives, the next
494 * read() should get all the urgent data. This guess will
495 * be wrong however if more data arrives just after the
496 * urgent data, or the read() doesn't return all the
497 * urgent data.
498 */
499 ret = soread(pData, so);
500 if (RT_LIKELY(ret > 0))
501 {
502 /*
503 * @todo for now just scrub the URG pointer. To faithfully
504 * proxy URG we need to read the srteam until SIOCATMARK, and
505 * then mark the first byte of the next read ar urgent.
506 */
507#if 0
508 tp->snd_up = tp->snd_una + SBUF_LEN(&so->so_snd);
509#endif
510 tp->t_force = 1;
511 tcp_output(pData, tp);
512 tp->t_force = 0;
513 }
514}
515
516/*
517 * Send urgent data
518 * There's a lot duplicated code here, but...
519 */
520int
521sosendoob(struct socket *so)
522{
523 struct sbuf *sb = &so->so_rcv;
524 char buff[2048]; /* XXX Shouldn't be sending more oob data than this */
525
526 int n, len;
527
528 LogFlowFunc(("sosendoob so = %R[natsock]\n", so));
529
530 if (so->so_urgc > sizeof(buff))
531 so->so_urgc = sizeof(buff); /* XXX */
532
533 if (sb->sb_rptr < sb->sb_wptr)
534 {
535 /* We can send it directly */
536 n = send(so->s, sb->sb_rptr, so->so_urgc, (MSG_OOB)); /* |MSG_DONTWAIT)); */
537 so->so_urgc -= n;
538
539 Log2((" --- sent %d bytes urgent data, %d urgent bytes left\n",
540 n, so->so_urgc));
541 }
542 else
543 {
544 /*
545 * Since there's no sendv or sendtov like writev,
546 * we must copy all data to a linear buffer then
547 * send it all
548 */
549 len = (sb->sb_data + sb->sb_datalen) - sb->sb_rptr;
550 if (len > so->so_urgc)
551 len = so->so_urgc;
552 memcpy(buff, sb->sb_rptr, len);
553 so->so_urgc -= len;
554 if (so->so_urgc)
555 {
556 n = sb->sb_wptr - sb->sb_data;
557 if (n > so->so_urgc)
558 n = so->so_urgc;
559 memcpy(buff + len, sb->sb_data, n);
560 so->so_urgc -= n;
561 len += n;
562 }
563 n = send(so->s, buff, len, (MSG_OOB)); /* |MSG_DONTWAIT)); */
564#ifdef DEBUG
565 if (n != len)
566 Log(("Didn't send all data urgently XXXXX\n"));
567#endif
568 Log2((" ---2 sent %d bytes urgent data, %d urgent bytes left\n",
569 n, so->so_urgc));
570 }
571
572 sb->sb_cc -= n;
573 sb->sb_rptr += n;
574 if (sb->sb_rptr >= (sb->sb_data + sb->sb_datalen))
575 sb->sb_rptr -= sb->sb_datalen;
576
577 return n;
578}
579
580/*
581 * Write data from so_rcv to so's socket,
582 * updating all sbuf field as necessary
583 */
584int
585sowrite(PNATState pData, struct socket *so)
586{
587 int n, nn;
588 struct sbuf *sb = &so->so_rcv;
589 u_int len = sb->sb_cc;
590 struct iovec iov[2];
591
592 STAM_PROFILE_START(&pData->StatIOwrite, a);
593 STAM_COUNTER_RESET(&pData->StatIOWrite_in_1);
594 STAM_COUNTER_RESET(&pData->StatIOWrite_in_1_bytes);
595 STAM_COUNTER_RESET(&pData->StatIOWrite_in_2);
596 STAM_COUNTER_RESET(&pData->StatIOWrite_in_2_1st_bytes);
597 STAM_COUNTER_RESET(&pData->StatIOWrite_in_2_2nd_bytes);
598 STAM_COUNTER_RESET(&pData->StatIOWrite_no_w);
599 STAM_COUNTER_RESET(&pData->StatIOWrite_rest);
600 STAM_COUNTER_RESET(&pData->StatIOWrite_rest_bytes);
601 LogFlowFunc(("so = %R[natsock]\n", so));
602 Log2(("%s: so = %R[natsock] so->so_rcv = %R[sbuf]\n", RT_GCC_EXTENSION __PRETTY_FUNCTION__, so, sb));
603 QSOCKET_LOCK(tcb);
604 SOCKET_LOCK(so);
605 QSOCKET_UNLOCK(tcb);
606 if (so->so_urgc)
607 {
608 sosendoob(so);
609 if (sb->sb_cc == 0)
610 {
611 SOCKET_UNLOCK(so);
612 STAM_PROFILE_STOP(&pData->StatIOwrite, a);
613 return 0;
614 }
615 }
616
617 /*
618 * No need to check if there's something to write,
619 * sowrite wouldn't have been called otherwise
620 */
621
622 len = sb->sb_cc;
623
624 iov[0].iov_base = sb->sb_rptr;
625 iov[1].iov_base = 0;
626 iov[1].iov_len = 0;
627 if (sb->sb_rptr < sb->sb_wptr)
628 {
629 iov[0].iov_len = sb->sb_wptr - sb->sb_rptr;
630 /* Should never succeed, but... */
631 if (iov[0].iov_len > len)
632 iov[0].iov_len = len;
633 n = 1;
634 }
635 else
636 {
637 iov[0].iov_len = (sb->sb_data + sb->sb_datalen) - sb->sb_rptr;
638 if (iov[0].iov_len > len)
639 iov[0].iov_len = len;
640 len -= iov[0].iov_len;
641 if (len)
642 {
643 iov[1].iov_base = sb->sb_data;
644 iov[1].iov_len = sb->sb_wptr - sb->sb_data;
645 if (iov[1].iov_len > len)
646 iov[1].iov_len = len;
647 n = 2;
648 }
649 else
650 n = 1;
651 }
652 STAM_STATS({
653 if (n == 1)
654 {
655 STAM_COUNTER_INC(&pData->StatIOWrite_in_1);
656 STAM_COUNTER_ADD(&pData->StatIOWrite_in_1_bytes, iov[0].iov_len);
657 }
658 else
659 {
660 STAM_COUNTER_INC(&pData->StatIOWrite_in_2);
661 STAM_COUNTER_ADD(&pData->StatIOWrite_in_2_1st_bytes, iov[0].iov_len);
662 STAM_COUNTER_ADD(&pData->StatIOWrite_in_2_2nd_bytes, iov[1].iov_len);
663 }
664 });
665 /* Check if there's urgent data to send, and if so, send it */
666#ifdef HAVE_READV
667 nn = writev(so->s, (const struct iovec *)iov, n);
668#else
669 nn = send(so->s, iov[0].iov_base, iov[0].iov_len, 0);
670#endif
671 Log2(("%s: wrote(1) nn = %d bytes\n", RT_GCC_EXTENSION __PRETTY_FUNCTION__, nn));
672 /* This should never happen, but people tell me it does *shrug* */
673 if ( nn < 0
674 && soIgnorableErrorCode(errno))
675 {
676 SOCKET_UNLOCK(so);
677 STAM_PROFILE_STOP(&pData->StatIOwrite, a);
678 return 0;
679 }
680
681 if (nn < 0 || (nn == 0 && iov[0].iov_len > 0))
682 {
683 Log2(("%s: disconnected, so->so_state = %x, errno = %d\n",
684 RT_GCC_EXTENSION __PRETTY_FUNCTION__, so->so_state, errno));
685 sofcantsendmore(so);
686 tcp_sockclosed(pData, sototcpcb(so));
687 SOCKET_UNLOCK(so);
688 STAM_PROFILE_STOP(&pData->StatIOwrite, a);
689 return -1;
690 }
691
692#ifndef HAVE_READV
693 if (n == 2 && (unsigned)nn == iov[0].iov_len)
694 {
695 int ret;
696 ret = send(so->s, iov[1].iov_base, iov[1].iov_len, 0);
697 if (ret > 0)
698 nn += ret;
699# ifdef VBOX_WITH_STATISTICS
700 if (ret > 0 && ret != (ssize_t)iov[1].iov_len)
701 {
702 STAM_COUNTER_INC(&pData->StatIOWrite_rest);
703 STAM_COUNTER_ADD(&pData->StatIOWrite_rest_bytes, (iov[1].iov_len - ret));
704 }
705#endif
706 }
707 Log2(("%s: wrote(2) nn = %d bytes\n", RT_GCC_EXTENSION __PRETTY_FUNCTION__, nn));
708#endif
709
710 /* Update sbuf */
711 sb->sb_cc -= nn;
712 sb->sb_rptr += nn;
713 Log2(("%s: update so_rcv (written nn = %d) %R[sbuf]\n", RT_GCC_EXTENSION __PRETTY_FUNCTION__, nn, sb));
714 if (sb->sb_rptr >= (sb->sb_data + sb->sb_datalen))
715 {
716 sb->sb_rptr -= sb->sb_datalen;
717 Log2(("%s: alter sb_rptr of so_rcv %R[sbuf]\n", RT_GCC_EXTENSION __PRETTY_FUNCTION__, sb));
718 }
719
720 /*
721 * If in DRAIN mode, and there's no more data, set
722 * it CANTSENDMORE
723 */
724 if ((so->so_state & SS_FWDRAIN) && sb->sb_cc == 0)
725 sofcantsendmore(so);
726
727 SOCKET_UNLOCK(so);
728 STAM_PROFILE_STOP(&pData->StatIOwrite, a);
729 return nn;
730}
731
732/*
733 * recvfrom() a UDP socket
734 */
735void
736sorecvfrom(PNATState pData, struct socket *so)
737{
738 LogFlowFunc(("sorecvfrom: so = %p\n", so));
739
740#ifdef RT_OS_WINDOWS
741 /* ping is handled with ICMP API in ip_icmpwin.c */
742 Assert(so->so_type == IPPROTO_UDP);
743#else
744 if (so->so_type == IPPROTO_ICMP)
745 {
746 /* This is a "ping" reply */
747 sorecvfrom_icmp_unix(pData, so);
748 udp_detach(pData, so);
749 }
750 else
751#endif /* !RT_OS_WINDOWS */
752 {
753 static char achBuf[64 * 1024];
754
755 /* A "normal" UDP packet */
756 struct sockaddr_in addr;
757 socklen_t addrlen = sizeof(struct sockaddr_in);
758 struct iovec iov[2];
759 ssize_t nread;
760 struct mbuf *m;
761
762 QSOCKET_LOCK(udb);
763 SOCKET_LOCK(so);
764 QSOCKET_UNLOCK(udb);
765
766 m = m_getjcl(pData, M_NOWAIT, MT_HEADER, M_PKTHDR, slirp_size(pData));
767 if (m == NULL)
768 {
769 SOCKET_UNLOCK(so);
770 return;
771 }
772
773 m->m_data += ETH_HLEN;
774 m->m_pkthdr.header = mtod(m, void *);
775
776 m->m_data += sizeof(struct udpiphdr);
777
778 /* small packets will fit without copying */
779 iov[0].iov_base = mtod(m, char *);
780 iov[0].iov_len = M_TRAILINGSPACE(m);
781
782 /* large packets will spill into a temp buffer */
783 iov[1].iov_base = achBuf;
784 iov[1].iov_len = sizeof(achBuf);
785
786#if !defined(RT_OS_WINDOWS)
787 {
788 struct msghdr mh;
789 memset(&mh, 0, sizeof(mh));
790
791 mh.msg_iov = iov;
792 mh.msg_iovlen = 2;
793 mh.msg_name = &addr;
794 mh.msg_namelen = addrlen;
795
796 nread = recvmsg(so->s, &mh, 0);
797 }
798#else /* RT_OS_WINDOWS */
799 {
800 DWORD nbytes; /* NB: can't use nread b/c of different size */
801 DWORD flags = 0;
802 int status;
803 AssertCompile(sizeof(WSABUF) == sizeof(struct iovec));
804 AssertCompileMembersSameSizeAndOffset(WSABUF, len, struct iovec, iov_len);
805 AssertCompileMembersSameSizeAndOffset(WSABUF, buf, struct iovec, iov_base);
806 status = WSARecvFrom(so->s, (WSABUF *)&iov[0], 2, &nbytes, &flags,
807 (struct sockaddr *)&addr, &addrlen,
808 NULL, NULL);
809 if (status != SOCKET_ERROR)
810 nread = nbytes;
811 else
812 nread = -1;
813 }
814#endif
815 if (nread >= 0)
816 {
817 if (nread <= iov[0].iov_len)
818 m->m_len = nread;
819 else
820 {
821 m->m_len = iov[0].iov_len;
822 m_append(pData, m, nread - iov[0].iov_len, iov[1].iov_base);
823 }
824 Assert(m_length(m, NULL) == (size_t)nread);
825
826 /*
827 * Hack: domain name lookup will be used the most for UDP,
828 * and since they'll only be used once there's no need
829 * for the 4 minute (or whatever) timeout... So we time them
830 * out much quicker (10 seconds for now...)
831 */
832 if (so->so_expire)
833 {
834 if (so->so_fport != RT_H2N_U16_C(53))
835 so->so_expire = curtime + SO_EXPIRE;
836 }
837
838 /*
839 * DNS proxy requests are forwarded to the real resolver,
840 * but its socket's so_faddr is that of the DNS proxy
841 * itself.
842 *
843 * last argument should be changed if Slirp will inject IP attributes
844 */
845 if ( pData->fUseDnsProxy
846 && so->so_fport == RT_H2N_U16_C(53)
847 && CTL_CHECK(so->so_faddr.s_addr, CTL_DNS))
848 dnsproxy_answer(pData, so, m);
849
850 /* packets definetly will be fragmented, could confuse receiver peer. */
851 if (nread > if_mtu)
852 m->m_flags |= M_SKIP_FIREWALL;
853
854 /*
855 * If this packet was destined for CTL_ADDR,
856 * make it look like that's where it came from, done by udp_output
857 */
858 udp_output(pData, so, m, &addr);
859 }
860 else
861 {
862 m_freem(pData, m);
863
864 if (!soIgnorableErrorCode(errno))
865 {
866 u_char code;
867 if (errno == EHOSTUNREACH)
868 code = ICMP_UNREACH_HOST;
869 else if (errno == ENETUNREACH)
870 code = ICMP_UNREACH_NET;
871 else
872 code = ICMP_UNREACH_PORT;
873
874 Log2((" rx error, tx icmp ICMP_UNREACH:%i\n", code));
875 icmp_error(pData, so->so_m, ICMP_UNREACH, code, 0, strerror(errno));
876 so->so_m = NULL;
877 }
878 }
879
880 SOCKET_UNLOCK(so);
881 }
882}
883
884/*
885 * sendto() a socket
886 */
887int
888sosendto(PNATState pData, struct socket *so, struct mbuf *m)
889{
890 int ret;
891 struct sockaddr_in *paddr;
892 struct sockaddr addr;
893#if 0
894 struct sockaddr_in host_addr;
895#endif
896 caddr_t buf = 0;
897 int mlen;
898
899 LogFlowFunc(("sosendto: so = %R[natsock], m = %p\n", so, m));
900
901 memset(&addr, 0, sizeof(struct sockaddr));
902#ifdef RT_OS_DARWIN
903 addr.sa_len = sizeof(struct sockaddr_in);
904#endif
905 paddr = (struct sockaddr_in *)&addr;
906 paddr->sin_family = AF_INET;
907 if ((so->so_faddr.s_addr & RT_H2N_U32(pData->netmask)) == pData->special_addr.s_addr)
908 {
909 /* It's an alias */
910 uint32_t last_byte = RT_N2H_U32(so->so_faddr.s_addr) & ~pData->netmask;
911 switch(last_byte)
912 {
913#if 0
914 /* handle this case at 'default:' */
915 case CTL_BROADCAST:
916 addr.sin_addr.s_addr = INADDR_BROADCAST;
917 /* Send the packet to host to fully emulate broadcast */
918 /** @todo r=klaus: on Linux host this causes the host to receive
919 * the packet twice for some reason. And I cannot find any place
920 * in the man pages which states that sending a broadcast does not
921 * reach the host itself. */
922 host_addr.sin_family = AF_INET;
923 host_addr.sin_port = so->so_fport;
924 host_addr.sin_addr = our_addr;
925 sendto(so->s, m->m_data, m->m_len, 0,
926 (struct sockaddr *)&host_addr, sizeof (struct sockaddr));
927 break;
928#endif
929 case CTL_DNS:
930 case CTL_ALIAS:
931 default:
932 if (last_byte == ~pData->netmask)
933 paddr->sin_addr.s_addr = INADDR_BROADCAST;
934 else
935 paddr->sin_addr = loopback_addr;
936 break;
937 }
938 }
939 else
940 paddr->sin_addr = so->so_faddr;
941 paddr->sin_port = so->so_fport;
942
943 Log2((" sendto()ing, addr.sin_port=%d, addr.sin_addr.s_addr=%.16s\n",
944 RT_N2H_U16(paddr->sin_port), inet_ntoa(paddr->sin_addr)));
945
946 /* Don't care what port we get */
947 /*
948 * > nmap -sV -T4 -O -A -v -PU3483 255.255.255.255
949 * generates bodyless messages, annoying memmory management system.
950 */
951 mlen = m_length(m, NULL);
952 if (mlen > 0)
953 {
954 buf = RTMemAlloc(mlen);
955 if (buf == NULL)
956 {
957 return -1;
958 }
959 m_copydata(m, 0, mlen, buf);
960 }
961 ret = sendto(so->s, buf, mlen, 0,
962 (struct sockaddr *)&addr, sizeof (struct sockaddr));
963#ifdef VBOX_WITH_NAT_SEND2HOME
964 if (slirpIsWideCasting(pData, so->so_faddr.s_addr))
965 {
966 slirpSend2Home(pData, so, buf, mlen, 0);
967 }
968#endif
969 if (buf)
970 RTMemFree(buf);
971 if (ret < 0)
972 {
973 Log2(("UDP: sendto fails (%s)\n", strerror(errno)));
974 return -1;
975 }
976
977 /*
978 * Kill the socket if there's no reply in 4 minutes,
979 * but only if it's an expirable socket
980 */
981 if (so->so_expire)
982 so->so_expire = curtime + SO_EXPIRE;
983 so->so_state = SS_ISFCONNECTED; /* So that it gets select()ed */
984 return 0;
985}
986
987/*
988 * XXX This should really be tcp_listen
989 */
990struct socket *
991solisten(PNATState pData, u_int32_t bind_addr, u_int port, u_int32_t laddr, u_int lport, int flags)
992{
993 struct sockaddr_in addr;
994 struct socket *so;
995 socklen_t addrlen = sizeof(addr);
996 int s, opt = 1;
997 int status;
998
999 LogFlowFunc(("solisten: port = %d, laddr = %x, lport = %d, flags = %x\n", port, laddr, lport, flags));
1000
1001 if ((so = socreate()) == NULL)
1002 {
1003 /* RTMemFree(so); Not sofree() ??? free(NULL) == NOP */
1004 return NULL;
1005 }
1006
1007 /* Don't tcp_attach... we don't need so_snd nor so_rcv */
1008 if ((so->so_tcpcb = tcp_newtcpcb(pData, so)) == NULL)
1009 {
1010 RTMemFree(so);
1011 return NULL;
1012 }
1013
1014 SOCKET_LOCK_CREATE(so);
1015 SOCKET_LOCK(so);
1016 QSOCKET_LOCK(tcb);
1017 insque(pData, so,&tcb);
1018 NSOCK_INC();
1019 QSOCKET_UNLOCK(tcb);
1020
1021 /*
1022 * SS_FACCEPTONCE sockets must time out.
1023 */
1024 if (flags & SS_FACCEPTONCE)
1025 so->so_tcpcb->t_timer[TCPT_KEEP] = TCPTV_KEEP_INIT*2;
1026
1027 so->so_state = (SS_FACCEPTCONN|flags);
1028 so->so_lport = lport; /* Kept in network format */
1029 so->so_laddr.s_addr = laddr; /* Ditto */
1030
1031 memset(&addr, 0, sizeof(addr));
1032#ifdef RT_OS_DARWIN
1033 addr.sin_len = sizeof(addr);
1034#endif
1035 addr.sin_family = AF_INET;
1036 addr.sin_addr.s_addr = bind_addr;
1037 addr.sin_port = port;
1038
1039 /**
1040 * changing listen(,1->SOMAXCONN) shouldn't be harmful for NAT's TCP/IP stack,
1041 * kernel will choose the optimal value for requests queue length.
1042 * @note: MSDN recommends low (2-4) values for bluetooth networking devices.
1043 */
1044 if ( ((s = socket(AF_INET, SOCK_STREAM, 0)) < 0)
1045 || (setsockopt(s, SOL_SOCKET, SO_REUSEADDR,(char *)&opt, sizeof(int)) < 0)
1046 || (bind(s,(struct sockaddr *)&addr, sizeof(addr)) < 0)
1047 || (listen(s, pData->soMaxConn) < 0))
1048 {
1049#ifdef RT_OS_WINDOWS
1050 int tmperrno = WSAGetLastError(); /* Don't clobber the real reason we failed */
1051 closesocket(s);
1052 QSOCKET_LOCK(tcb);
1053 sofree(pData, so);
1054 QSOCKET_UNLOCK(tcb);
1055 /* Restore the real errno */
1056 WSASetLastError(tmperrno);
1057#else
1058 int tmperrno = errno; /* Don't clobber the real reason we failed */
1059 close(s);
1060 if (sototcpcb(so))
1061 tcp_close(pData, sototcpcb(so));
1062 else
1063 sofree(pData, so);
1064 /* Restore the real errno */
1065 errno = tmperrno;
1066#endif
1067 return NULL;
1068 }
1069 fd_nonblock(s);
1070 setsockopt(s, SOL_SOCKET, SO_OOBINLINE,(char *)&opt, sizeof(int));
1071
1072 getsockname(s,(struct sockaddr *)&addr,&addrlen);
1073 so->so_fport = addr.sin_port;
1074 /* set socket buffers */
1075 opt = pData->socket_rcv;
1076 status = setsockopt(s, SOL_SOCKET, SO_RCVBUF, (char *)&opt, sizeof(int));
1077 if (status < 0)
1078 {
1079 LogRel(("NAT: Error(%d) while setting RCV capacity to (%d)\n", errno, opt));
1080 goto no_sockopt;
1081 }
1082 opt = pData->socket_snd;
1083 status = setsockopt(s, SOL_SOCKET, SO_SNDBUF, (char *)&opt, sizeof(int));
1084 if (status < 0)
1085 {
1086 LogRel(("NAT: Error(%d) while setting SND capacity to (%d)\n", errno, opt));
1087 goto no_sockopt;
1088 }
1089no_sockopt:
1090 if (addr.sin_addr.s_addr == 0 || addr.sin_addr.s_addr == loopback_addr.s_addr)
1091 so->so_faddr = alias_addr;
1092 else
1093 so->so_faddr = addr.sin_addr;
1094
1095 so->s = s;
1096 SOCKET_UNLOCK(so);
1097 return so;
1098}
1099
/**
 * Notification that data is available in so_rcv.
 *
 * Currently a no-op.  The idea of writing the data to the socket right away
 * ("sowrite(so); FD_CLR(so->s,&writefds);") is not implemented.
 *
 * @todo do we really need this function, what it's intended to do?
 *
 * @param   so  Socket the notification is for (unused).
 */
void
sorwakeup(struct socket *so)
{
    NOREF(so);
}
1115
/**
 * Notification that data has been freed in so_snd, i.e. that there is room
 * for another read.
 *
 * Currently a no-op: no read is issued from here, reading is left to the
 * main loop.
 *
 * @param   so  Socket the notification is for (unused).
 */
void
sowwakeup(struct socket *so)
{
    NOREF(so);
}
1126
1127/*
1128 * Various session state calls
1129 * XXX Should be #define's
1130 * The socket state stuff needs work, these often get call 2 or 3
1131 * times each when only 1 was needed
1132 */
1133void
1134soisfconnecting(struct socket *so)
1135{
1136 so->so_state &= ~(SS_NOFDREF|SS_ISFCONNECTED|SS_FCANTRCVMORE|
1137 SS_FCANTSENDMORE|SS_FWDRAIN);
1138 so->so_state |= SS_ISFCONNECTING; /* Clobber other states */
1139}
1140
1141void
1142soisfconnected(struct socket *so)
1143{
1144 LogFlowFunc(("ENTER: so:%R[natsock]\n", so));
1145 so->so_state &= ~(SS_ISFCONNECTING|SS_FWDRAIN|SS_NOFDREF);
1146 so->so_state |= SS_ISFCONNECTED; /* Clobber other states */
1147 LogFlowFunc(("LEAVE: so:%R[natsock]\n", so));
1148}
1149
1150int
1151sofcantrcvmore(struct socket *so)
1152{
1153 int err = 0;
1154
1155 LogFlowFunc(("ENTER: so:%R[natsock]\n", so));
1156 if ((so->so_state & SS_NOFDREF) == 0)
1157 {
1158 /*
1159 * If remote closes first and then sends an RST, the recv() in
1160 * soread() will keep reporting EOF without any error
1161 * indication. As far as I can tell the only way to detect
1162 * this on Linux is to check if shutdown() succeeds here (but
1163 * see below).
1164 *
1165 * OTOH on OS X shutdown() "helpfully" checks if remote has
1166 * already closed and then always returns ENOTCONN
1167 * immediately.
1168 */
1169 int status = shutdown(so->s, SHUT_RD);
1170#if defined(RT_OS_LINUX)
1171 if (status < 0)
1172 err = errno;
1173#else
1174 RT_NOREF(status);
1175#endif
1176 }
1177 so->so_state &= ~(SS_ISFCONNECTING);
1178 if (so->so_state & SS_FCANTSENDMORE)
1179 {
1180#if defined(RT_OS_LINUX)
1181 /*
1182 * If we have closed first, and remote closes, shutdown will
1183 * return ENOTCONN, but this is expected. Don't tell the
1184 * caller there was an error.
1185 */
1186 if (err == ENOTCONN)
1187 err = 0;
1188#endif
1189 so->so_state = SS_NOFDREF; /* Don't select it */
1190 /* XXX close() here as well? */
1191 }
1192 else
1193 so->so_state |= SS_FCANTRCVMORE;
1194
1195 LogFlowFunc(("LEAVE: %d\n", err));
1196 return err;
1197}
1198
1199void
1200sofcantsendmore(struct socket *so)
1201{
1202 LogFlowFunc(("ENTER: so:%R[natsock]\n", so));
1203 if ((so->so_state & SS_NOFDREF) == 0)
1204 shutdown(so->s, 1); /* send FIN to fhost */
1205
1206 so->so_state &= ~(SS_ISFCONNECTING);
1207 if (so->so_state & SS_FCANTRCVMORE)
1208 so->so_state = SS_NOFDREF; /* as above */
1209 else
1210 so->so_state |= SS_FCANTSENDMORE;
1211 LogFlowFuncLeave();
1212}
1213
/*
 * Notification hook for a disconnected foreign host.
 *
 * Currently a no-op; the disabled code below would close the socket and set
 * SS_ISFDISCONNECTED.
 */
void
soisfdisconnected(struct socket *so)
{
    RT_NOREF(so);
#if 0
    so->so_state &= ~(SS_ISFCONNECTING|SS_ISFCONNECTED);
    close(so->s);
    so->so_state = SS_ISFDISCONNECTED;
    /*
     * XXX Do nothing ... ?
     */
#endif
}
1227
1228/*
1229 * Set write drain mode
1230 * Set CANTSENDMORE once all data has been write()n
1231 */
1232void
1233sofwdrain(struct socket *so)
1234{
1235 if (SBUF_LEN(&so->so_rcv))
1236 so->so_state |= SS_FWDRAIN;
1237 else
1238 sofcantsendmore(so);
1239}
1240
1241#if !defined(RT_OS_WINDOWS)
1242static void
1243send_icmp_to_guest(PNATState pData, char *buff, size_t len, const struct sockaddr_in *addr)
1244{
1245 struct ip *ip;
1246 uint32_t dst, src;
1247 char ip_copy[256];
1248 struct icmp *icp;
1249 int old_ip_len = 0;
1250 int hlen, original_hlen = 0;
1251 struct mbuf *m;
1252 struct icmp_msg *icm;
1253 uint8_t proto;
1254 int type = 0;
1255
1256 ip = (struct ip *)buff;
1257 /* Fix ip->ip_len to contain the total packet length including the header
1258 * in _host_ byte order for all OSes. On Darwin, that value already is in
1259 * host byte order. Solaris and Darwin report only the payload. */
1260#ifndef RT_OS_DARWIN
1261 ip->ip_len = RT_N2H_U16(ip->ip_len);
1262#endif
1263 hlen = (ip->ip_hl << 2);
1264#if defined(RT_OS_SOLARIS) || defined(RT_OS_DARWIN)
1265 ip->ip_len += hlen;
1266#endif
1267 if (ip->ip_len < hlen + ICMP_MINLEN)
1268 {
1269 Log(("send_icmp_to_guest: ICMP header is too small to understand which type/subtype of the datagram\n"));
1270 return;
1271 }
1272 icp = (struct icmp *)((char *)ip + hlen);
1273
1274 Log(("ICMP:received msg(t:%d, c:%d)\n", icp->icmp_type, icp->icmp_code));
1275 if ( icp->icmp_type != ICMP_ECHOREPLY
1276 && icp->icmp_type != ICMP_TIMXCEED
1277 && icp->icmp_type != ICMP_UNREACH)
1278 {
1279 return;
1280 }
1281
1282 /*
1283 * ICMP_ECHOREPLY, ICMP_TIMXCEED, ICMP_UNREACH minimal header size is
1284 * ICMP_ECHOREPLY assuming data 0
1285 * icmp_{type(8), code(8), cksum(16),identifier(16),seqnum(16)}
1286 */
1287 if (ip->ip_len < hlen + 8)
1288 {
1289 Log(("send_icmp_to_guest: NAT accept ICMP_{ECHOREPLY, TIMXCEED, UNREACH} the minimum size is 64 (see rfc792)\n"));
1290 return;
1291 }
1292
1293 type = icp->icmp_type;
1294 if ( type == ICMP_TIMXCEED
1295 || type == ICMP_UNREACH)
1296 {
1297 /*
1298 * ICMP_TIMXCEED, ICMP_UNREACH minimal header size is
1299 * icmp_{type(8), code(8), cksum(16),unused(32)} + IP header + 64 bit of original datagram
1300 */
1301 if (ip->ip_len < hlen + 2*8 + sizeof(struct ip))
1302 {
1303 Log(("send_icmp_to_guest: NAT accept ICMP_{TIMXCEED, UNREACH} the minimum size of ipheader + 64 bit of data (see rfc792)\n"));
1304 return;
1305 }
1306 ip = &icp->icmp_ip;
1307 }
1308
1309 icm = icmp_find_original_mbuf(pData, ip);
1310 if (icm == NULL)
1311 {
1312 Log(("NAT: Can't find the corresponding packet for the received ICMP\n"));
1313 return;
1314 }
1315
1316 m = icm->im_m;
1317 if (!m)
1318 {
1319 LogFunc(("%R[natsock] hasn't stored it's mbuf on sent\n", icm->im_so));
1320 goto done;
1321 }
1322
1323 src = addr->sin_addr.s_addr;
1324 if (type == ICMP_ECHOREPLY)
1325 {
1326 struct ip *ip0 = mtod(m, struct ip *);
1327 struct icmp *icp0 = (struct icmp *)((char *)ip0 + (ip0->ip_hl << 2));
1328 if (icp0->icmp_type != ICMP_ECHO)
1329 {
1330 Log(("NAT: we haven't found echo for this reply\n"));
1331 goto done;
1332 }
1333 /*
1334 * while combining buffer to send (see ip_icmp.c) we control ICMP header only,
1335 * IP header combined by OS network stack, our local copy of IP header contians values
1336 * in host byte order so no byte order conversion is required. IP headers fields are converting
1337 * in ip_output0 routine only.
1338 */
1339 if ( (ip->ip_len - hlen)
1340 != (ip0->ip_len - (ip0->ip_hl << 2)))
1341 {
1342 Log(("NAT: ECHO(%d) lenght doesn't match ECHOREPLY(%d)\n",
1343 (ip->ip_len - hlen), (ip0->ip_len - (ip0->ip_hl << 2))));
1344 goto done;
1345 }
1346 }
1347
1348 /* ip points on origianal ip header */
1349 ip = mtod(m, struct ip *);
1350 proto = ip->ip_p;
1351 /* Now ip is pointing on header we've sent from guest */
1352 if ( icp->icmp_type == ICMP_TIMXCEED
1353 || icp->icmp_type == ICMP_UNREACH)
1354 {
1355 old_ip_len = (ip->ip_hl << 2) + 64;
1356 if (old_ip_len > sizeof(ip_copy))
1357 old_ip_len = sizeof(ip_copy);
1358 memcpy(ip_copy, ip, old_ip_len);
1359 }
1360
1361 /* source address from original IP packet*/
1362 dst = ip->ip_src.s_addr;
1363
1364 /* overide ther tail of old packet */
1365 ip = mtod(m, struct ip *); /* ip is from mbuf we've overrided */
1366 original_hlen = ip->ip_hl << 2;
1367 /* saves original ip header and options */
1368 m_copyback(pData, m, original_hlen, len - hlen, buff + hlen);
1369 ip->ip_len = m_length(m, NULL);
1370 ip->ip_p = IPPROTO_ICMP; /* the original package could be whatever, but we're response via ICMP*/
1371
1372 icp = (struct icmp *)((char *)ip + (ip->ip_hl << 2));
1373 type = icp->icmp_type;
1374 if ( type == ICMP_TIMXCEED
1375 || type == ICMP_UNREACH)
1376 {
1377 /* according RFC 793 error messages required copy of initial IP header + 64 bit */
1378 memcpy(&icp->icmp_ip, ip_copy, old_ip_len);
1379
1380 /* undo byte order conversions done in ip_input() */
1381 HTONS(icp->icmp_ip.ip_len);
1382 HTONS(icp->icmp_ip.ip_id);
1383 HTONS(icp->icmp_ip.ip_off);
1384
1385 ip->ip_tos = ((ip->ip_tos & 0x1E) | 0xC0); /* high priority for errors */
1386 }
1387
1388 ip->ip_src.s_addr = src;
1389 ip->ip_dst.s_addr = dst;
1390 icmp_reflect(pData, m);
1391 /* m was freed */
1392 icm->im_m = NULL;
1393
1394 done:
1395 icmp_msg_delete(pData, icm);
1396}
1397
1398static void sorecvfrom_icmp_unix(PNATState pData, struct socket *so)
1399{
1400 struct sockaddr_in addr;
1401 socklen_t addrlen = sizeof(struct sockaddr_in);
1402 struct ip ip;
1403 char *buff;
1404 int len = 0;
1405
1406 /* 1- step: read the ip header */
1407 len = recvfrom(so->s, &ip, sizeof(struct ip), MSG_PEEK,
1408 (struct sockaddr *)&addr, &addrlen);
1409 if ( len < 0
1410 && ( soIgnorableErrorCode(errno)
1411 || errno == ENOTCONN))
1412 {
1413 Log(("sorecvfrom_icmp_unix: 1 - step can't read IP datagramm (would block)\n"));
1414 return;
1415 }
1416
1417 if ( len < sizeof(struct ip)
1418 || len < 0
1419 || len == 0)
1420 {
1421 u_char code;
1422 code = ICMP_UNREACH_PORT;
1423
1424 if (errno == EHOSTUNREACH)
1425 code = ICMP_UNREACH_HOST;
1426 else if (errno == ENETUNREACH)
1427 code = ICMP_UNREACH_NET;
1428
1429 LogRel(("NAT: UDP ICMP rx errno=%d (%s)\n", errno, strerror(errno)));
1430 icmp_error(pData, so->so_m, ICMP_UNREACH, code, 0, strerror(errno));
1431 so->so_m = NULL;
1432 Log(("sorecvfrom_icmp_unix: 1 - step can't read IP datagramm\n"));
1433 return;
1434 }
1435 /* basic check of IP header */
1436 if ( ip.ip_v != IPVERSION
1437# ifndef RT_OS_DARWIN
1438 || ip.ip_p != IPPROTO_ICMP
1439# endif
1440 )
1441 {
1442 Log(("sorecvfrom_icmp_unix: 1 - step IP isn't IPv4\n"));
1443 return;
1444 }
1445# ifndef RT_OS_DARWIN
1446 /* Darwin reports the IP length already in host byte order. */
1447 ip.ip_len = RT_N2H_U16(ip.ip_len);
1448# endif
1449# if defined(RT_OS_SOLARIS) || defined(RT_OS_DARWIN)
1450 /* Solaris and Darwin report the payload only */
1451 ip.ip_len += (ip.ip_hl << 2);
1452# endif
1453 /* Note: ip->ip_len in host byte order (all OS) */
1454 len = ip.ip_len;
1455 buff = RTMemAlloc(len);
1456 if (buff == NULL)
1457 {
1458 Log(("sorecvfrom_icmp_unix: 1 - step can't allocate enought room for datagram\n"));
1459 return;
1460 }
1461 /* 2 - step: we're reading rest of the datagramm to the buffer */
1462 addrlen = sizeof(struct sockaddr_in);
1463 memset(&addr, 0, addrlen);
1464 len = recvfrom(so->s, buff, len, 0,
1465 (struct sockaddr *)&addr, &addrlen);
1466 if ( len < 0
1467 && ( soIgnorableErrorCode(errno)
1468 || errno == ENOTCONN))
1469 {
1470 Log(("sorecvfrom_icmp_unix: 2 - step can't read IP body (would block expected:%d)\n",
1471 ip.ip_len));
1472 RTMemFree(buff);
1473 return;
1474 }
1475 if ( len < 0
1476 || len == 0)
1477 {
1478 Log(("sorecvfrom_icmp_unix: 2 - step read of the rest of datagramm is fallen (errno:%d, len:%d expected: %d)\n",
1479 errno, len, (ip.ip_len - sizeof(struct ip))));
1480 RTMemFree(buff);
1481 return;
1482 }
1483 /* len is modified in 2nd read, when the rest of the datagramm was read */
1484 send_icmp_to_guest(pData, buff, len, &addr);
1485 RTMemFree(buff);
1486}
1487#endif /* !RT_OS_WINDOWS */
Note: See TracBrowser for help on using the repository browser.

© 2024 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette