ctdb-common: Stop a pcap-related crash on error
[samba.git] / ctdb / common / system_socket.c
1 /*
2    ctdb system specific code to manage raw sockets on linux
3
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6    Copyright (C) Marc Dequènes (Duck) 2009
7    Copyright (C) Volker Lendecke 2012
8
9    This program is free software; you can redistribute it and/or modify
10    it under the terms of the GNU General Public License as published by
11    the Free Software Foundation; either version 3 of the License, or
12    (at your option) any later version.
13
14    This program is distributed in the hope that it will be useful,
15    but WITHOUT ANY WARRANTY; without even the implied warranty of
16    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17    GNU General Public License for more details.
18
19    You should have received a copy of the GNU General Public License
20    along with this program; if not, see <http://www.gnu.org/licenses/>.
21 */
22
23 #include "replace.h"
24
25 /*
26  * Use BSD struct tcphdr field names for portability.  Modern glibc
27  * makes them available by default via <netinet/tcp.h> but older glibc
28  * requires __FAVOR_BSD to be defined.
29  *
30  * __FAVOR_BSD is normally defined in <features.h> if _DEFAULT_SOURCE
31  * (new) or _BSD_SOURCE (now deprecated) is set and _GNU_SOURCE is not
32  * set.  Including "replace.h" above causes <features.h> to be
33  * indirectly included and this will not set __FAVOR_BSD because
34  * _GNU_SOURCE is set in Samba's "config.h" (which is included by
35  * "replace.h").
36  *
37  * Therefore, set __FAVOR_BSD by hand below.
38  */
39 #define __FAVOR_BSD 1
40 #include "system/network.h"
41
42 #ifdef HAVE_NETINET_IF_ETHER_H
43 #include <netinet/if_ether.h>
44 #endif
45 #ifdef HAVE_NETINET_IP6_H
46 #include <netinet/ip6.h>
47 #endif
48 #ifdef HAVE_NETINET_ICMP6_H
49 #include <netinet/icmp6.h>
50 #endif
51 #ifdef HAVE_LINUX_IF_PACKET_H
52 #include <linux/if_packet.h>
53 #endif
54
55 #ifndef ETHERTYPE_IP6
56 #define ETHERTYPE_IP6 0x86dd
57 #endif
58
59 #include "lib/util/debug.h"
60 #include "lib/util/blocking.h"
61
62 #include "protocol/protocol.h"
63
64 #include "common/logging.h"
65 #include "common/system_socket.h"
66
/*
 * Sum n bytes of data as a sequence of big-endian (network order)
 * 16-bit words and return the un-folded 32-bit total.
 *
 * If n is odd, the final byte is treated as the high-order octet of a
 * last word whose low-order octet is zero.  This is computed with
 * explicit shifts so the result does not depend on host endianness.
 *
 * The caller is responsible for folding the carries and taking the
 * one's complement.
 */
static uint32_t uint16_checksum(const uint8_t *data, size_t n)
{
        uint32_t sum = 0;

        while (n >= 2) {
                sum += ((uint32_t)data[0] << 8) | (uint32_t)data[1];
                data += 2;
                n -= 2;
        }
        if (n == 1) {
                /* Pad the odd trailing byte with a zero low-order octet */
                sum += (uint32_t)data[0] << 8;
        }
        return sum;
}
86
87 /*
88  * See if the given IP is currently on an interface
89  */
90 bool ctdb_sys_have_ip(ctdb_sock_addr *_addr)
91 {
92         int s;
93         int ret;
94         ctdb_sock_addr __addr = *_addr;
95         ctdb_sock_addr *addr = &__addr;
96         socklen_t addrlen = 0;
97
98         switch (addr->sa.sa_family) {
99         case AF_INET:
100                 addr->ip.sin_port = 0;
101                 addrlen = sizeof(struct sockaddr_in);
102                 break;
103         case AF_INET6:
104                 addr->ip6.sin6_port = 0;
105                 addrlen = sizeof(struct sockaddr_in6);
106                 break;
107         }
108
109         s = socket(addr->sa.sa_family, SOCK_STREAM, IPPROTO_TCP);
110         if (s == -1) {
111                 return false;
112         }
113
114         ret = bind(s, (struct sockaddr *)addr, addrlen);
115
116         close(s);
117         return ret == 0;
118 }
119
120 /*
121  * simple TCP checksum - assumes data is multiple of 2 bytes long
122  */
123 static uint16_t ip_checksum(uint8_t *data, size_t n, struct ip *ip)
124 {
125         uint32_t sum = uint16_checksum(data, n);
126         uint16_t sum2;
127
128         sum += uint16_checksum((uint8_t *)&ip->ip_src, sizeof(ip->ip_src));
129         sum += uint16_checksum((uint8_t *)&ip->ip_dst, sizeof(ip->ip_dst));
130         sum += ip->ip_p + n;
131         sum = (sum & 0xFFFF) + (sum >> 16);
132         sum = (sum & 0xFFFF) + (sum >> 16);
133         sum2 = htons(sum);
134         sum2 = ~sum2;
135         if (sum2 == 0) {
136                 return 0xFFFF;
137         }
138         return sum2;
139 }
140
/*
 * Checksum for an upper-layer payload carried over IPv6.
 *
 * The sum covers the IPv6 source and destination addresses, a
 * pseudo-header holding the 32-bit payload length n and the next
 * header value from ip6, and finally the n bytes at data.  The
 * folded, complemented result is returned after htons(), with 0
 * replaced by 0xFFFF.
 */
static uint16_t ip6_checksum(uint8_t *data, size_t n, struct ip6_hdr *ip6)
{
        uint16_t pseudo[3];
        uint32_t nlen = htonl(n);
        uint32_t acc;
        uint16_t result;

        /* Split the network-order length into two 16-bit words */
        pseudo[0] = nlen & UINT16_MAX;
        pseudo[1] = (nlen >> 16) & UINT16_MAX;
        /* ip6_nxt is only 8 bits, so fits comfortably into a uint16_t */
        pseudo[2] = htons(ip6->ip6_nxt);

        acc  = uint16_checksum((uint8_t *)&ip6->ip6_src, 16);
        acc += uint16_checksum((uint8_t *)&ip6->ip6_dst, 16);
        acc += uint16_checksum((uint8_t *)pseudo, sizeof(pseudo));
        acc += uint16_checksum(data, n);

        /* Fold carries back in until the sum fits in 16 bits */
        while ((acc >> 16) != 0) {
                acc = (acc & 0xFFFF) + (acc >> 16);
        }

        result = htons(acc);
        result = ~result;

        return (result != 0) ? result : 0xFFFF;
}
169
170 /*
171  * Send gratuitous ARP request/reply or IPv6 neighbor advertisement
172  */
173
174 #ifdef HAVE_PACKETSOCKET
175
176 /*
177  * Create IPv4 ARP requests/replies or IPv6 neighbour advertisement
178  * packets
179  */
180
/*
 * Sizes of the frames built below.  Each sum is fully parenthesised
 * so that the macro expands correctly inside a larger expression.
 */
#define ARP_STRUCT_SIZE (sizeof(struct ether_header) + \
                         sizeof(struct ether_arp))

#define IP6_NA_STRUCT_SIZE (sizeof(struct ether_header) + \
                            sizeof(struct ip6_hdr) + \
                            sizeof(struct nd_neighbor_advert) + \
                            sizeof(struct nd_opt_hdr) + \
                            sizeof(struct ether_addr))

/*
 * Buffers are never smaller than 64 bytes (presumably to cover the
 * minimum Ethernet frame size - confirm)
 */
#define ARP_BUFFER_SIZE MAX(ARP_STRUCT_SIZE, 64)

#define IP6_NA_BUFFER_SIZE MAX(IP6_NA_STRUCT_SIZE, 64)
193
194 static int arp_build(uint8_t *buffer,
195                      size_t buflen,
196                      const struct sockaddr_in *addr,
197                      const struct ether_addr *hwaddr,
198                      bool reply,
199                      struct ether_addr **ether_dhost,
200                      size_t *len)
201 {
202         size_t l = ARP_BUFFER_SIZE;
203         struct ether_header *eh;
204         struct ether_arp *ea;
205         struct arphdr *ah;
206
207         if (addr->sin_family != AF_INET) {
208                 return EINVAL;
209         }
210
211         if (buflen < l) {
212                 return EMSGSIZE;
213         }
214
215         memset(buffer, 0 , l);
216
217         eh = (struct ether_header *)buffer;
218         memset(eh->ether_dhost, 0xff, ETH_ALEN);
219         memcpy(eh->ether_shost, hwaddr, ETH_ALEN);
220         eh->ether_type = htons(ETHERTYPE_ARP);
221
222         ea = (struct ether_arp *)(buffer + sizeof(struct ether_header));
223         ah = &ea->ea_hdr;
224         ah->ar_hrd = htons(ARPHRD_ETHER);
225         ah->ar_pro = htons(ETH_P_IP);
226         ah->ar_hln = ETH_ALEN;
227         ah->ar_pln = sizeof(ea->arp_spa);
228
229         if (! reply) {
230                 ah->ar_op  = htons(ARPOP_REQUEST);
231                 memcpy(ea->arp_sha, hwaddr, ETH_ALEN);
232                 memcpy(ea->arp_spa, &addr->sin_addr, sizeof(ea->arp_spa));
233                 memset(ea->arp_tha, 0, ETH_ALEN);
234                 memcpy(ea->arp_tpa, &addr->sin_addr, sizeof(ea->arp_tpa));
235         } else {
236                 ah->ar_op  = htons(ARPOP_REPLY);
237                 memcpy(ea->arp_sha, hwaddr, ETH_ALEN);
238                 memcpy(ea->arp_spa, &addr->sin_addr, sizeof(ea->arp_spa));
239                 memcpy(ea->arp_tha, hwaddr, ETH_ALEN);
240                 memcpy(ea->arp_tpa, &addr->sin_addr, sizeof(ea->arp_tpa));
241         }
242
243         *ether_dhost = (struct ether_addr *)eh->ether_dhost;
244         *len = l;
245         return 0;
246 }
247
248 static int ip6_na_build(uint8_t *buffer,
249                         size_t buflen,
250                         const struct sockaddr_in6 *addr,
251                         const struct ether_addr *hwaddr,
252                         struct ether_addr **ether_dhost,
253                         size_t *len)
254 {
255         size_t l = IP6_NA_BUFFER_SIZE;
256         struct ether_header *eh;
257         struct ip6_hdr *ip6;
258         struct nd_neighbor_advert *nd_na;
259         struct nd_opt_hdr *nd_oh;
260         struct ether_addr *ea;
261         int ret;
262
263         if (addr->sin6_family != AF_INET6) {
264                 return EINVAL;
265         }
266
267         if (buflen < l) {
268                 return EMSGSIZE;
269         }
270
271         memset(buffer, 0 , l);
272
273         eh = (struct ether_header *)buffer;
274         /*
275          * Ethernet multicast: 33:33:00:00:00:01 (see RFC2464,
276          * section 7) - note memset 0 above!
277          */
278         eh->ether_dhost[0] = 0x33;
279         eh->ether_dhost[1] = 0x33;
280         eh->ether_dhost[5] = 0x01;
281         memcpy(eh->ether_shost, hwaddr, ETH_ALEN);
282         eh->ether_type = htons(ETHERTYPE_IP6);
283
284         ip6 = (struct ip6_hdr *)(buffer + sizeof(struct ether_header));
285         ip6->ip6_vfc  = 6 << 4;
286         ip6->ip6_plen = htons(sizeof(struct nd_neighbor_advert) +
287                               sizeof(struct nd_opt_hdr) +
288                               ETH_ALEN);
289         ip6->ip6_nxt  = IPPROTO_ICMPV6;
290         ip6->ip6_hlim = 255;
291         ip6->ip6_src  = addr->sin6_addr;
292         /* all-nodes multicast */
293
294         ret = inet_pton(AF_INET6, "ff02::1", &ip6->ip6_dst);
295         if (ret != 1) {
296                 return EIO;
297         }
298
299         nd_na = (struct nd_neighbor_advert *)(buffer +
300                                               sizeof(struct ether_header) +
301                                               sizeof(struct ip6_hdr));
302         nd_na->nd_na_type = ND_NEIGHBOR_ADVERT;
303         nd_na->nd_na_code = 0;
304         nd_na->nd_na_flags_reserved = ND_NA_FLAG_OVERRIDE;
305         nd_na->nd_na_target = addr->sin6_addr;
306
307         /* Option: Target link-layer address */
308         nd_oh = (struct nd_opt_hdr *)(buffer +
309                                       sizeof(struct ether_header) +
310                                       sizeof(struct ip6_hdr) +
311                                       sizeof(struct nd_neighbor_advert));
312         nd_oh->nd_opt_type = ND_OPT_TARGET_LINKADDR;
313         nd_oh->nd_opt_len = 1;  /* multiple of 8 octets */
314
315         ea = (struct ether_addr *)(buffer +
316                                    sizeof(struct ether_header) +
317                                    sizeof(struct ip6_hdr) +
318                                    sizeof(struct nd_neighbor_advert) +
319                                    sizeof(struct nd_opt_hdr));
320         memcpy(ea, hwaddr, ETH_ALEN);
321
322         nd_na->nd_na_cksum = ip6_checksum((uint8_t *)nd_na,
323                                           ntohs(ip6->ip6_plen),
324                                           ip6);
325
326         *ether_dhost = (struct ether_addr *)eh->ether_dhost;
327         *len = l;
328         return 0;
329 }
330
331 int ctdb_sys_send_arp(const ctdb_sock_addr *addr, const char *iface)
332 {
333         int s;
334         struct sockaddr_ll sall = {0};
335         struct ifreq if_hwaddr = {
336                 .ifr_ifru = {
337                         .ifru_flags = 0
338                 },
339         };
340         uint8_t buffer[MAX(ARP_BUFFER_SIZE, IP6_NA_BUFFER_SIZE)];
341         struct ifreq ifr = {
342                 .ifr_ifru = {
343                         .ifru_flags = 0
344                 },
345         };
346         struct ether_addr *hwaddr = NULL;
347         struct ether_addr *ether_dhost = NULL;
348         size_t len = 0;
349         int ret = 0;
350
351         s = socket(AF_PACKET, SOCK_RAW, 0);
352         if (s == -1) {
353                 ret = errno;
354                 DBG_ERR("Failed to open raw socket\n");
355                 return ret;
356         }
357         DBG_DEBUG("Created SOCKET FD:%d for sending arp\n", s);
358
359         /* Find interface */
360         strlcpy(ifr.ifr_name, iface, sizeof(ifr.ifr_name));
361         if (ioctl(s, SIOCGIFINDEX, &ifr) < 0) {
362                 ret = errno;
363                 DBG_ERR("Interface '%s' not found\n", iface);
364                 goto fail;
365         }
366
367         /* Get MAC address */
368         strlcpy(if_hwaddr.ifr_name, iface, sizeof(if_hwaddr.ifr_name));
369         ret = ioctl(s, SIOCGIFHWADDR, &if_hwaddr);
370         if ( ret < 0 ) {
371                 ret = errno;
372                 DBG_ERR("ioctl failed\n");
373                 goto fail;
374         }
375         if (ARPHRD_LOOPBACK == if_hwaddr.ifr_hwaddr.sa_family) {
376                 ret = 0;
377                 D_DEBUG("Ignoring loopback arp request\n");
378                 goto fail;
379         }
380         if (if_hwaddr.ifr_hwaddr.sa_family != ARPHRD_ETHER) {
381                 ret = EINVAL;
382                 DBG_ERR("Not an ethernet address family (0x%x)\n",
383                         if_hwaddr.ifr_hwaddr.sa_family);
384                 goto fail;;
385         }
386
387         /* Set up most of destination address structure */
388         sall.sll_family = AF_PACKET;
389         sall.sll_halen = sizeof(struct ether_addr);
390         sall.sll_protocol = htons(ETH_P_ALL);
391         sall.sll_ifindex = ifr.ifr_ifindex;
392
393         /* For clarity */
394         hwaddr = (struct ether_addr *)if_hwaddr.ifr_hwaddr.sa_data;
395
396         switch (addr->ip.sin_family) {
397         case AF_INET:
398                 /* Send gratuitous ARP */
399                 ret = arp_build(buffer,
400                                 sizeof(buffer),
401                                 &addr->ip,
402                                 hwaddr,
403                                 false,
404                                 &ether_dhost,
405                                 &len);
406                 if (ret != 0) {
407                         DBG_ERR("Failed to build ARP request\n");
408                         goto fail;
409                 }
410
411                 memcpy(&sall.sll_addr[0], ether_dhost, sall.sll_halen);
412
413                 ret = sendto(s,
414                              buffer,
415                              len,
416                              0,
417                              (struct sockaddr *)&sall,
418                              sizeof(sall));
419                 if (ret < 0 ) {
420                         ret = errno;
421                         DBG_ERR("Failed sendto\n");
422                         goto fail;
423                 }
424
425                 /* Send unsolicited ARP reply */
426                 ret = arp_build(buffer,
427                                 sizeof(buffer),
428                                 &addr->ip,
429                                 hwaddr,
430                                 true,
431                                 &ether_dhost,
432                                 &len);
433                 if (ret != 0) {
434                         DBG_ERR("Failed to build ARP reply\n");
435                         goto fail;
436                 }
437
438                 memcpy(&sall.sll_addr[0], ether_dhost, sall.sll_halen);
439
440                 ret = sendto(s,
441                              buffer,
442                              len,
443                              0,
444                              (struct sockaddr *)&sall,
445                              sizeof(sall));
446                 if (ret < 0 ) {
447                         ret = errno;
448                         DBG_ERR("Failed sendto\n");
449                         goto fail;
450                 }
451
452                 close(s);
453                 break;
454
455         case AF_INET6:
456                 ret = ip6_na_build(buffer,
457                                    sizeof(buffer),
458                                    &addr->ip6,
459                                    hwaddr,
460                                    &ether_dhost,
461                                    &len);
462                 if (ret != 0) {
463                         DBG_ERR("Failed to build IPv6 neighbor advertisement\n");
464                         goto fail;
465                 }
466
467                 memcpy(&sall.sll_addr[0], ether_dhost, sall.sll_halen);
468
469                 ret = sendto(s,
470                              buffer,
471                              len,
472                              0,
473                              (struct sockaddr *)&sall,
474                              sizeof(sall));
475                 if (ret < 0 ) {
476                         ret = errno;
477                         DBG_ERR("Failed sendto\n");
478                         goto fail;
479                 }
480
481                 close(s);
482                 break;
483
484         default:
485                 ret = EINVAL;
486                 DBG_ERR("Not an ipv4/ipv6 address (family is %u)\n",
487                         addr->ip.sin_family);
488                 goto fail;
489         }
490
491         return 0;
492
493 fail:
494         close(s);
495         return ret;
496 }
497
498 #else /* HAVE_PACKETSOCKET */
499
500 int ctdb_sys_send_arp(const ctdb_sock_addr *addr, const char *iface)
501 {
502         /* Not implemented */
503         return ENOSYS;
504 }
505
506 #endif /* HAVE_PACKETSOCKET */
507
508
/*
 * Sizes of a bare IPv4/IPv6 header followed by a TCP header with no
 * options.  Each sum is fully parenthesised so that the macro
 * expands correctly inside a larger expression.
 */
#define IP4_TCP_BUFFER_SIZE (sizeof(struct ip) + \
                             sizeof(struct tcphdr))

#define IP6_TCP_BUFFER_SIZE (sizeof(struct ip6_hdr) + \
                             sizeof(struct tcphdr))
514
515 static int tcp4_build(uint8_t *buf,
516                       size_t buflen,
517                       const struct sockaddr_in *src,
518                       const struct sockaddr_in *dst,
519                       uint32_t seq,
520                       uint32_t ack,
521                       int rst,
522                       size_t *len)
523 {
524         size_t l = IP4_TCP_BUFFER_SIZE;
525         struct {
526                 struct ip ip;
527                 struct tcphdr tcp;
528         } *ip4pkt;
529
530         if (l != sizeof(*ip4pkt)) {
531                 return EMSGSIZE;
532         }
533
534         if (buflen < l) {
535                 return EMSGSIZE;
536         }
537
538         ip4pkt = (void *)buf;
539         memset(ip4pkt, 0, l);
540
541         ip4pkt->ip.ip_v     = 4;
542         ip4pkt->ip.ip_hl    = sizeof(ip4pkt->ip)/sizeof(uint32_t);
543         ip4pkt->ip.ip_len   = htons(sizeof(ip4pkt));
544         ip4pkt->ip.ip_ttl   = 255;
545         ip4pkt->ip.ip_p     = IPPROTO_TCP;
546         ip4pkt->ip.ip_src.s_addr = src->sin_addr.s_addr;
547         ip4pkt->ip.ip_dst.s_addr = dst->sin_addr.s_addr;
548         ip4pkt->ip.ip_sum   = 0;
549
550         ip4pkt->tcp.th_sport = src->sin_port;
551         ip4pkt->tcp.th_dport = dst->sin_port;
552         ip4pkt->tcp.th_seq   = seq;
553         ip4pkt->tcp.th_ack   = ack;
554         ip4pkt->tcp.th_flags = 0;
555         ip4pkt->tcp.th_flags |= TH_ACK;
556         if (rst) {
557                 ip4pkt->tcp.th_flags |= TH_RST;
558         }
559         ip4pkt->tcp.th_off   = sizeof(ip4pkt->tcp)/sizeof(uint32_t);
560         /* this makes it easier to spot in a sniffer */
561         ip4pkt->tcp.th_win   = htons(1234);
562         ip4pkt->tcp.th_sum   = ip_checksum((uint8_t *)&ip4pkt->tcp,
563                                            sizeof(ip4pkt->tcp),
564                                            &ip4pkt->ip);
565
566         *len = l;
567         return 0;
568 }
569
570 static int tcp6_build(uint8_t *buf,
571                       size_t buflen,
572                       const struct sockaddr_in6 *src,
573                       const struct sockaddr_in6 *dst,
574                       uint32_t seq,
575                       uint32_t ack,
576                       int rst,
577                       size_t *len)
578 {
579         size_t l = IP6_TCP_BUFFER_SIZE;
580         struct {
581                 struct ip6_hdr ip6;
582                 struct tcphdr tcp;
583         } *ip6pkt;
584
585         if (l != sizeof(*ip6pkt)) {
586                 return EMSGSIZE;
587         }
588
589         if (buflen < l) {
590                 return EMSGSIZE;
591         }
592
593         ip6pkt = (void *)buf;
594         memset(ip6pkt, 0, l);
595
596         ip6pkt->ip6.ip6_vfc  = 6 << 4;
597         ip6pkt->ip6.ip6_plen = htons(sizeof(struct tcphdr));
598         ip6pkt->ip6.ip6_nxt  = IPPROTO_TCP;
599         ip6pkt->ip6.ip6_hlim = 64;
600         ip6pkt->ip6.ip6_src  = src->sin6_addr;
601         ip6pkt->ip6.ip6_dst  = dst->sin6_addr;
602
603         ip6pkt->tcp.th_sport = src->sin6_port;
604         ip6pkt->tcp.th_dport = dst->sin6_port;
605         ip6pkt->tcp.th_seq   = seq;
606         ip6pkt->tcp.th_ack   = ack;
607         ip6pkt->tcp.th_flags = 0;
608         ip6pkt->tcp.th_flags |= TH_ACK;
609         if (rst) {
610                 ip6pkt->tcp.th_flags |= TH_RST;
611         }
612         ip6pkt->tcp.th_off    = sizeof(ip6pkt->tcp)/sizeof(uint32_t);
613         /* this makes it easier to spot in a sniffer */
614         ip6pkt->tcp.th_win   = htons(1234);
615         ip6pkt->tcp.th_sum   = ip6_checksum((uint8_t *)&ip6pkt->tcp,
616                                             sizeof(ip6pkt->tcp),
617                                             &ip6pkt->ip6);
618
619         *len = l;
620         return 0;
621 }
622
623 /*
624  * Send tcp segment from the specified IP/port to the specified
625  * destination IP/port.
626  *
627  * This is used to trigger the receiving host into sending its own ACK,
628  * which should trigger early detection of TCP reset by the client
629  * after IP takeover
630  *
631  * This can also be used to send RST segments (if rst is true) and also
632  * if correct seq and ack numbers are provided.
633  */
634 int ctdb_sys_send_tcp(const ctdb_sock_addr *dest,
635                       const ctdb_sock_addr *src,
636                       uint32_t seq,
637                       uint32_t ack,
638                       int rst)
639 {
640         uint8_t buf[MAX(IP4_TCP_BUFFER_SIZE, IP6_TCP_BUFFER_SIZE)];
641         size_t len = 0;
642         int ret;
643         int s;
644         uint32_t one = 1;
645         struct sockaddr_in6 tmpdest = { 0 };
646         int saved_errno;
647
648         switch (src->ip.sin_family) {
649         case AF_INET:
650                 ret = tcp4_build(buf,
651                                  sizeof(buf),
652                                  &src->ip,
653                                  &dest->ip,
654                                  seq,
655                                  ack,
656                                  rst,
657                                  &len);
658                 if (ret != 0) {
659                         DBG_ERR("Failed to build TCP packet (%d)\n", ret);
660                         return ret;
661                 }
662
663                 /* open a raw socket to send this segment from */
664                 s = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
665                 if (s == -1) {
666                         DBG_ERR("Failed to open raw socket (%s)\n",
667                                 strerror(errno));
668                         return -1;
669                 }
670
671                 ret = setsockopt(s, IPPROTO_IP, IP_HDRINCL, &one, sizeof(one));
672                 if (ret != 0) {
673                         DBG_ERR("Failed to setup IP headers (%s)\n",
674                                 strerror(errno));
675                         close(s);
676                         return -1;
677                 }
678
679                 ret = sendto(s,
680                              buf,
681                              len,
682                              0,
683                              (const struct sockaddr *)&dest->ip,
684                              sizeof(dest->ip));
685                 saved_errno = errno;
686                 close(s);
687                 if (ret == -1) {
688                         D_ERR("Failed sendto (%s)\n", strerror(saved_errno));
689                         return -1;
690                 }
691                 if ((size_t)ret != len) {
692                         DBG_ERR("Failed sendto - didn't send full packet\n");
693                         return -1;
694                 }
695                 break;
696
697         case AF_INET6:
698                 ret = tcp6_build(buf,
699                                  sizeof(buf),
700                                  &src->ip6,
701                                  &dest->ip6,
702                                  seq,
703                                  ack,
704                                  rst,
705                                  &len);
706                 if (ret != 0) {
707                         DBG_ERR("Failed to build TCP packet (%d)\n", ret);
708                         return ret;
709                 }
710
711                 s = socket(AF_INET6, SOCK_RAW, IPPROTO_RAW);
712                 if (s == -1) {
713                         DBG_ERR("Failed to open sending socket\n");
714                         return -1;
715
716                 }
717                 /*
718                  * sendto() on an IPv6 raw socket requires the port to
719                  * be either 0 or a protocol value
720                  */
721                 tmpdest = dest->ip6;
722                 tmpdest.sin6_port = 0;
723
724                 ret = sendto(s,
725                              buf,
726                              len,
727                              0,
728                              (const struct sockaddr *)&tmpdest,
729                              sizeof(tmpdest));
730                 saved_errno = errno;
731                 close(s);
732                 if (ret == -1) {
733                         D_ERR("Failed sendto (%s)\n", strerror(saved_errno));
734                         return -1;
735                 }
736                 if ((size_t)ret != len) {
737                         DBG_ERR("Failed sendto - didn't send full packet\n");
738                         return -1;
739                 }
740                 break;
741
742         default:
743                 DBG_ERR("Not an ipv4/v6 address\n");
744                 return -1;
745         }
746
747         return 0;
748 }
749
750 static int tcp4_extract(const uint8_t *ip_pkt,
751                         size_t pktlen,
752                         struct sockaddr_in *src,
753                         struct sockaddr_in *dst,
754                         uint32_t *ack_seq,
755                         uint32_t *seq,
756                         int *rst,
757                         uint16_t *window)
758 {
759         const struct ip *ip;
760         const struct tcphdr *tcp;
761
762         if (pktlen < sizeof(struct ip)) {
763                 return EMSGSIZE;
764         }
765
766         ip = (const struct ip *)ip_pkt;
767
768         /* IPv4 only */
769         if (ip->ip_v != 4) {
770                 return ENOMSG;
771         }
772         /* Don't look at fragments */
773         if ((ntohs(ip->ip_off)&0x1fff) != 0) {
774                 return ENOMSG;
775         }
776         /* TCP only */
777         if (ip->ip_p != IPPROTO_TCP) {
778                 return ENOMSG;
779         }
780
781         /* Ensure there is enough of the packet to gather required fields */
782         if (pktlen <
783             (ip->ip_hl * sizeof(uint32_t)) + offsetof(struct tcphdr, th_sum)) {
784                 return EMSGSIZE;
785         }
786
787         tcp = (const struct tcphdr *)(ip_pkt + (ip->ip_hl * sizeof(uint32_t)));
788
789         src->sin_family      = AF_INET;
790         src->sin_addr.s_addr = ip->ip_src.s_addr;
791         src->sin_port        = tcp->th_sport;
792
793         dst->sin_family      = AF_INET;
794         dst->sin_addr.s_addr = ip->ip_dst.s_addr;
795         dst->sin_port        = tcp->th_dport;
796
797         *ack_seq             = tcp->th_ack;
798         *seq                 = tcp->th_seq;
799         if (window != NULL) {
800                 *window = tcp->th_win;
801         }
802         if (rst != NULL) {
803                 *rst = tcp->th_flags & TH_RST;
804         }
805
806         return 0;
807 }
808
809 static int tcp6_extract(const uint8_t *ip_pkt,
810                         size_t pktlen,
811                         struct sockaddr_in6 *src,
812                         struct sockaddr_in6 *dst,
813                         uint32_t *ack_seq,
814                         uint32_t *seq,
815                         int *rst,
816                         uint16_t *window)
817 {
818         const struct ip6_hdr *ip6;
819         const struct tcphdr *tcp;
820
821         /* Ensure there is enough of the packet to gather required fields */
822         if (pktlen < sizeof(struct ip6_hdr) + offsetof(struct tcphdr, th_sum)) {
823                 return EMSGSIZE;
824         }
825
826         ip6 = (const struct ip6_hdr *)ip_pkt;
827
828         /* IPv6 only */
829         if ((ip6->ip6_vfc >> 4) != 6){
830                 return ENOMSG;
831         }
832
833         /* TCP only */
834         if (ip6->ip6_nxt != IPPROTO_TCP) {
835                 return ENOMSG;
836         }
837
838         tcp = (const struct tcphdr *)(ip_pkt + sizeof(struct ip6_hdr));
839
840         src->sin6_family = AF_INET6;
841         src->sin6_port   = tcp->th_sport;
842         src->sin6_addr   = ip6->ip6_src;
843
844         dst->sin6_family = AF_INET6;
845         dst->sin6_port   = tcp->th_dport;
846         dst->sin6_addr   = ip6->ip6_dst;
847
848         *ack_seq             = tcp->th_ack;
849         *seq                 = tcp->th_seq;
850         if (window != NULL) {
851                 *window = tcp->th_win;
852         }
853         if (rst != NULL) {
854                 *rst = tcp->th_flags & TH_RST;
855         }
856
857         return 0;
858 }
859
860 /*
861  * Packet capture
862  *
863  * If AF_PACKET is available then use a raw socket otherwise use pcap.
864  * wscript has checked to make sure that pcap is available if needed.
865  */
866
867 #if defined(HAVE_AF_PACKET) && !defined(ENABLE_PCAP)
868
869 /*
870  * This function is used to open a raw socket to capture from
871  */
872 int ctdb_sys_open_capture_socket(const char *iface, void **private_data)
873 {
874         int s, ret;
875
876         /* Open a socket to capture all traffic */
877         s = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
878         if (s == -1) {
879                 DBG_ERR("Failed to open raw socket\n");
880                 return -1;
881         }
882
883         DBG_DEBUG("Created RAW SOCKET FD:%d for tcp tickle\n", s);
884
885         ret = set_blocking(s, false);
886         if (ret != 0) {
887                 DBG_ERR("Failed to set socket non-blocking (%s)\n",
888                         strerror(errno));
889                 close(s);
890                 return -1;
891         }
892
893         set_close_on_exec(s);
894
895         return s;
896 }
897
/*
 * Hook for any additional cleanup required when closing a capture
 * socket.  The raw socket variant has nothing to clean up, so
 * private_data is ignored and 0 is always returned.
 * Note that the socket itself is closed automatically in the caller.
 */
int ctdb_sys_close_capture_socket(void *private_data)
{
        (void)private_data;

        return 0;
}
907
908
909 /*
910  * called when the raw socket becomes readable
911  */
912 int ctdb_sys_read_tcp_packet(int s, void *private_data,
913                              ctdb_sock_addr *src,
914                              ctdb_sock_addr *dst,
915                              uint32_t *ack_seq,
916                              uint32_t *seq,
917                              int *rst,
918                              uint16_t *window)
919 {
920         ssize_t nread;
921         uint8_t pkt[100]; /* Large enough for simple ACK/RST packets */
922         struct ether_header *eth;
923         int ret;
924
925         nread = recv(s, pkt, sizeof(pkt), MSG_TRUNC);
926         if (nread == -1) {
927                 return errno;
928         }
929         if ((size_t)nread < sizeof(*eth)) {
930                 return EMSGSIZE;
931         }
932
933         ZERO_STRUCTP(src);
934         ZERO_STRUCTP(dst);
935
936         /* Ethernet */
937         eth = (struct ether_header *)pkt;
938
939         /* we want either IPv4 or IPv6 */
940         if (ntohs(eth->ether_type) == ETHERTYPE_IP) {
941                 ret = tcp4_extract(pkt + sizeof(struct ether_header),
942                                    (size_t)nread - sizeof(struct ether_header),
943                                    &src->ip,
944                                    &dst->ip,
945                                    ack_seq,
946                                    seq,
947                                    rst,
948                                    window);
949                 return ret;
950
951         } else if (ntohs(eth->ether_type) == ETHERTYPE_IP6) {
952                 ret = tcp6_extract(pkt + sizeof(struct ether_header),
953                                    (size_t)nread - sizeof(struct ether_header),
954                                    &src->ip6,
955                                    &dst->ip6,
956                                    ack_seq,
957                                    seq,
958                                    rst,
959                                    window);
960                 return ret;
961         }
962
963         return ENOMSG;
964 }
965
966 #else /* defined(HAVE_AF_PACKET) && !defined(ENABLE_PCAP) */
967
968 #include <pcap.h>
969
970 int ctdb_sys_open_capture_socket(const char *iface, void **private_data)
971 {
972         char errbuf[PCAP_ERRBUF_SIZE];
973         pcap_t *pt;
974
975         pt = pcap_open_live(iface, 100, 0, 0, errbuf);
976         if (pt == NULL) {
977                 DBG_ERR("Failed to open pcap capture device %s (%s)\n",
978                         iface,
979                         errbuf);
980                 return -1;
981         }
982         *((pcap_t **)private_data) = pt;
983
984         return pcap_fileno(pt);
985 }
986
987 int ctdb_sys_close_capture_socket(void *private_data)
988 {
989         pcap_t *pt = (pcap_t *)private_data;
990         pcap_close(pt);
991         return 0;
992 }
993
994 int ctdb_sys_read_tcp_packet(int s,
995                              void *private_data,
996                              ctdb_sock_addr *src,
997                              ctdb_sock_addr *dst,
998                              uint32_t *ack_seq,
999                              uint32_t *seq,
1000                              int *rst,
1001                              uint16_t *window)
1002 {
1003         int ret;
1004         const struct ether_header *eth;
1005         struct pcap_pkthdr pkthdr;
1006         const u_char *buffer;
1007         pcap_t *pt = (pcap_t *)private_data;
1008
1009         buffer=pcap_next(pt, &pkthdr);
1010         if (buffer==NULL) {
1011                 return ENOMSG;
1012         }
1013
1014         ZERO_STRUCTP(src);
1015         ZERO_STRUCTP(dst);
1016
1017         /* Ethernet */
1018         eth = (const struct ether_header *)buffer;
1019
1020         /* we want either IPv4 or IPv6 */
1021         if (eth->ether_type == htons(ETHERTYPE_IP)) {
1022                 ret = tcp4_extract(buffer + sizeof(struct ether_header),
1023                                    (size_t)(pkthdr.caplen -
1024                                             sizeof(struct ether_header)),
1025                                    &src->ip,
1026                                    &dst->ip,
1027                                    ack_seq,
1028                                    seq,
1029                                    rst,
1030                                    window);
1031                 return ret;
1032
1033         } else if (eth->ether_type == htons(ETHERTYPE_IP6)) {
1034                 ret = tcp6_extract(buffer + sizeof(struct ether_header),
1035                                    (size_t)(pkthdr.caplen -
1036                                             sizeof(struct ether_header)),
1037                                    &src->ip6,
1038                                    &dst->ip6,
1039                                    ack_seq,
1040                                    seq,
1041                                    rst,
1042                                    window);
1043                 return ret;
1044         }
1045
1046         return ENOMSG;
1047 }
1048
1049 #endif /* defined(HAVE_AF_PACKET) && !defined(ENABLE_PCAP) */