/*
   ctdb system specific code to manage raw sockets on linux

   Copyright (C) Ronnie Sahlberg  2007
   Copyright (C) Andrew Tridgell  2007

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, see <http://www.gnu.org/licenses/>.
*/
22 #include "system/network.h"
23 #include "system/filesys.h"
24 #include "system/wait.h"
25 #include "../include/ctdb_private.h"
26 #include <netinet/if_ether.h>
27 #include <netinet/ip6.h>
28 #include <netinet/icmp6.h>
29 #include <net/if_arp.h>
30 #include <netpacket/packet.h>
31 #include <sys/prctl.h>
34 #define ETHERTYPE_IP6 0x86dd
/*
  calculate the tcp checksum for tcp over ipv6

  data/n describe the upper-layer payload (n is its length in bytes) and
  ip6 supplies the source address, destination address and next-header
  value that are folded into the sum together with the length.

  The result is never 0 (0xFFFF is returned instead) and is stored by the
  callers in this file directly into the packet's checksum field.

  NOTE(review): uint16_checksum() is defined elsewhere; presumably it
  returns the sum of the 16-bit words in the buffer - confirm.
  NOTE(review): the local declarations, the length word of the pseudo
  header and the final complement/return were not visible in the reviewed
  listing and are reconstructed.
*/
static uint16_t tcp_checksum6(uint16_t *data, size_t n, struct ip6_hdr *ip6)
{
	uint32_t pseudo[2];
	uint32_t acc;
	uint16_t folded;

	/* upper-layer length and next-header, both as 32-bit big-endian words */
	pseudo[0] = htonl(n);
	pseudo[1] = htonl(ip6->ip6_nxt);

	acc  = uint16_checksum((uint16_t *)(void *)&ip6->ip6_src, 16);
	acc += uint16_checksum((uint16_t *)(void *)&ip6->ip6_dst, 16);
	acc += uint16_checksum((uint16_t *)pseudo, 8);
	acc += uint16_checksum(data, n);

	/* fold the carries back in; two rounds are enough for a 32-bit sum */
	acc = (acc & 0xFFFF) + (acc >> 16);
	acc = (acc & 0xFFFF) + (acc >> 16);

	folded = (uint16_t)~htons(acc);
	return (folded == 0) ? 0xFFFF : folded;
}
66 send gratuitous arp reply after we have taken over an ip address
68 saddr is the address we are trying to claim
69 iface is the interface name we will be using to claim the address
71 int ctdb_sys_send_arp(const ctdb_sock_addr *addr, const char *iface)
74 struct sockaddr_ll sall;
75 struct ether_header *eh;
78 struct nd_neighbor_solicit *nd_ns;
79 struct ifreq if_hwaddr;
80 unsigned char buffer[78]; /* ipv6 neigh solicitation size */
82 char bdcast[] = {0xff,0xff,0xff,0xff,0xff,0xff};
87 switch (addr->ip.sin_family) {
89 s = socket(PF_PACKET, SOCK_RAW, htons(ETHERTYPE_ARP));
91 DEBUG(DEBUG_CRIT,(__location__ " failed to open raw socket\n"));
95 DEBUG(DEBUG_DEBUG, (__location__ " Created SOCKET FD:%d for sending arp\n", s));
96 strncpy(ifr.ifr_name, iface, sizeof(ifr.ifr_name));
97 if (ioctl(s, SIOCGIFINDEX, &ifr) < 0) {
98 DEBUG(DEBUG_CRIT,(__location__ " interface '%s' not found\n", iface));
103 /* get the mac address */
104 strcpy(if_hwaddr.ifr_name, iface);
105 ret = ioctl(s, SIOCGIFHWADDR, &if_hwaddr);
108 DEBUG(DEBUG_CRIT,(__location__ " ioctl failed\n"));
111 if (ARPHRD_LOOPBACK == if_hwaddr.ifr_hwaddr.sa_family) {
112 DEBUG(DEBUG_DEBUG,("Ignoring loopback arp request\n"));
116 if (if_hwaddr.ifr_hwaddr.sa_family != AF_LOCAL) {
119 DEBUG(DEBUG_CRIT,(__location__ " not an ethernet address family (0x%x)\n",
120 if_hwaddr.ifr_hwaddr.sa_family));
125 memset(buffer, 0 , 64);
126 eh = (struct ether_header *)buffer;
127 memset(eh->ether_dhost, 0xff, ETH_ALEN);
128 memcpy(eh->ether_shost, if_hwaddr.ifr_hwaddr.sa_data, ETH_ALEN);
129 eh->ether_type = htons(ETHERTYPE_ARP);
131 ah = (struct arphdr *)&buffer[sizeof(struct ether_header)];
132 ah->ar_hrd = htons(ARPHRD_ETHER);
133 ah->ar_pro = htons(ETH_P_IP);
134 ah->ar_hln = ETH_ALEN;
137 /* send a gratious arp */
138 ah->ar_op = htons(ARPOP_REQUEST);
139 ptr = (char *)&ah[1];
140 memcpy(ptr, if_hwaddr.ifr_hwaddr.sa_data, ETH_ALEN);
142 memcpy(ptr, &addr->ip.sin_addr, 4);
144 memset(ptr, 0, ETH_ALEN);
146 memcpy(ptr, &addr->ip.sin_addr, 4);
149 sall.sll_family = AF_PACKET;
151 memcpy(&sall.sll_addr[0], bdcast, sall.sll_halen);
152 sall.sll_protocol = htons(ETH_P_ALL);
153 sall.sll_ifindex = ifr.ifr_ifindex;
154 ret = sendto(s, buffer, 64, 0, (struct sockaddr *)&sall, sizeof(sall));
157 DEBUG(DEBUG_CRIT,(__location__ " failed sendto\n"));
161 /* send unsolicited arp reply broadcast */
162 ah->ar_op = htons(ARPOP_REPLY);
163 ptr = (char *)&ah[1];
164 memcpy(ptr, if_hwaddr.ifr_hwaddr.sa_data, ETH_ALEN);
166 memcpy(ptr, &addr->ip.sin_addr, 4);
168 memcpy(ptr, if_hwaddr.ifr_hwaddr.sa_data, ETH_ALEN);
170 memcpy(ptr, &addr->ip.sin_addr, 4);
173 ret = sendto(s, buffer, 64, 0, (struct sockaddr *)&sall, sizeof(sall));
175 DEBUG(DEBUG_CRIT,(__location__ " failed sendto\n"));
183 s = socket(PF_PACKET, SOCK_RAW, htons(ETHERTYPE_ARP));
185 DEBUG(DEBUG_CRIT,(__location__ " failed to open raw socket\n"));
189 DEBUG(DEBUG_DEBUG, (__location__ " Created SOCKET FD:%d for sending arp\n", s));
190 strncpy(ifr.ifr_name, iface, sizeof(ifr.ifr_name));
191 if (ioctl(s, SIOCGIFINDEX, &ifr) < 0) {
192 DEBUG(DEBUG_CRIT,(__location__ " interface '%s' not found\n", iface));
197 /* get the mac address */
198 strcpy(if_hwaddr.ifr_name, iface);
199 ret = ioctl(s, SIOCGIFHWADDR, &if_hwaddr);
202 DEBUG(DEBUG_CRIT,(__location__ " ioctl failed\n"));
205 if (ARPHRD_LOOPBACK == if_hwaddr.ifr_hwaddr.sa_family) {
206 DEBUG(DEBUG_DEBUG,("Ignoring loopback arp request\n"));
210 if (if_hwaddr.ifr_hwaddr.sa_family != AF_LOCAL) {
213 DEBUG(DEBUG_CRIT,(__location__ " not an ethernet address family (0x%x)\n",
214 if_hwaddr.ifr_hwaddr.sa_family));
218 memset(buffer, 0 , sizeof(buffer));
219 eh = (struct ether_header *)buffer;
220 memset(eh->ether_dhost, 0xff, ETH_ALEN);
221 memcpy(eh->ether_shost, if_hwaddr.ifr_hwaddr.sa_data, ETH_ALEN);
222 eh->ether_type = htons(ETHERTYPE_IP6);
224 ip6 = (struct ip6_hdr *)(eh+1);
226 ip6->ip6_plen = htons(sizeof(*nd_ns));
227 ip6->ip6_nxt = IPPROTO_ICMPV6;
229 ip6->ip6_dst = addr->ip6.sin6_addr;
231 nd_ns = (struct nd_neighbor_solicit *)(ip6+1);
232 nd_ns->nd_ns_type = ND_NEIGHBOR_SOLICIT;
233 nd_ns->nd_ns_code = 0;
234 nd_ns->nd_ns_reserved = 0;
235 nd_ns->nd_ns_target = addr->ip6.sin6_addr;
237 nd_ns->nd_ns_cksum = tcp_checksum6((uint16_t *)nd_ns, ntohs(ip6->ip6_plen), ip6);
239 sall.sll_family = AF_PACKET;
241 memcpy(&sall.sll_addr[0], bdcast, sall.sll_halen);
242 sall.sll_protocol = htons(ETH_P_ALL);
243 sall.sll_ifindex = ifr.ifr_ifindex;
244 ret = sendto(s, buffer, 78, 0, (struct sockaddr *)&sall, sizeof(sall));
247 DEBUG(DEBUG_CRIT,(__location__ " failed sendto\n"));
254 DEBUG(DEBUG_CRIT,(__location__ " not an ipv4/ipv6 address (family is %u)\n", addr->ip.sin_family));
/*
  simple TCP checksum - assumes data is multiple of 2 bytes long

  data/n describe the tcp segment (n is its length in bytes); ip supplies
  the source address, destination address and protocol that are folded
  into the sum together with the length.  The result is never 0 (0xFFFF
  is returned instead).

  NOTE(review): uint16_checksum() is defined elsewhere; presumably it
  returns the sum of the 16-bit words in the buffer - confirm.
  NOTE(review): the sizeof() arguments and the final complement/return
  were not visible in the reviewed listing and are reconstructed.
*/
static uint16_t tcp_checksum(uint16_t *data, size_t n, struct iphdr *ip)
{
	uint32_t acc;
	uint16_t folded;

	acc  = uint16_checksum(data, n);
	acc += uint16_checksum((uint16_t *)(void *)&ip->saddr,
			       sizeof(ip->saddr));
	acc += uint16_checksum((uint16_t *)(void *)&ip->daddr,
			       sizeof(ip->daddr));
	acc += ip->protocol + n;

	/* fold the carries back in; two rounds are enough for a 32-bit sum */
	acc = (acc & 0xFFFF) + (acc >> 16);
	acc = (acc & 0xFFFF) + (acc >> 16);

	folded = (uint16_t)~htons(acc);
	return (folded == 0) ? 0xFFFF : folded;
}
285 Send tcp segment from the specified IP/port to the specified
288 This is used to trigger the receiving host into sending its own ACK,
289 which should trigger early detection of TCP reset by the client
292 This can also be used to send RST segments (if rst is true) and also
293 if correct seq and ack numbers are provided.
295 int ctdb_sys_send_tcp(const ctdb_sock_addr *dest,
296 const ctdb_sock_addr *src,
297 uint32_t seq, uint32_t ack, int rst)
303 ctdb_sock_addr *tmpdest;
313 switch (src->ip.sin_family) {
316 ip4pkt.ip.version = 4;
317 ip4pkt.ip.ihl = sizeof(ip4pkt.ip)/4;
318 ip4pkt.ip.tot_len = htons(sizeof(ip4pkt));
320 ip4pkt.ip.protocol = IPPROTO_TCP;
321 ip4pkt.ip.saddr = src->ip.sin_addr.s_addr;
322 ip4pkt.ip.daddr = dest->ip.sin_addr.s_addr;
325 ip4pkt.tcp.source = src->ip.sin_port;
326 ip4pkt.tcp.dest = dest->ip.sin_port;
327 ip4pkt.tcp.seq = seq;
328 ip4pkt.tcp.ack_seq = ack;
333 ip4pkt.tcp.doff = sizeof(ip4pkt.tcp)/4;
334 /* this makes it easier to spot in a sniffer */
335 ip4pkt.tcp.window = htons(1234);
336 ip4pkt.tcp.check = tcp_checksum((uint16_t *)&ip4pkt.tcp, sizeof(ip4pkt.tcp), &ip4pkt.ip);
338 /* open a raw socket to send this segment from */
339 s = socket(AF_INET, SOCK_RAW, htons(IPPROTO_RAW));
341 DEBUG(DEBUG_CRIT,(__location__ " failed to open raw socket (%s)\n",
346 ret = setsockopt(s, SOL_IP, IP_HDRINCL, &one, sizeof(one));
348 DEBUG(DEBUG_CRIT,(__location__ " failed to setup IP headers (%s)\n",
355 set_close_on_exec(s);
357 ret = sendto(s, &ip4pkt, sizeof(ip4pkt), 0,
358 (const struct sockaddr *)&dest->ip,
361 if (ret != sizeof(ip4pkt)) {
362 DEBUG(DEBUG_CRIT,(__location__ " failed sendto (%s)\n", strerror(errno)));
368 ip6pkt.ip6.ip6_vfc = 0x60;
369 ip6pkt.ip6.ip6_plen = htons(20);
370 ip6pkt.ip6.ip6_nxt = IPPROTO_TCP;
371 ip6pkt.ip6.ip6_hlim = 64;
372 ip6pkt.ip6.ip6_src = src->ip6.sin6_addr;
373 ip6pkt.ip6.ip6_dst = dest->ip6.sin6_addr;
375 ip6pkt.tcp.source = src->ip6.sin6_port;
376 ip6pkt.tcp.dest = dest->ip6.sin6_port;
377 ip6pkt.tcp.seq = seq;
378 ip6pkt.tcp.ack_seq = ack;
383 ip6pkt.tcp.doff = sizeof(ip6pkt.tcp)/4;
384 /* this makes it easier to spot in a sniffer */
385 ip6pkt.tcp.window = htons(1234);
386 ip6pkt.tcp.check = tcp_checksum6((uint16_t *)&ip6pkt.tcp, sizeof(ip6pkt.tcp), &ip6pkt.ip6);
388 s = socket(PF_INET6, SOCK_RAW, IPPROTO_RAW);
390 DEBUG(DEBUG_CRIT, (__location__ " Failed to open sending socket\n"));
394 /* sendto() dont like if the port is set and the socket is
397 tmpdest = discard_const(dest);
398 tmpport = tmpdest->ip6.sin6_port;
400 tmpdest->ip6.sin6_port = 0;
401 ret = sendto(s, &ip6pkt, sizeof(ip6pkt), 0,
402 (const struct sockaddr *)&dest->ip6,
404 tmpdest->ip6.sin6_port = tmpport;
407 if (ret != sizeof(ip6pkt)) {
408 DEBUG(DEBUG_CRIT,(__location__ " failed sendto (%s)\n", strerror(errno)));
414 DEBUG(DEBUG_CRIT,(__location__ " not an ipv4/v6 address\n"));
422 This function is used to open a raw socket to capture from
424 int ctdb_sys_open_capture_socket(const char *iface, void **private_data)
428 /* Open a socket to capture all traffic */
429 s = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
431 DEBUG(DEBUG_CRIT,(__location__ " failed to open raw socket\n"));
435 DEBUG(DEBUG_DEBUG, (__location__ " Created RAW SOCKET FD:%d for tcp tickle\n", s));
438 set_close_on_exec(s);
/*
  This function is used to do any additional cleanup required when closing
  a capture socket.
  Note that the socket itself is closed automatically in the caller.

  Nothing needs cleaning up here, so this always returns 0.

  NOTE(review): the body was not visible in the reviewed listing and is
  reconstructed as a plain success return.
 */
int ctdb_sys_close_capture_socket(void *private_data)
{
	(void)private_data;	/* no per-socket state is kept */

	return 0;
}
455 called when the raw socket becomes readable
457 int ctdb_sys_read_tcp_packet(int s, void *private_data,
458 ctdb_sock_addr *src, ctdb_sock_addr *dst,
459 uint32_t *ack_seq, uint32_t *seq)
462 #define RCVPKTSIZE 100
463 char pkt[RCVPKTSIZE];
464 struct ether_header *eth;
469 ret = recv(s, pkt, RCVPKTSIZE, MSG_TRUNC);
470 if (ret < sizeof(*eth)+sizeof(*ip)) {
475 eth = (struct ether_header *)pkt;
477 /* we want either IPv4 or IPv6 */
478 if (ntohs(eth->ether_type) == ETHERTYPE_IP) {
480 ip = (struct iphdr *)(eth+1);
482 /* We only want IPv4 packets */
483 if (ip->version != 4) {
486 /* Dont look at fragments */
487 if ((ntohs(ip->frag_off)&0x1fff) != 0) {
490 /* we only want TCP */
491 if (ip->protocol != IPPROTO_TCP) {
495 /* make sure its not a short packet */
496 if (offsetof(struct tcphdr, ack_seq) + 4 +
497 (ip->ihl*4) + sizeof(*eth) > ret) {
501 tcp = (struct tcphdr *)((ip->ihl*4) + (char *)ip);
503 /* tell the caller which one we've found */
504 src->ip.sin_family = AF_INET;
505 src->ip.sin_addr.s_addr = ip->saddr;
506 src->ip.sin_port = tcp->source;
507 dst->ip.sin_family = AF_INET;
508 dst->ip.sin_addr.s_addr = ip->daddr;
509 dst->ip.sin_port = tcp->dest;
510 *ack_seq = tcp->ack_seq;
514 } else if (ntohs(eth->ether_type) == ETHERTYPE_IP6) {
516 ip6 = (struct ip6_hdr *)(eth+1);
518 /* we only want TCP */
519 if (ip6->ip6_nxt != IPPROTO_TCP) {
524 tcp = (struct tcphdr *)(ip6+1);
526 /* tell the caller which one we've found */
527 src->ip6.sin6_family = AF_INET6;
528 src->ip6.sin6_port = tcp->source;
529 src->ip6.sin6_addr = ip6->ip6_src;
531 dst->ip6.sin6_family = AF_INET6;
532 dst->ip6.sin6_port = tcp->dest;
533 dst->ip6.sin6_addr = ip6->ip6_dst;
535 *ack_seq = tcp->ack_seq;
545 bool ctdb_sys_check_iface_exists(const char *iface)
550 s = socket(PF_PACKET, SOCK_RAW, 0);
552 /* We dont know if the interface exists, so assume yes */
553 DEBUG(DEBUG_CRIT,(__location__ " failed to open raw socket\n"));
557 strncpy(ifr.ifr_name, iface, sizeof(ifr.ifr_name));
558 if (ioctl(s, SIOCGIFINDEX, &ifr) < 0 && errno == ENODEV) {
559 DEBUG(DEBUG_CRIT,(__location__ " interface '%s' not found\n", iface));
/*
  Get the process ID of the peer connected to the socket fd, using the
  SO_PEERCRED socket option.

  Returns the getsockopt() result: 0 on success with *peer_pid set, -1
  on failure with errno set and *peer_pid left untouched.
 */
int ctdb_get_peer_pid(const int fd, pid_t *peer_pid)
{
	struct ucred cr;
	socklen_t crl = sizeof(struct ucred);
	int ret;

	/* Keep the call and the comparison separate: assigning the result of
	   "getsockopt(...) == 0" would return 1 on success and 0 on failure. */
	ret = getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &cr, &crl);
	if (ret == 0) {
		*peer_pid = cr.pid;
	}
	return ret;
}
/*
 * Find the process name from process ID
 *
 * Resolves the /proc/<pid>/exe link and returns a newly allocated copy
 * of its first space-separated field; the caller frees it.  Returns
 * NULL if the link cannot be read, contains no such field, or memory
 * allocation fails.
 *
 * NOTE(review): the declarations, the terminator assignment and the
 * return statements were not visible in the reviewed listing and are
 * reconstructed.
 */
char *ctdb_get_process_name(pid_t pid)
{
	char path[32];
	char buf[PATH_MAX];
	char *ptr;
	char *saveptr = NULL;
	ssize_t n;

	snprintf(path, sizeof(path), "/proc/%d/exe", (int)pid);
	/* readlink() does not terminate the result, keep room for the NUL */
	n = readlink(path, buf, sizeof(buf) - 1);
	if (n < 0) {
		return NULL;
	}
	buf[n] = '\0';

	/* Remove any extra fields */
	ptr = strtok_r(buf, " ", &saveptr);
	if (ptr == NULL) {
		return NULL;
	}
	return strdup(ptr);
}
/*
 * Set process name
 *
 * At most the first 15 characters of name are used.  Returns the
 * result of prctl(PR_SET_NAME).
 *
 * NOTE(review): the buffer declaration and its terminator assignment
 * were not visible in the reviewed listing and are reconstructed.
 */
int ctdb_set_process_name(const char *name)
{
	char procname[16] = { 0 };
	size_t i;

	/* copy up to 15 characters; the rest of the buffer stays zero */
	for (i = 0; i < sizeof(procname) - 1 && name[i] != '\0'; i++) {
		procname[i] = name[i];
	}

	return prctl(PR_SET_NAME, (unsigned long)procname, 0, 0, 0);
}
614 * Parsing a line from /proc/locks,
616 static bool parse_proc_locks_line(char *line, pid_t *pid,
617 struct ctdb_lock_info *curlock)
621 /* output of /proc/locks
624 * 1: POSIX ADVISORY WRITE 25945 fd:00:6424820 212 212
627 * 1: -> POSIX ADVISORY WRITE 25946 fd:00:6424820 212 212
631 ptr = strtok_r(line, " ", &saveptr);
632 if (ptr == NULL) return false;
635 ptr = strtok_r(NULL, " ", &saveptr);
636 if (ptr == NULL) return false;
637 if (strcmp(ptr, "->") == 0) {
638 curlock->waiting = true;
639 ptr = strtok_r(NULL, " ", &saveptr);
641 curlock->waiting = false;
645 if (ptr == NULL || strcmp(ptr, "POSIX") != 0) {
650 ptr = strtok_r(NULL, " ", &saveptr);
651 if (ptr == NULL) return false;
654 ptr = strtok_r(NULL, " ", &saveptr);
655 if (ptr == NULL) return false;
656 if (strcmp(ptr, "READ") == 0) {
657 curlock->read_only = true;
658 } else if (strcmp(ptr, "WRITE") == 0) {
659 curlock->read_only = false;
665 ptr = strtok_r(NULL, " ", &saveptr);
666 if (ptr == NULL) return false;
669 /* MAJOR:MINOR:INODE */
670 ptr = strtok_r(NULL, " :", &saveptr);
671 if (ptr == NULL) return false;
672 ptr = strtok_r(NULL, " :", &saveptr);
673 if (ptr == NULL) return false;
674 ptr = strtok_r(NULL, " :", &saveptr);
675 if (ptr == NULL) return false;
676 curlock->inode = atol(ptr);
679 ptr = strtok_r(NULL, " ", &saveptr);
680 if (ptr == NULL) return false;
681 curlock->start = atol(ptr);
684 ptr = strtok_r(NULL, " ", &saveptr);
685 if (ptr == NULL) return false;
686 if (strncmp(ptr, "EOF", 3) == 0) {
687 curlock->end = (off_t)-1;
689 curlock->end = atol(ptr);
696 * Find information of lock being waited on for given process ID
698 bool ctdb_get_lock_info(pid_t req_pid, struct ctdb_lock_info *lock_info)
701 struct ctdb_lock_info curlock;
707 if ((fp = fopen("/proc/locks", "r")) == NULL) {
708 DEBUG(DEBUG_ERR, ("Failed to read locks information"));
711 while ((ptr = fgets(buf, sizeof(buf), fp)) != NULL) {
712 if (! parse_proc_locks_line(buf, &pid, &curlock)) {
715 if (pid == req_pid && curlock.waiting) {
716 *lock_info = curlock;
727 * Find process ID which holds an overlapping byte lock for required
728 * inode and byte range.
730 bool ctdb_get_blocker_pid(struct ctdb_lock_info *reqlock, pid_t *blocker_pid)
733 struct ctdb_lock_info curlock;
739 if ((fp = fopen("/proc/locks", "r")) == NULL) {
740 DEBUG(DEBUG_ERR, ("Failed to read locks information"));
743 while ((ptr = fgets(buf, sizeof(buf), fp)) != NULL) {
744 if (! parse_proc_locks_line(buf, &pid, &curlock)) {
748 if (curlock.waiting) {
752 if (curlock.inode != reqlock->inode) {
756 if (curlock.start > reqlock->end ||
757 curlock.end < reqlock->start) {
758 /* Outside the required range */