diff --git a/tests/bench_rollover.c b/tests/bench_rollover.c new file mode 100644 index 0000000..90db025 --- /dev/null +++ b/tests/bench_rollover.c @@ -0,0 +1,318 @@ +/* + * Copyright 2015 Google Inc. + * Author: willemb@google.com (Willem de Bruijn) + * + * bench_rollover: stress test the Linux kernel PF_PACKET rollover feature. + * + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. * See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAX_NUM_SOCK 64 +#define DFLT_NUM_SOCK 8 + +#define RING_NUM_FRAMES 1024 +#define RING_FRAME_LEN 4096 + +#ifndef PACKET_ROLLOVER_STATS +#define PACKET_ROLLOVER_STATS 21 + +struct tpacket_rollover_stats { + __aligned_u64 tp_all; + __aligned_u64 tp_huge; + __aligned_u64 tp_failed; +}; +#endif + +static bool do_stop; + +static int cfg_num_sock = DFLT_NUM_SOCK; +static bool cfg_use_ring; +static int cfg_ratelimit_ms; +static bool cfg_stats_rollover; +static bool cfg_verbose; + +static void setcpu(int cpu) +{ + cpu_set_t mask; + + CPU_ZERO(&mask); + CPU_SET(cpu, &mask); + if (sched_setaffinity(0, sizeof(mask), &mask)) + error(1, errno, "sched.%d", cpu); +} + +static void bindtodev(int cpu, int fd, const char *dev) +{ + struct sockaddr_ll addr = {}; + static int ifindex; + + if (!ifindex) + ifindex = if_nametoindex(dev); + if (!ifindex) + error(1, errno, "if_nametoindex.%s", dev); + + addr.sll_family = AF_PACKET; + addr.sll_ifindex = ifindex; + addr.sll_protocol = htons(ETH_P_IP); + addr.sll_halen = ETH_ALEN; + if (bind(fd, (void *) &addr, sizeof(addr))) + error(1, errno, "bind.%d", cpu); +} + +static void sighandler(int sig) +{ + do_stop = true; +} + +static char *setrxring(int fd) +{ + struct tpacket_req req = { + .tp_block_size = RING_FRAME_LEN, + .tp_frame_size = RING_FRAME_LEN, + .tp_block_nr = RING_NUM_FRAMES, + .tp_frame_nr = RING_NUM_FRAMES, + }; + char *ring; + int val = TPACKET_V2; + + if (setsockopt(fd, SOL_PACKET, PACKET_VERSION, &val, sizeof(val))) + error(1, errno, "setsockopt version"); + if (setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req))) + error(1, errno, "setsockopt ring"); + + ring = mmap(0, req.tp_block_size * req.tp_block_nr, + PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (!ring) + error(1, errno, "setsockopt mmap"); + + return ring; +} + +int reader(int cpu, int fd, char *ring) +{ + struct tpacket2_hdr *hdr; + struct tpacket_stats tpstats; + struct tpacket_rollover_stats rstats; + struct pollfd pfd; + char buf[ETH_FRAME_LEN]; + unsigned long packets = 0; + socklen_t slen; + int ret, index = 0; + + setcpu(cpu); + + hdr = (void *) ring; + while (!do_stop) { + if (ring) { + int budget = RING_NUM_FRAMES; + while (hdr->tp_status & TP_STATUS_USER && budget--) { + memcpy(buf, ((void *) hdr) + hdr->tp_net, + ETH_FRAME_LEN /* add some copy cost */); + hdr->tp_status = TP_STATUS_KERNEL; + + packets++; + index++; + if (index == RING_NUM_FRAMES) + index = 0; + + hdr = (void *) ((unsigned long) ring) + + (index * RING_FRAME_LEN); + + if (cfg_ratelimit_ms && + (packets % (cfg_ratelimit_ms / 10)) == 0) + usleep(100); + } + if (do_stop) + continue; + + pfd.fd = fd; + pfd.events = POLLIN; + pfd.revents = 0; + ret = poll(&pfd, 1, 100); + } else { + ret = read(fd, buf, sizeof(buf)); + } + if (ret == -1 && errno == EINTR) + break; + if (ret == -1) + error(1, errno, "%s.%d", ring ? "poll" : "read", cpu); + if (!ring) + packets++; + } + + slen = sizeof(tpstats); + if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, + &tpstats, &slen)) + error(1, errno, "packetstat.%d", cpu); + + if (cfg_stats_rollover) { + slen = sizeof(rstats); + if (getsockopt(fd, SOL_PACKET, PACKET_ROLLOVER_STATS, + &rstats, &slen)) + error(1, errno, "rolloverstat.%d", cpu); + } else { + memset(&rstats, 0, sizeof(rstats)); + } + + if (packets) { + usleep(cpu * 4000); /* poor man's sorting */ + fprintf(stderr, "%3d %10lu %10u %10u %10" PRIu64 " %10" PRIu64 " %10" PRIu64 "\n", + cpu, packets, + tpstats.tp_packets - tpstats.tp_drops, + tpstats.tp_drops, + (uint64_t) rstats.tp_all, + (uint64_t) rstats.tp_huge, + (uint64_t) rstats.tp_failed); + } + + if (ring && munmap(ring, RING_FRAME_LEN * RING_NUM_FRAMES)) + error(1, errno, "munmap.%d", cpu); + + if (close(fd)) + error(1, errno, "close.%d", cpu); + + return 0; +} + +static void __attribute__((noreturn)) usage(const char *filepath) +{ + fprintf(stderr, "Usage: %s [-h] [-l len] [-n num] [-r] [-s] [-v]\n", + filepath); + exit(1); +} + +static void parse_opt(int argc, char **argv) +{ + const char on[] = "ON", off[] = "OFF"; + char c; + + while ((c = getopt(argc, argv, "hl:n:rsv")) != -1) { + switch (c) { + case 'h': + usage(argv[0]); + break; + case 'l': + cfg_ratelimit_ms = strtoul(optarg, NULL, 10); + break; + case 'n': + cfg_num_sock = strtoul(optarg, NULL, 10); + if (cfg_num_sock > MAX_NUM_SOCK) + error(1, 0, "num exceeds %u\n", MAX_NUM_SOCK); + break; + case 'r': + cfg_use_ring = true; + break; + case 's': + cfg_stats_rollover = true; + break; + case 'v': + cfg_verbose = true; + break; + default: + error(1, 0, "unknown parameter %c", c); + } + } + + if (cfg_verbose) + fprintf(stderr, "socks: %d\n" + "rate: %d K pps\n" + "ring: %s\n" + "rstat: %s\n", + cfg_num_sock, + cfg_ratelimit_ms ? cfg_ratelimit_ms : -1, + cfg_use_ring ? on : off, + cfg_stats_rollover ? on : off); +} + +int main(int argc, char **argv) +{ + static int fds[MAX_NUM_SOCK]; + static char *rings[MAX_NUM_SOCK]; + struct sigaction sig = {}; + pid_t pid, pgid; + int i, val; + + parse_opt(argc, argv); + + pgid = getpgid(0); + + /* make recv return with EINTR */ + sig.sa_handler = sighandler; + if (sigaction(SIGINT, &sig, NULL)) + error(1, errno, "sigaction"); + + for (i = 0; i < cfg_num_sock; i++) { + fds[i] = socket(PF_PACKET, SOCK_RAW, 0); + if (fds[i] == -1) + error(1, errno, "socket.%d", i); + + bindtodev(i, fds[i], "eth0"); + + if (cfg_use_ring) { + rings[i] = setrxring(fds[i]); + memset(rings[i], 0, RING_NUM_FRAMES * RING_FRAME_LEN); + } + + val = PACKET_FANOUT_CPU | PACKET_FANOUT_FLAG_ROLLOVER; + val <<= 16; + if (setsockopt(fds[i], SOL_PACKET, PACKET_FANOUT, + &val, sizeof(val))); + } + + for (i = 0; i < cfg_num_sock; i++) { + pid = fork(); + if (pid == -1) + error(1, errno, "fork.%d", i); + if (!pid) + return reader(i, fds[i], rings[i]); + } + + fprintf(stderr, "Press [Enter] to exit\n"); + getchar(); + + fprintf(stderr, "cpu rx rx.k drop.k rollover r.huge r.failed\n"); + kill(-pgid, SIGINT); + usleep(cfg_num_sock * 5000); + return 0; +} + diff --git a/tests/pingpong_tcpudp.c b/tests/pingpong_tcpudp.c new file mode 100644 index 0000000..6a385d1 --- /dev/null +++ b/tests/pingpong_tcpudp.c @@ -0,0 +1,305 @@ +/* GNU GPLv2 applies */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static bool cfg_is_ack; +static bool cfg_is_client; +static int cfg_ifindex; +static struct in_addr cfg_ip_dst; +static struct in_addr cfg_ip_src; +static char cfg_mac_dst[ETH_HLEN]; +static char cfg_mac_src[ETH_HLEN]; +const int cfg_num_runs = 10; +const int cfg_payload_len = 10; /* must be even */ +static int cfg_pkt_len; +static int cfg_proto = IPPROTO_TCP; +const int cfg_tcp_dst = 0x2222; +const int cfg_tcp_src = 0x1111; +const int cfg_timeout_us = 1000 * 1000; + +static char packet[ETH_DATA_LEN]; + +static unsigned long gettimeofday_us(void) +{ + struct timeval tv; + + gettimeofday(&tv, NULL); + return (1000UL * 1000 * tv.tv_sec) + tv.tv_usec; +} + +static uint16_t calc_csum(unsigned long sum, const uint16_t *data, + int num_words) +{ + int i; + + for (i = 0; i < num_words; i++) + sum += data[i]; + + while (sum >> 16) + sum = (sum & 0xffff) + (sum >> 16); + + return ~sum; +} + +static uint16_t calc_tcp_csum(struct iphdr *iph, struct tcphdr *tcph) +{ + unsigned long sum = 0, tcplen; + + tcplen = ntohs(iph->tot_len) - sizeof(*iph); + if (tcplen & 1) + error(1, 0, "odd length: csum needs padding"); + + sum += iph->daddr; + sum += iph->saddr; + sum += htons(iph->protocol); + sum += htons(tcplen); + + return calc_csum(sum, (void *) tcph, tcplen >> 1); +} + +static void build_pkt(void) +{ + struct ethhdr *eth; + struct iphdr *iph; + struct tcphdr *tcph; + struct udphdr *udph; + int off = 0, tslen; + + eth = (void *) packet; + memcpy(ð->h_dest, cfg_mac_dst, ETH_ALEN); + memcpy(ð->h_source, cfg_mac_src, ETH_ALEN); + eth->h_proto = htons(ETH_P_IP); + off += sizeof(*eth); + + if (cfg_proto == IPPROTO_UDP) + tslen = sizeof(*udph); + else + tslen = sizeof(*tcph); + cfg_pkt_len = sizeof(*eth) + sizeof(*iph) + tslen + cfg_payload_len; + + iph = (void *) packet + off; + iph->version = 4; + iph->ihl = 5; + iph->ttl = 2; + iph->id = 666; + iph->frag_off = htons(IP_DF); + iph->tot_len = htons((uint16_t) (sizeof(*iph) + tslen + cfg_payload_len)); + iph->saddr = cfg_ip_src.s_addr; + iph->daddr = cfg_ip_dst.s_addr; + iph->protocol = cfg_proto; + iph->check = calc_csum(0, (void *) iph, sizeof(*iph) >> 1); + off += sizeof(*iph); + + if (cfg_proto == IPPROTO_UDP) { + udph = (void *) packet + off; + udph->dest = htons(cfg_tcp_dst); + udph->source = htons(cfg_tcp_src); + udph->len = htons(tslen + cfg_payload_len); + udph->check = 0; + off += sizeof(*udph); + } else { + tcph = (void *) packet + off; + tcph->dest = htons(cfg_tcp_dst); + tcph->source = htons(cfg_tcp_src); + tcph->seq = htonl(1); + tcph->ack_seq = htonl(1); + tcph->doff = 5; + if (cfg_is_ack) + tcph->ack = 1; + tcph->psh = 1; + tcph->window = htons(16000); + tcph->check = calc_tcp_csum(iph, tcph); + off += sizeof(*tcph); + } +} + +static void do_recv(int fd) +{ + char rdata[ETH_DATA_LEN] = {0}; + int ret; + + ret = read(fd, rdata, sizeof(rdata)); + if (ret == -1) + error(1, errno, "read"); + /* TODO: understand why returned packet exceeds cfg_pkt_len */ + if (ret < cfg_pkt_len) + error(1, 0, "read: %uB != %uB\n", ret, cfg_pkt_len); +} + +static void do_send(int fd) +{ + int ret; + + ret = send(fd, packet, cfg_pkt_len, MSG_DONTWAIT); + if (ret == -1) + error(1, errno, "write"); + if (ret != cfg_pkt_len) + error(1, 0, "write: %uB != %uB\n", ret, cfg_pkt_len); +} + +static void do_server(int fd) +{ + unsigned long t1, t2 = 0; + int i; + + for (i = 0; i < cfg_num_runs; i++) { + do_recv(fd); + t1 = t2; + t2 = gettimeofday_us(); + do_send(fd); + + if (t1) + fprintf(stderr, "%d. RTT: %lu usec\n", i, t2 - t1); + } + do_send(fd); +} + +static void do_client(int fd) +{ + unsigned long t1, t2; + int i; + + for (i = 0; i < cfg_num_runs; i++) { + t1 = gettimeofday_us(); + do_send(fd); + do_recv(fd); + t2 = gettimeofday_us(); + + fprintf(stderr, "%d. RTT: %lu usec\n", i, t2 - t1); + } +} + +static void set_filter(int fd) +{ + const int ip_prot_off = sizeof(struct ethhdr) + + __builtin_offsetof(struct iphdr, protocol); + const int tcp_dst_off = sizeof(struct ethhdr) + sizeof(struct iphdr) + + __builtin_offsetof(struct tcphdr, dest); + + struct sock_filter filter[] = { + BPF_STMT(BPF_LD + BPF_B + BPF_ABS, ip_prot_off), + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, cfg_proto, 0, 3), + BPF_STMT(BPF_LD + BPF_H + BPF_ABS, tcp_dst_off), + BPF_JUMP(BPF_JMP + BPF_JEQ + BPF_K, cfg_tcp_dst, 0, 1), + BPF_STMT(BPF_RET + BPF_K, 0xFFFF), + BPF_STMT(BPF_RET + BPF_K, 0), + }; + struct sock_fprog prog; + + prog.filter = filter; + prog.len = sizeof(filter) / sizeof(struct sock_filter); + if (setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog))) + error(1, errno, "setsockopt filter mark"); +} + +static void read_mac(const char *mac_str, char *mac_bin) +{ + if (sscanf(mac_str, "%hhx:%hhx:%hhx:%hhx:%hhx:%hhx", + &mac_bin[0], &mac_bin[1], &mac_bin[2], + &mac_bin[3], &mac_bin[4], &mac_bin[5]) != 6) + error(1, 0, "bad mac: %s\n", optarg); +} + +static void parse_opts(int argc, char **argv) +{ + int c; + + while ((c = getopt(argc, argv, "acd:D:i:s:S:u")) != -1) { + switch (c) { + case 'a': + cfg_is_ack = true; + break; + case 'c': + cfg_is_client = true; + break; + case 'd': + if (inet_pton(PF_INET, optarg, &cfg_ip_dst) != 1) + error(1, 0, "bad src ip: %s\n", optarg); + break; + case 'D': + read_mac(optarg, cfg_mac_dst); + break; + case 'i': + cfg_ifindex = if_nametoindex(optarg); + if (!cfg_ifindex) + error(1, errno, "if_nametoindex"); + break; + case 's': + if (inet_pton(PF_INET, optarg, &cfg_ip_src) != 1) + error(1, 0, "bad src ip: %s\n", optarg); + break; + case 'S': + read_mac(optarg, cfg_mac_src); + break; + case 'u': + cfg_proto = IPPROTO_UDP; + break; + default: + error(1, 0, "unknown option %c", c); + } + } + + if (!cfg_ifindex) { + cfg_ifindex = if_nametoindex("eth0"); + if (!cfg_ifindex) + error(1, errno, "if_nametoindex"); + } +} + +int main(int argc, char **argv) +{ + struct sockaddr_ll addr = {0}; + int fd; + + parse_opts(argc, argv); + fprintf(stderr, "mode: %s\n", cfg_is_client ? "client" : "server"); + + fd = socket(PF_PACKET, SOCK_RAW, 0); + if (fd == -1) + error(1, errno, "socket"); + + set_filter(fd); + + addr.sll_family = AF_PACKET; + addr.sll_protocol = htons(ETH_P_IP); + addr.sll_ifindex = cfg_ifindex; + if (bind(fd, (void*) &addr, sizeof(addr))) + error(1, errno, "bind"); + + build_pkt(); + + if (cfg_is_client) + do_client(fd); + else + do_server(fd); + + if (close(fd)) + error(1, errno, "close"); + + return 0; +} + diff --git a/tests/psock_rxring_vnet.c b/tests/psock_rxring_vnet.c new file mode 100644 index 0000000..fcd3fea --- /dev/null +++ b/tests/psock_rxring_vnet.c @@ -0,0 +1,254 @@ +/* + * A packet sniffer that combines PACKET_RX_RING and PACKET_VNET_HDR + */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static bool cfg_enable_vnet = false; +static int cfg_num_frames = 1024; +static int cfg_runtime_sec = 1; + +static struct tpacket_req req; + +static unsigned long gettimeofday_ms(void) +{ + struct timeval tv; + + gettimeofday(&tv, NULL); + return (tv.tv_sec * 1000) + (tv.tv_usec / 1000); +} + +static int socket_open(void) +{ + int fd, val; + + fd = socket(PF_PACKET, SOCK_RAW, 0 /* disable until ring is ready */); + if (fd == -1) + error(1, errno, "socket"); + + val = TPACKET_V2; + if (setsockopt(fd, SOL_PACKET, PACKET_VERSION, &val, sizeof(val))) + error(1, errno, "setsockopt version"); + + if (cfg_enable_vnet) { + val = 1; + if (setsockopt(fd, SOL_PACKET, PACKET_VNET_HDR, + &val, sizeof(val))) + error(1, errno, "setsockopt vnet_hdr"); + } + + return fd; +} + +static void socket_bind(int fd) +{ + struct sockaddr_ll addr = {}; + + addr.sll_family = AF_PACKET; + addr.sll_protocol = htons(ETH_P_IP); + if (bind(fd, (void *) &addr, sizeof(addr)) == -1) + error(1, errno, "packetsock bind"); +} + +static char * ring_open(int fd) +{ + char *ring; + + req.tp_frame_size = 256; + req.tp_frame_nr = cfg_num_frames; + req.tp_block_size = getpagesize(); + req.tp_block_nr = (req.tp_frame_size * req.tp_frame_nr) / + req.tp_block_size; + + if (setsockopt(fd, SOL_PACKET, PACKET_RX_RING, + (void*) &req, sizeof(req))) + error(1, errno, "setsockopt ring"); + + ring = mmap(0, req.tp_block_size * req.tp_block_nr, + PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (ring == MAP_FAILED) + error(1, errno, "mmap"); + + return ring; +} + +/* portability warning: ignoring virtio endiannes */ +static void parse_vnet(struct virtio_net_hdr *vnet) +{ + uint16_t gso_type; + char *type; + + gso_type = vnet->gso_type & ~VIRTIO_NET_HDR_GSO_ECN; + switch (gso_type) { + case VIRTIO_NET_HDR_GSO_NONE: + type = "none"; + break; + case VIRTIO_NET_HDR_GSO_TCPV4: + type = "tcpv4"; + break; + case VIRTIO_NET_HDR_GSO_TCPV6: + type = "tcpv6"; + break; + case VIRTIO_NET_HDR_GSO_UDP: + type = "udp"; + break; + default: + type = "unknown"; + } + + fprintf(stderr, "vnet: gso_type=%s gso_size=%u hlen=%u ecn=%s\n", + type, vnet->gso_size, vnet->hdr_len, + (vnet->gso_type & VIRTIO_NET_HDR_GSO_ECN) ? "on " : "off"); + + if (vnet->flags == VIRTIO_NET_HDR_F_NEEDS_CSUM) + fprintf(stderr, "csum: start=%u off=%u\n", + vnet->csum_start, vnet->csum_offset); + +} + +static void parse_ipv4(struct iphdr *iph) +{ + char saddr[INET_ADDRSTRLEN], daddr[INET_ADDRSTRLEN]; + + if (!inet_ntop(AF_INET, &iph->saddr, saddr, sizeof(saddr))) + error(1, errno, "inet_ntop saddr"); + if (!inet_ntop(AF_INET, &iph->daddr, daddr, sizeof(daddr))) + error(1, errno, "inet_ntop daddr"); + + fprintf(stderr, "ip: src=%s dst=%s proto=%u len=%u\n", + saddr, daddr, iph->protocol, ntohs(iph->tot_len)); +} + +/* portability warning: assumes ethernet */ +static void __ring_read(struct tpacket2_hdr *hdr, void *data) +{ + struct timeval tv; + uint16_t eth_proto; + struct ethhdr *eth = (void *) data; + + gettimeofday(&tv, NULL); + fprintf(stderr, "\npkt: %lu.%lu len=%u\n", + tv.tv_sec, tv.tv_usec, hdr->tp_len); + + if (cfg_enable_vnet) + parse_vnet(data - sizeof(struct virtio_net_hdr)); + + eth_proto = htons(eth->h_proto); + fprintf(stderr, "eth: proto=0x%x\n", eth_proto); + if (eth_proto == ETH_P_IP) + parse_ipv4(data + ETH_HLEN); +} + +static void ring_read(void *ring, int index) +{ + struct tpacket2_hdr *header = ring + (index * req.tp_frame_size); + + if (!(header->tp_status & TP_STATUS_USER)) + error(1, 0, "ring: no data (0x%x)", header->tp_status); + + + __ring_read(header, ((void *) header) + header->tp_mac); + header->tp_status = TP_STATUS_KERNEL; +} + +static void ring_close(char *ring) +{ + if (munmap(ring, req.tp_block_size * req.tp_block_nr)) + error(1, errno, "munmap"); +} + +static bool ring_poll(int fd) +{ + struct pollfd pfd; + int ret; + + pfd.fd = fd; + pfd.events = POLLIN; + ret = poll(&pfd, 1, 100); + if (ret == -1) + error(1, errno, "poll"); + if (ret == 0) + return false; + if (pfd.revents != POLLIN) + error(1, 0, "unexpected event (0x%x)", pfd.revents); + + return true; +} + +static void do_run(int fd, char *ring) +{ + int64_t tstop, index = 0; + + tstop = gettimeofday_ms() + (cfg_runtime_sec * 1000); + + while (gettimeofday_ms() < tstop) { + if (ring_poll(fd)) { + ring_read(ring, index % cfg_num_frames); + index++; + } + } + + fprintf(stderr, "total: %lu packets\n", index); +} + +static void parse_opts(int argc, char **argv) +{ + int c; + + while ((c = getopt(argc, argv, "v")) != -1) + { + switch (c) { + case 'v': + cfg_enable_vnet = true; + break; + default: + error(1, 0, "unknown option %c", c); + } + } +} + +int main(int argc, char **argv) +{ + char *ring; + int fd; + + parse_opts(argc, argv); + + fprintf(stderr, "vnet: %sabled\n", cfg_enable_vnet ? "en" : "dis"); + + fd = socket_open(); + socket_bind(fd); + ring = ring_open(fd); + + do_run(fd, ring); + + ring_close(ring); + if (close(fd) == -1) + error(1, errno, "close"); + + return 0; +} diff --git a/tests/psock_txring_vnet.c b/tests/psock_txring_vnet.c new file mode 100644 index 0000000..a951d60 --- /dev/null +++ b/tests/psock_txring_vnet.c @@ -0,0 +1,511 @@ +/* Inject packets with PACKET_TX_RING and PACKET_VNET_HDR */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if 0 +/* requires libcap-dev */ +#include +#else +extern int capset(cap_user_header_t header, cap_user_data_t data); +extern int capget(cap_user_header_t header, const cap_user_data_t data); +#endif + +#ifndef PACKET_QDISC_BYPASS +#define PACKET_QDISC_BYPASS 20 +#endif + +static bool cfg_enable_ring = true; +static bool cfg_enable_vnet = false; +static bool cfg_enable_csum = true; /* only used if cfg_enable_vnet */ +static bool cfg_enable_gso = true; /* only used if cfg_enable_vnet */ +static bool cfg_vector_send = false; +static char *cfg_ifname = "eth0"; +static int cfg_ifindex; +static int cfg_num_frames = 4; +static unsigned int cfg_override_len = UINT_MAX; +static unsigned int cfg_payload_len = 500; +static bool cfg_qdisc_bypass = false; + +static struct tpacket_req req; +static struct in_addr ip_saddr, ip_daddr; + +/* must configure real daddr (should really infer or pass on cmdline) */ +const char cfg_mac_src[] = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55 }; +const char cfg_mac_dst[] = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55 }; + +static int socket_open(void) +{ + int fd, val; + + fd = socket(PF_PACKET, SOCK_RAW, 0 /* disable reading */); + if (fd == -1) + error(1, errno, "socket"); + + if (cfg_enable_ring) { + val = TPACKET_V2; + if (setsockopt(fd, SOL_PACKET, PACKET_VERSION, &val, sizeof(val))) + error(1, errno, "setsockopt version"); + } + + if (cfg_qdisc_bypass) { + val = 1; + if (setsockopt(fd, SOL_PACKET, PACKET_QDISC_BYPASS, + &val, sizeof(val))) + error(1, errno, "setsockopt qdisc bypass"); + } + + if (cfg_enable_vnet) { + val = 1; + if (setsockopt(fd, SOL_PACKET, PACKET_VNET_HDR, + &val, sizeof(val))) + error(1, errno, "setsockopt vnet_hdr"); + } + + return fd; +} + +static char * ring_open(int fd) +{ + char *ring; + unsigned int frame_sz; + + frame_sz = cfg_payload_len + 100 /* overestimate */; + frame_sz = 1 << (32 - __builtin_clz(frame_sz)); + if (frame_sz < getpagesize()) + frame_sz = getpagesize(); + + fprintf(stderr, "frame size: %u\n", frame_sz); + + req.tp_frame_size = frame_sz; + req.tp_frame_nr = cfg_num_frames; + req.tp_block_size = req.tp_frame_size; + req.tp_block_nr = cfg_num_frames; + + if (setsockopt(fd, SOL_PACKET, PACKET_TX_RING, + (void*) &req, sizeof(req))) + error(1, errno, "setsockopt ring"); + + ring = mmap(0, req.tp_block_size * req.tp_block_nr, + PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); + if (ring == MAP_FAILED) + error(1, errno, "mmap"); + + return ring; +} + +/* warning: does not handle odd length */ +static unsigned long add_csum_hword(const uint16_t *start, int num_u16) +{ + unsigned long sum = 0; + int i; + + for (i = 0; i < num_u16; i++) + sum += start[i]; + + return sum; +} + +static uint16_t add_csum_hword_fold(const uint16_t *start, int num_u16, + unsigned long sum) +{ + sum += add_csum_hword(start, num_u16); + + while (sum >> 16) + sum = (sum & 0xffff) + (sum >> 16); + + return sum; +} + +static uint16_t build_ip_csum(const uint16_t *start, int num_u16, + unsigned long sum) +{ + return ~add_csum_hword_fold(start, num_u16, sum); +} + +static uint16_t get_tcp_v4_csum(const struct iphdr *iph, + const struct tcphdr *tcph, + int length) +{ + unsigned long pseudo_sum = 0; + uint16_t proto = htons(IPPROTO_TCP); + uint16_t ulen = htons(length); + + pseudo_sum = add_csum_hword_fold((void *) &iph->saddr, 4, 0); + pseudo_sum = add_csum_hword_fold(&proto, 1, pseudo_sum); + pseudo_sum = add_csum_hword_fold(&ulen, 1, pseudo_sum); + + if (cfg_enable_vnet) + return pseudo_sum; + else + return build_ip_csum((void *) tcph, length >> 1, pseudo_sum); +} + +static void set_vheader(void *buffer) +{ + struct virtio_net_hdr *vnet; + vnet = buffer; + if (cfg_enable_csum) { + vnet->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM; + vnet->csum_start = ETH_HLEN + sizeof(struct iphdr); + vnet->csum_offset = __builtin_offsetof(struct tcphdr, check); + } + + if (cfg_enable_gso) { + vnet->hdr_len = ETH_HLEN + sizeof(struct iphdr) + sizeof(struct tcphdr); + vnet->gso_type = VIRTIO_NET_HDR_GSO_TCPV4; + vnet->gso_size = ETH_DATA_LEN - sizeof(struct iphdr) - + sizeof(struct tcphdr); + } else { + vnet->gso_type = VIRTIO_NET_HDR_GSO_NONE; + } +} + +static int set_packet(void *buffer, unsigned int off, unsigned int payload_len) +{ + struct ethhdr *eth; + struct iphdr *iph; + struct tcphdr *tcph; + + eth = buffer + off; + memcpy(ð->h_source, cfg_mac_src, ETH_ALEN); + memcpy(ð->h_dest, cfg_mac_dst, ETH_ALEN); + eth->h_proto = htons(ETH_P_IP); + off += ETH_HLEN; + + iph = buffer + off; + iph->ttl = 8; + iph->ihl = 5; + iph->version = 4; + iph->saddr = ip_saddr.s_addr; + iph->daddr = ip_daddr.s_addr; + iph->protocol = IPPROTO_TCP; + iph->tot_len = htons(sizeof(*iph) + sizeof(*tcph) + payload_len); + iph->check = build_ip_csum((const void *) iph, 10 /* hwords */, 0); + off += sizeof(*iph); + + tcph = buffer + off; + tcph->dest = htons(9); + tcph->source = htons(9); + tcph->doff = sizeof(*tcph) >> 2; + off += sizeof(*tcph); + + memset(buffer + off, 'a', payload_len); + + tcph->check = get_tcp_v4_csum(iph, tcph, + (sizeof(*tcph) + payload_len)); + return off + payload_len; +} + +static int frame_fill(void *buffer, unsigned int payload_len) +{ + struct ethhdr *eth; + struct iphdr *iph; + struct tcphdr *tcph; + int off = 0; + + if (cfg_enable_vnet) { + set_vheader(buffer); + off += sizeof(struct virtio_net_hdr); + } + + return set_packet(buffer, off, payload_len); +} + +static void ring_write(void *slot) +{ + struct tpacket2_hdr *header = slot; + int len; + + if (header->tp_status != TP_STATUS_AVAILABLE) + error(1, 0, "write: slot not available"); + + header->tp_mac = TPACKET2_HDRLEN - sizeof(struct sockaddr_ll); + memset(slot + header->tp_mac, 0, req.tp_frame_size - header->tp_mac); + + len = frame_fill(slot + header->tp_mac, cfg_payload_len); + if (cfg_override_len < len) + len = cfg_override_len; + + header->tp_len = len; + header->tp_status = TP_STATUS_SEND_REQUEST; +} + +static void socket_write(int fd) +{ + static char buf[ETH_HLEN + (1 << 16)]; + int len, ret; + + memset(buf, 0, sizeof(buf)); + len = frame_fill(buf, cfg_payload_len); + + if (cfg_override_len < len) + len = cfg_override_len; + + ret = send(fd, buf, len, 0); + if (ret == -1) + error(1, errno, "send"); + if (ret < len) + error(1, 0, "send: %uB < %uB\n", ret, len); +} + +static void vector_write(int fd, int count) +{ + struct mmsghdr *loop, *msgvec = NULL; + struct iovec *iov = NULL; + int i, ret; + char *packet; + + fprintf(stderr, "vector size: %u\n", count); + msgvec = malloc(sizeof(struct mmsghdr) * count); + if (msgvec == NULL) { + error(1, ENOMEM, "alloc mmsg vector"); + } + iov = malloc(sizeof(struct iovec) * count * 3); + if (iov == NULL) { + error(1, ENOMEM, "alloc iov vector"); + } + loop = msgvec; + for (i = 0; i < count ; i++) { + loop->msg_hdr.msg_iov = iov; + loop->msg_hdr.msg_iovlen = 2; + loop->msg_hdr.msg_control = NULL; + loop->msg_hdr.msg_controllen = 0; + loop->msg_hdr.msg_flags = MSG_DONTWAIT; + loop->msg_hdr.msg_name = NULL; + loop->msg_hdr.msg_namelen = 0; + if (cfg_enable_vnet) { + loop->msg_hdr.msg_iovlen += 1; + iov->iov_base = malloc(sizeof (struct virtio_net_hdr)); + if (iov->iov_base == NULL) { + error(1, ENOMEM, "alloc vnet hdr"); + iov->iov_len = 0; + } else { + iov->iov_len = sizeof(struct virtio_net_hdr); + set_vheader(iov->iov_base); + } + iov++; + } + packet = malloc( + cfg_payload_len + sizeof(struct ethhdr) + + sizeof(struct iphdr) + sizeof(struct tcphdr)); + if (packet == NULL) { + error(1, ENOMEM, "alloc payload"); + iov->iov_len = 0; + } else { + set_packet(packet, 0, cfg_payload_len); + iov->iov_base = packet; + iov->iov_len = sizeof(struct ethhdr) + + sizeof(struct iphdr) + sizeof(struct tcphdr); + iov++; + iov->iov_base = packet + sizeof(struct ethhdr) + + sizeof(struct iphdr) + sizeof(struct tcphdr); + iov->iov_len = cfg_payload_len; + } + iov++; + loop++; + } + ret = sendmmsg(fd, msgvec, count, 0); + if (ret == -1) + error(1, errno, "send"); + if (ret < count) + error(1, 0, "send: %uB < %uB\n", ret, count); +} + +static void socket_bind(int fd) +{ + struct sockaddr_ll addr = { 0 }; + + addr.sll_family = AF_PACKET; + addr.sll_ifindex = cfg_ifindex; + addr.sll_protocol = htons(ETH_P_IP); + addr.sll_halen = ETH_ALEN; + + if (bind(fd, (void *) &addr, sizeof(addr))) + error(1, errno, "bind"); +} + +static void ring_wake_kernel(int fd) +{ + int ret; + + ret = send(fd, NULL, 0, 0); + if (ret < 0) + error(1, errno, "send"); + if (!ret) + error(1, 0, "send: no data"); + + fprintf(stderr, "send: %uB\n", ret); +} + +static void ring_close(char *ring) +{ + if (munmap(ring, req.tp_block_size * req.tp_block_nr)) + error(1, errno, "munmap"); +} + +static void do_run_ring(int fd, char *ring) +{ + int i; + + for (i = 0; i < cfg_num_frames; i++) + ring_write(ring + (i * req.tp_frame_size)); + + ring_wake_kernel(fd); +} + +static void do_run(int fd) +{ + int i; + + for (i = 0; i < cfg_num_frames; i++) + socket_write(fd); +} + +static void drop_capability(uint32_t capability) +{ + struct __user_cap_header_struct hdr = {}; + struct __user_cap_data_struct data = {}; + + hdr.pid = getpid(); + hdr.version = _LINUX_CAPABILITY_VERSION; + + if (capget(&hdr, &data) == -1) + error(1, errno, "capget"); + fprintf(stderr, "cap.1: eff=0x%x perm=0x%x\n", + data.effective, data.permitted); + + data.effective &= ~CAP_TO_MASK(capability); + data.permitted &= ~CAP_TO_MASK(capability); + data.inheritable = 0; + + if (capset(&hdr, &data) == -1) + error(1, errno, "capset"); + + if (capget(&hdr, &data) == -1) + error(1, errno, "capget"); + fprintf(stderr, "cap.2: eff=0x%x perm=0x%x\n", + data.effective, data.permitted); +} + +static void parse_opts(int argc, char **argv) +{ + int c; + + while ((c = getopt(argc, argv, "cCd:Gi:l:L:n:Nqs:vZ")) != -1) + { + switch (c) { + case 'c': + drop_capability(CAP_SYS_RAWIO); + break; + case 'C': + cfg_enable_csum = false; + break; + case 'd': + if (!inet_aton(optarg, &ip_daddr)) + error(1, 0, "bad ipv4 destination address"); + break; + case 'G': + cfg_enable_gso = false; + break; + case 'i': + cfg_ifname = optarg; + break; + case 'l': + cfg_payload_len = strtoul(optarg, NULL, 0); + break; + case 'L': + cfg_override_len = strtoul(optarg, NULL, 0); + break; + case 'n': + cfg_num_frames = strtoul(optarg, NULL, 0); + break; + case 'N': + cfg_enable_ring = false; + break; + case 'q': + cfg_qdisc_bypass = true; + break; + case 's': + if (!inet_aton(optarg, &ip_saddr)) + error(1, 0, "bad ipv4 destination address"); + break; + case 'v': + cfg_enable_vnet = true; + break; + case 'Z': + { + cfg_enable_ring = false; + cfg_vector_send = true; + } + break; + default: + error(1, 0, "unknown option %c", c); + } + } + + if (!ip_saddr.s_addr || !ip_daddr.s_addr) + error(1, 0, "must specify ipv4 source and destination"); + + cfg_ifindex = if_nametoindex(cfg_ifname); + if (!cfg_ifindex) + error(1, errno, "ifnametoindex"); + + fprintf(stderr, "len: %u\n", cfg_num_frames); + fprintf(stderr, "num: %u\n", cfg_payload_len); + fprintf(stderr, "vnet: %sabled\n", cfg_enable_vnet ? "en" : "dis"); +} + +int main(int argc, char **argv) +{ + char *ring; + int fd; + + parse_opts(argc, argv); + + fd = socket_open(); + socket_bind(fd); + + if (cfg_enable_ring) { + ring = ring_open(fd); + do_run_ring(fd, ring); + ring_close(ring); + } else { + if (cfg_vector_send) { + vector_write(fd, cfg_num_frames); + } else + do_run(fd); + } + + if (close(fd) == -1) + error(1, errno, "close"); + + return 0; +} diff --git a/tests/recv_cmsg_ipchecksum.c b/tests/recv_cmsg_ipchecksum.c new file mode 100644 index 0000000..5c0508a --- /dev/null +++ b/tests/recv_cmsg_ipchecksum.c @@ -0,0 +1,156 @@ +/* + * Test recv cmsg IP_CHECKSUM + * + * For both IPv4 and v4-mapped-v6: + * - read 100 B packet + * - peek 100 B packet at 2 B offset + * + * The cmsg is expected to arrive with CHECKSUM_COMPLETE, including + * on receive checksum conversion. It does not work with IPv6 or with + * hardware checksum disabled. + * + * To run: start on one host and send traffic from another host: + * + * dd if=/dev/zero bs=1 count=100 | sed 's/./\x01/2' > payload + * for i in 1 2; do nc -p 9000 -q 1 -u $hostA 8000 < payload; done + * + * The sum of the payload of range [2, 99] is 0. + * The sum of the payload of range [0, 99] is 1. + * + * Expected sum16 output is + * peek: 0xFFFF (because all zeroes) + * read: 0x0100 (because one \x01 halfword) + * + * + * Author: Willem de Bruijn (willemb@google.com) + * GPL v2 applies + */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define IP_CHECKSUM 23 + +#define CFG_PORT 8000 +#define PAYLOAD_CHAR 0 +#define PAYLOAD_LEN 100 +#define PEEK_OFF 2 + +static inline uint16_t csum_fold(__wsum csum) +{ + uint32_t sum = csum; + sum = (sum & 0xffff) + (sum >> 16); + sum = (sum & 0xffff) + (sum >> 16); + return (uint16_t) sum; +} + +static void do_rx(int fd, bool peek) +{ + char rbuf[100]; + struct cmsghdr *cm; + struct msghdr msg = {0}; + struct iovec iov = {0}; + char control[CMSG_SPACE(sizeof(__wsum))]; + int ret, expected; + + iov.iov_base = rbuf; + iov.iov_len = sizeof(rbuf); + + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_control = control; + msg.msg_controllen = sizeof(control); + + ret = recvmsg(fd, &msg, peek ? MSG_PEEK : 0); + if (ret == -1) + error(1, errno, "recv"); + if (msg.msg_flags & MSG_TRUNC) + error(1, errno, "recv: truncated data"); + if (msg.msg_flags & MSG_CTRUNC) + error(1, errno, "recv: truncated control"); + + for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) { + __wsum check32; + + if (cm->cmsg_level != SOL_IP) + error(1, 0, "cmsg: level=%u", cm->cmsg_level); + if (cm->cmsg_type != IP_CHECKSUM) + error(1, 0, "cmsg: type=%u", cm->cmsg_level); + + check32 = *((__wsum*) CMSG_DATA(cm)); + fprintf(stderr, "csum: sum32=0x%0x sum16=%hx ~sum16=%hx\n", + check32, csum_fold(check32), ~csum_fold(check32)); + } + + expected = PAYLOAD_LEN - (peek ? PEEK_OFF : 0); + if (ret != expected) + error(1, 0, "recv: %uB != %uB", ret, expected); + if (rbuf[0] != PAYLOAD_CHAR) + error(1, 0, "recv: payload mismatch"); +} + +static void do_main(struct sockaddr *addr, socklen_t alen) +{ + int fd, ret, one = 1, two = 2; + + fd = socket(addr->sa_family, SOCK_DGRAM, 0); + if (fd == -1) + error(1, errno, "socket rx"); + + ret = bind(fd, addr, alen); + if (ret) + error(1, errno, "bind rx"); + + if (setsockopt(fd, SOL_IP, IP_CHECKSUM, &one, sizeof(one))) + error(1, errno, "setsockopt csum"); + + if (setsockopt(fd, SOL_SOCKET, SO_PEEK_OFF, &two, sizeof(two))) + error(1, errno, "setsockopt peek_off"); + + do_rx(fd, true); + do_rx(fd, false); + + if (close(fd)) + error(1, errno, "close"); +} + +int main(int argc, char **argv) +{ + struct sockaddr_in addr4 = {0}; + struct sockaddr_in6 addr6 = {0}; + + fprintf(stderr, "PF_INET\n"); + addr4.sin_family = PF_INET; + addr4.sin_port = htons(CFG_PORT); + addr4.sin_addr.s_addr = htonl(INADDR_ANY); + do_main((void *) &addr4, sizeof(addr4)); + + fprintf(stderr, "PF_INET6\n"); + addr6.sin6_family = PF_INET6; + addr6.sin6_port = htons(CFG_PORT); + addr6.sin6_addr = in6addr_any; + do_main((void *) &addr6, sizeof(addr6)); + + return 0; +} diff --git a/tests/recvfragsize.c b/tests/recvfragsize.c new file mode 100644 index 0000000..54c79ea --- /dev/null +++ b/tests/recvfragsize.c @@ -0,0 +1,200 @@ +/* Test IP(V6)_RECVFRAGSIZE socket option */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define IP_RECVFRAGSIZE 25 +#define IPV6_RECVFRAGSIZE 77 + +static int cfg_dest_port = 6000; +static int cfg_expected_fragsize = 1300; +static int cfg_proto_l3 = PF_INET6; +static int cfg_proto_l4 = SOCK_DGRAM; + +static int socket_rx(int domain, int type, int protocol, + struct sockaddr *addr, socklen_t alen) +{ + int fd, level, optname, one = 1; + + fd = socket(domain, type, protocol); + if (fd == -1) + error(1, errno, "socket"); + + if (domain == PF_INET6) { + level = SOL_IPV6; + optname = IPV6_RECVFRAGSIZE; + } else { + level = SOL_IP; + optname = IP_RECVFRAGSIZE; + } + + if (setsockopt(fd, level, optname, &one, sizeof(one))) + error(1, errno, "setsockopt recvfragsize (%u.%u)", + level, optname); + + if (bind(fd, addr, alen)) + error(1, errno, "bind"); + + return fd; +} + +static int socket_rx_ipv6(int type, int protocol) +{ + struct sockaddr_in6 addr = { + .sin6_family = AF_INET6, + .sin6_port = htons(cfg_dest_port), + .sin6_addr = in6addr_any, + }; + + return socket_rx(PF_INET6, type, protocol, (void*) &addr, sizeof(addr)); +} + +static int socket_rx_ipv4(int type, int protocol) +{ + struct sockaddr_in addr = { + .sin_family = AF_INET, + .sin_port = htons(cfg_dest_port), + .sin_addr.s_addr = htons(INADDR_ANY), + }; + + return socket_rx(PF_INET, type, protocol, (void*) &addr, sizeof(addr)); +} + +static void poll_one(int fd) +{ + struct pollfd pfd = {0}; + int ret; + + pfd.fd = fd; + pfd.events = POLLIN; + + ret = poll(&pfd, 1, 1000); + if (ret == -1) + error(1, errno, "poll"); + if (ret == 0) + error(1, 0, "poll: timeout"); + if (!(pfd.revents & POLLIN)) + error(1, 0, "poll: unexpected event(s) 0x%x\n", pfd.revents); +} + +static void rx_one(int fd, int level, int optname) +{ + struct msghdr msg = {0}; + struct cmsghdr *cmsg; + char control[2 * CMSG_SPACE(int)]; + int ret, size, num_cmsg = 0; + + poll_one(fd); + + msg.msg_control = control; + msg.msg_controllen = sizeof(control); + + ret = recvmsg(fd, &msg, MSG_TRUNC); + if (ret == -1) + error(1, errno, "recvmsg"); + if (msg.msg_flags & MSG_CTRUNC) + error(1, 0, "recvmsg: truncated cmsg"); + fprintf(stderr, "recv: %uB\n", ret); + + for (cmsg = CMSG_FIRSTHDR(&msg); + cmsg && cmsg->cmsg_len; + cmsg = CMSG_NXTHDR(&msg, cmsg)) { + if (cmsg->cmsg_level != level || cmsg->cmsg_type != optname) + error(1, 0, "wrong cmsg 0x%x.0x%x", + cmsg->cmsg_level, cmsg->cmsg_type); + num_cmsg++; + fprintf(stderr, "cmsg_level=%u cmsg_type=%u\n", + cmsg->cmsg_level, cmsg->cmsg_type); + size = *((int*) CMSG_DATA(cmsg)); + fprintf(stderr, "max fragsize: %u\n", size); + } + + if (num_cmsg > 1) + error(1, 0, "unexpected #cmsg: %u\n", num_cmsg); + if (num_cmsg == 1 && size != cfg_expected_fragsize) + error(1, 0, "unexpected frag size: %u\n", size); +} + +static void run_one_ipv6(int type) +{ + int fd; + + fprintf(stderr, "ipv6 %s\n", type == SOCK_DGRAM ? "udp" : "raw"); + + /* IPv6 fragments are 8-byte aligned, expect for the last */ + cfg_expected_fragsize = cfg_expected_fragsize >> 3 << 3; + + fd = socket_rx_ipv6(type, type == SOCK_RAW ? IPPROTO_EGP : 0); + rx_one(fd, SOL_IPV6, IPV6_RECVFRAGSIZE); + if (close(fd)) + error(1, errno, "close"); +} + +static void run_one_ipv4(int type) +{ + int fd; + + fprintf(stderr, "ipv4 %s\n", type == SOCK_DGRAM ? "udp" : "raw"); + + fd = socket_rx_ipv4(type, type == SOCK_RAW ? IPPROTO_EGP : 0); + rx_one(fd, SOL_IP, IP_RECVFRAGSIZE); + if (close(fd)) + error(1, errno, "close"); +} + +static void parse_opts(int argc, char **argv) +{ + int c; + + while ((c = getopt(argc, argv, "46p:ru")) != -1) { + switch (c) { + case '4': + cfg_proto_l3 = PF_INET; + break; + case '6': + cfg_proto_l3 = PF_INET6; + break; + case 'p': + cfg_dest_port = strtoul(optarg, NULL, 0); + break; + case 'r': + cfg_proto_l4 = SOCK_RAW; + break; + case 'u': + cfg_proto_l4 = SOCK_DGRAM; + break; + default: + error(1, 0, "invalid option %c\n", c); + } + } +} + +int main(int argc, char **argv) +{ + parse_opts(argc, argv); + + if (cfg_proto_l3 == PF_INET) + run_one_ipv4(cfg_proto_l4); + else + run_one_ipv6(cfg_proto_l4); + + fprintf(stderr, "OK. All tests passed\n"); + return 0; +} + diff --git a/tests/txtimestamp.c b/tests/txtimestamp.c new file mode 100644 index 0000000..9c70a69 --- /dev/null +++ b/tests/txtimestamp.c @@ -0,0 +1,525 @@ +/* + * Copyright 2014 Google Inc. + * Author: willemb@google.com (Willem de Bruijn) + * + * Conformance tests for software tx timestamping, including + * + * - SCHED, SND and ACK timestamps + * - RAW, UDP and TCP + * - IPv4 and IPv6 + * - various packet sizes (to test GSO and TSO) + * + * Consult the command line arguments for help on running + * the various testcases. + * + * This test requires a dummy TCP server. + * A simple `nc6 [-u] -l -p $DESTPORT` will do + * + * Tested against net-next (09ddb8e) + * + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. * See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* should be defined in include/uapi/linux/socket.h */ +#define MSG_TSTAMP 0x100000 +#define MSG_TSTAMP_ACK 0x200000 +#define MSG_TSTAMP_ENQ 0x400000 +#define MSG_TSTAMP_ANY (MSG_TSTAMP | MSG_TSTAMP_ACK | MSG_TSTAMP_ENQ) + +#ifndef SCM_TSTAMP_SND +struct scm_timestamping { + struct timespec ts[3]; +}; + +#define SCM_TSTAMP_SND 0 +#define SCM_TSTAMP_SCHED 1 +#define SCM_TSTAMP_ACK 2 + +#define SOF_TIMESTAMPING_OPT_ID (1<<7) +#define SOF_TIMESTAMPING_TX_SCHED (1<<8) +#define SOF_TIMESTAMPING_TX_ACK (1<<9) +#endif + +#define NUM_RUNS 4 + +/* command line parameters */ +static int cfg_proto = SOCK_STREAM; +static int cfg_ipproto = IPPROTO_TCP; +static int do_ipv4 = 1; +static int do_ipv6 = 1; +static int payload_len = 10; +static int tstamp_no_payload; +static uint16_t dest_port = 9000; + +struct sockaddr_in daddr; +struct sockaddr_in6 daddr6; + +/* random globals */ +static struct timeval tv; +static struct timespec ts_prev; +static int tstamp_payload_len; + +static void __print_timestamp(const char *name, struct timespec *cur, + uint32_t key) +{ + if (!(cur->tv_sec | cur->tv_nsec)) + return; + + fprintf(stderr, " %s: %lu s %lu us (seq=%u, len=%u)", + name, cur->tv_sec, cur->tv_nsec / 1000, + key, tstamp_payload_len); + + if ((ts_prev.tv_sec | ts_prev.tv_nsec)) { + int64_t cur_ms, prev_ms; + + cur_ms = (long) cur->tv_sec * 1000 * 1000; + cur_ms += cur->tv_nsec / 1000; + + prev_ms = (long) ts_prev.tv_sec * 1000 * 1000; + prev_ms += ts_prev.tv_nsec / 1000; + + fprintf(stderr, " (%+ld us)", cur_ms - prev_ms); + } + + ts_prev = *cur; + fprintf(stderr, "\n"); +} + +static void print_timestamp_usr(void) +{ + struct timespec ts; + + ts.tv_sec = tv.tv_sec; + ts.tv_nsec = tv.tv_usec * 1000; + __print_timestamp(" USR", &ts, 0); + +} + +static void print_timestamp(struct scm_timestamping *tss, int tstype, int tskey) +{ + const char *tsname; + + switch (tstype) { + case SCM_TSTAMP_SCHED: + tsname = " ENQ"; + break; + case SCM_TSTAMP_SND: + tsname = " SND"; + break; + case SCM_TSTAMP_ACK: + tsname = " ACK"; + break; + default: + error(1, 0, "unknown timestamp type: %u", + tstype); + } + __print_timestamp(tsname, &tss->ts[0], tskey); +} + +static void __poll(int fd) +{ + struct pollfd pollfd; + int ret; + + memset(&pollfd, 0, sizeof(pollfd)); + pollfd.events = POLLIN; + pollfd.fd = fd; + ret = poll(&pollfd, 1, 100); + if (ret == -1 && errno != EAGAIN) + error(1, errno, "poll"); +} + +static void __recv_errmsg_cmsg(struct msghdr *msg) +{ + struct sock_extended_err *serr = NULL; + struct scm_timestamping *tss = NULL; + struct cmsghdr *cm; + + for (cm = CMSG_FIRSTHDR(msg); cm; cm = CMSG_NXTHDR(msg, cm)) { + if (cm->cmsg_level == SOL_SOCKET && + cm->cmsg_type == SCM_TIMESTAMPING) { + tss = (void *) CMSG_DATA(cm); + } else if ((cm->cmsg_level == SOL_IP && + cm->cmsg_type == IP_RECVERR) || + (cm->cmsg_level == SOL_IPV6 && + cm->cmsg_type == IPV6_RECVERR)) { + + serr = (void *) CMSG_DATA(cm); + if (serr->ee_errno != ENOMSG || + serr->ee_origin != SO_EE_ORIGIN_TIMESTAMPING) { + fprintf(stderr, "unknown ip error %d %d\n", + serr->ee_errno, + serr->ee_origin); + serr = NULL; + } + } else + fprintf(stderr, "%d, %d\n", + cm->cmsg_level, cm->cmsg_type); + } + + if (serr && tss) + print_timestamp(tss, serr->ee_info, serr->ee_data); +} + +static int recv_errmsg(int fd) +{ + static char ctrl[1024 /* overcommit */]; + static struct msghdr msg; + struct iovec entry; + static char *data; + int ret = 0; + + data = malloc(payload_len); + if (!data) + error(1, 0, "malloc"); + + memset(&msg, 0, sizeof(msg)); + memset(&entry, 0, sizeof(entry)); + memset(ctrl, 0, sizeof(ctrl)); + memset(data, 0, sizeof(data)); + + entry.iov_base = data; + /* for TCP we specify payload length to read one packet at a time. */ + entry.iov_len = payload_len; + msg.msg_iov = &entry; + msg.msg_iovlen = 1; + msg.msg_name = NULL; + msg.msg_namelen = 0; + msg.msg_control = ctrl; + msg.msg_controllen = sizeof(ctrl); + + ret = recvmsg(fd, &msg, MSG_ERRQUEUE | MSG_DONTWAIT); + if (ret == -1 && (errno == EINTR || errno == EWOULDBLOCK)) + goto done; + if (ret == -1) + error(1, errno, "recvmsg"); + + tstamp_payload_len = ret; + if (tstamp_no_payload && tstamp_payload_len) + error(1, 0, "recv: payload when configured without"); + else if (!tstamp_no_payload && !tstamp_payload_len) + error(1, 0, "recv: no payload when configured with"); + + __recv_errmsg_cmsg(&msg); + +done: + free(data); + return ret == -1; +} + +static int setsockopt_ts(int fd, int flags) +{ + int val; + + val = 0; + if (flags & MSG_TSTAMP_ANY) { + if (flags & MSG_TSTAMP) + val |= SOF_TIMESTAMPING_TX_SOFTWARE; + if (flags & MSG_TSTAMP_ENQ) + val |= SOF_TIMESTAMPING_TX_SCHED; + if (flags & MSG_TSTAMP_ACK) + val |= SOF_TIMESTAMPING_TX_ACK; + + val |= SOF_TIMESTAMPING_OPT_ID; + + flags &= ~MSG_TSTAMP_ANY; + } + +#if 0 + if (tstamp_no_payload) + val |= SOF_TIMESTAMPING_OPT_TX_NO_PAYLOAD; +#endif + + if (setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, + (char *) &val, sizeof(val))) + error(1, 0, "setsockopt"); + + return flags; +} + +static void do_test(int family, unsigned int flags) +{ + char *buf; + int fd, i, val, total_len; + + if (family == IPPROTO_IPV6 && cfg_proto != SOCK_STREAM) { + /* due to lack of checksum generation code */ + fprintf(stderr, "test: skipping datagram over IPv6\n"); + return; + } + + total_len = payload_len; + if (cfg_proto == SOCK_RAW) { + total_len += sizeof(struct udphdr); + if (cfg_ipproto == IPPROTO_RAW) + total_len += sizeof(struct iphdr); + } + + buf = malloc(total_len); + if (!buf) + error(1, 0, "malloc"); + + fd = socket(family, cfg_proto, cfg_ipproto); + if (fd < 0) + error(1, errno, "socket"); + + if (cfg_proto == SOCK_STREAM) { + val = 1; + if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, + (char*) &val, sizeof(val))) + error(1, 0, "setsockopt no nagle"); + + if (family == PF_INET) { + if (connect(fd, (void *) &daddr, sizeof(daddr))) + error(1, errno, "connect ipv4"); + } else { + if (connect(fd, (void *) &daddr6, sizeof(daddr6))) + error(1, errno, "connect ipv6"); + } + } + + flags = setsockopt_ts(fd, flags); + + for (i = 0; i < NUM_RUNS; i++) { + memset(&ts_prev, 0, sizeof(ts_prev)); + memset(buf, 'a' + i, total_len); + buf[total_len - 2] = '\n'; + buf[total_len - 1] = '\0'; + + if (cfg_proto == SOCK_RAW) { + struct udphdr *udph; + struct iphdr *iph; + int off = 0; + + if (cfg_ipproto == IPPROTO_RAW) { + iph = (void *) buf; + + memset(iph, 0, sizeof(*iph)); + iph->ihl = 5; + iph->version = 4; + iph->ttl = 2; + iph->daddr = daddr.sin_addr.s_addr; + iph->protocol = IPPROTO_UDP; + /* kernel writes saddr, csum, len */ + + off = sizeof(*iph); + } + + udph = (void *) buf + off; + udph->source = ntohs(9000); /* random spoof */ + udph->dest = ntohs(dest_port); + udph->len = ntohs(sizeof(*udph) + payload_len); + udph->check = 0; /* not allowed for IPv6 */ + } + + gettimeofday(&tv, NULL); + if (cfg_proto != SOCK_STREAM) { + if (family == PF_INET) + val = sendto(fd, buf, total_len, flags, (void *) &daddr, sizeof(daddr)); + else + val = sendto(fd, buf, total_len, flags, (void *) &daddr6, sizeof(daddr6)); + } else { + val = send(fd, buf, payload_len, flags); + } + if (val != total_len) + error(1, errno, "send"); + + usleep(50 * 1000); + + print_timestamp_usr(); + + __poll(fd); + while (!recv_errmsg(fd)) {} + } + + if (close(fd)) + error(1, errno, "close"); + + free(buf); + usleep(400 * 1000); +} + +static void __attribute__((noreturn)) usage(const char *filepath) +{ + fprintf(stderr, "\nUsage: %s [options] hostname\n" + "\nwhere options are:\n" + " -4: only IPv4\n" + " -6: only IPv6\n" + " -h: show this message\n" + " -l N: send N bytes at a time\n" + " -n: no payload on tstamp\n" + " -r: use raw\n" + " -R: use raw (IP_HDRINCL)\n" + " -p N: connect to port N\n" + " -u: use udp\n", + filepath); + exit(1); +} + +static void parse_opt(int argc, char **argv) +{ + int proto_count = 0; + char c; + + while ((c = getopt(argc, argv, "46hl:np:rRu")) != -1) { + switch (c) { + case '4': + do_ipv6 = 0; + break; + case '6': + do_ipv4 = 0; + break; + case 'r': + proto_count++; + cfg_proto = SOCK_RAW; + cfg_ipproto = IPPROTO_UDP; + break; + case 'R': + proto_count++; + cfg_proto = SOCK_RAW; + cfg_ipproto = IPPROTO_RAW; + break; + case 'u': + proto_count++; + cfg_proto = SOCK_DGRAM; + cfg_ipproto = IPPROTO_UDP; + break; + case 'l': + payload_len = strtoul(optarg, NULL, 10); + break; + case 'n': + tstamp_no_payload = 1; + break; + case 'p': + dest_port = strtoul(optarg, NULL, 10); + break; + case 'h': + default: + usage(argv[0]); + } + } + + if (cfg_proto != SOCK_STREAM && payload_len > 1472) + error(1, 0, "udp packet might exceed expected MTU"); + if (!do_ipv4 && !do_ipv6) + error(1, 0, "pass -4 or -6, not both"); + if (proto_count > 1) + error(1, 0, "pass -r, -R or -u, not multiple"); + + if (optind != argc - 1) + error(1, 0, "missing required hostname argument"); +} + +static void resolve_hostname(const char *hostname) +{ + struct addrinfo *addrs, *cur; + int have_ipv4 = 0, have_ipv6 = 0; + + if (getaddrinfo(hostname, NULL, NULL, &addrs)) + error(1, errno, "getaddrinfo"); + + cur = addrs; + while (cur && !have_ipv4 && !have_ipv6) { + if (!have_ipv4 && cur->ai_family == AF_INET) { + memcpy(&daddr, cur->ai_addr, sizeof(daddr)); + daddr.sin_port = htons(dest_port); + have_ipv4 = 1; + } + else if (!have_ipv6 && cur->ai_family == AF_INET6) { + memcpy(&daddr6, cur->ai_addr, sizeof(daddr6)); + daddr6.sin6_port = htons(dest_port); + have_ipv6 = 1; + } + cur = cur->ai_next; + } + if (addrs) + freeaddrinfo(addrs); + + do_ipv4 &= have_ipv4; + do_ipv6 &= have_ipv6; +} + +static void do_main(int family) +{ + fprintf(stderr, "family: %s\n", + family == PF_INET ? "INET" : "INET6"); + + fprintf(stderr, "test SND\n"); + do_test(family, MSG_TSTAMP); + + fprintf(stderr, "test ENQ\n"); + do_test(family, MSG_TSTAMP_ENQ); + + fprintf(stderr, "test ENQ + SND\n"); + do_test(family, MSG_TSTAMP_ENQ | MSG_TSTAMP); + + if (cfg_proto == SOCK_STREAM) { + fprintf(stderr, "\ntest ACK\n"); + do_test(family, MSG_TSTAMP_ACK); + + fprintf(stderr, "\ntest SND + ACK\n"); + do_test(family, MSG_TSTAMP | MSG_TSTAMP_ACK); + + fprintf(stderr, "\ntest ENQ + SND + ACK\n"); + do_test(family, MSG_TSTAMP_ENQ | MSG_TSTAMP | MSG_TSTAMP_ACK); + } +} + +const char *sock_names[] = { NULL, "TCP", "UDP", "RAW" }; + +int main(int argc, char **argv) +{ + parse_opt(argc, argv); + resolve_hostname(argv[argc - 1]); + + fprintf(stderr, "protocol: %s\n", sock_names[cfg_proto]); + fprintf(stderr, "payload: %u\n", payload_len); + fprintf(stderr, "server port: %u\n", dest_port); + fprintf(stderr, "\n"); + + if (do_ipv4) + do_main(PF_INET); + if (do_ipv6) + do_main(PF_INET6); + + return 0; +} diff --git a/tools/tcplate/Makefile b/tools/tcplate/Makefile new file mode 100644 index 0000000..4d8928c --- /dev/null +++ b/tools/tcplate/Makefile @@ -0,0 +1,16 @@ + +.PHONY: all clean distclean + +all: tcplate + +tcplate: tcplate.o libnflog.o libpsock.o + gcc -static $+ -o $@ + +%.o: %.c + gcc -c -Wall -Werror $< -o $@ + +clean: + -rm -f *.o + +distclean: clean + -rm -f tcplate diff --git a/tools/tcplate/libnflog.c b/tools/tcplate/libnflog.c new file mode 100644 index 0000000..9fd1006 --- /dev/null +++ b/tools/tcplate/libnflog.c @@ -0,0 +1,326 @@ +/* + * Copyright 2014 Google Inc. + * Author: willemb@google.com (Willem de Bruijn) + * + * Netlink support library + * Geared at NETLINK_NETFILTER + * + * Read netfilter nflog output, for instance: + * `iptables -A OUTPUT -j NFLOG --nflog-group=10` + * + * To timestamp every packet, use the xt_time match: + * `iptables -A OUTPUT \ + * -m time --timestart 00:00 --timestop 23:59 \ + * -j NFLOG --nflog-group=10` + * or even + * `iptables -A OUTPUT -m time -j NFLOG --nflog-group=10` + * + * TODO: optimize by using mmapped ring, similar to psock + * + * License (GPLv2): + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. * See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#define _GNU_SOURCE +#define _BSD_SOURCE /* for be64toh */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "libnflog.h" + +static int config_group = 10; /* nfnetlink group to follow */ +static int config_debug_lvl = 0; + +#define IOVLEN 8 +#define PKTLEN (1 << 11) + +static void __nflog_sendcmd(int fd, uint8_t cmd, void *msg, int msglen, + uint16_t family, uint16_t group_id) +{ + static int seq_id; + char buf[1024] __attribute__((aligned)); + struct nlmsghdr *nh; + struct nfgenmsg *ng; + struct nfattr *nfa; + int ret; + + memset(buf, 0, sizeof(buf)); + + nh = (void *) buf; + ng = (void *) buf + sizeof(*nh); + + nh->nlmsg_len = NLMSG_LENGTH(sizeof(*ng)); + nh->nlmsg_type = (NFNL_SUBSYS_ULOG << 8) | NFULNL_MSG_CONFIG; + nh->nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK; + nh->nlmsg_pid = 0; + nh->nlmsg_seq = ++seq_id; + + ng->nfgen_family = family; + ng->version = NFNETLINK_V0; + ng->res_id = htons(group_id); + + nfa = (void *) buf + NLMSG_ALIGN(nh->nlmsg_len); + nfa->nfa_type = cmd; + nfa->nfa_len = NFA_LENGTH(msglen); + + memcpy(NFA_DATA(nfa), msg, msglen); + + nh->nlmsg_len = NLMSG_ALIGN(nh->nlmsg_len) + NFA_ALIGN(nfa->nfa_len); + + if (send(fd, buf, nh->nlmsg_len, 0) != nh->nlmsg_len) + error(1, errno, "sendcmd"); + + /* TODO: handle EINTR */ + ret = recv(fd, buf, sizeof(buf), 0); + if (ret == -1) + error(1, errno, "recv ctrl: sock error"); + if (ret < NLMSG_OK(nh, ret)) + error(1, 0, "recv ctrl: insufficient length"); + if (nh->nlmsg_type != NLMSG_ERROR) + error(1, 0, "recv ctrl: unexpected type"); + ret = *(int *) NLMSG_DATA(nh); + if (ret) + error(1, ret, "recv ctrl: nflog error"); +} + +static void nflog_sendcmd(int fd, uint8_t cmd, uint16_t family, + uint16_t group_id) +{ + struct nfulnl_msg_config_cmd msg; + + memset(&msg, 0, sizeof(msg)); + msg.command = cmd; + __nflog_sendcmd(fd, NFULA_CFG_CMD, &msg, sizeof(msg), family, group_id); +} + +static void nflog_sendcmd_mode(int fd, uint16_t family, uint16_t group_id, + uint8_t mode, uint32_t value) +{ + struct nfulnl_msg_config_mode msg; + + memset(&msg, 0, sizeof(msg)); + msg.copy_mode = mode; + msg.copy_range = htonl(value); + __nflog_sendcmd(fd, NFULA_CFG_MODE, &msg, sizeof(msg), family, group_id); +} + +static void nflog_attach_inet(int fd, unsigned int snaplen) +{ + nflog_sendcmd(fd, NFULNL_CFG_CMD_PF_UNBIND, AF_INET, 0); + /* TODO: recv ack */ + nflog_sendcmd(fd, NFULNL_CFG_CMD_PF_BIND, AF_INET, 0); + /* TODO: recv ack */ + nflog_sendcmd(fd, NFULNL_CFG_CMD_BIND, AF_UNSPEC, config_group); + /* TODO: recv ack */ + + nflog_sendcmd_mode(fd, AF_UNSPEC, config_group, NFULNL_COPY_PACKET, snaplen); + /* TODO: recv ack */ +} + +int nflog_init(unsigned int snaplen) +{ + struct sockaddr_nl nladdr; + int fd, val; + + if (snaplen > PKTLEN) + error(1, 0, "snaplen exceeds pktlen: can cause drops"); + + fd = socket(PF_NETLINK, SOCK_RAW, NETLINK_NETFILTER); + if (fd == -1) + error(1, errno, "socket"); + + val = 1 << 21; + if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val))) + error(1, errno, "setsockopt SO_RCVBUF"); + + memset(&nladdr, 0, sizeof(nladdr)); + nladdr.nl_family = AF_NETLINK; + nladdr.nl_groups = 1 << config_group; + + if (bind(fd, (void *) &nladdr, sizeof(nladdr))) + error(1, errno, "bind"); + + nflog_attach_inet(fd, snaplen); + return fd; +} + +void nflog_exit(int fd) +{ + if (close(fd)) + error(1, errno, "close"); +} + +void nflog_parse(const void *data, unsigned int len, log_fn fn) +{ + const struct nlmsghdr *nh; + + for (nh = (void *) data; NLMSG_OK(nh, len); nh = NLMSG_NEXT(nh, len)) { + const struct nfulnl_msg_packet_timestamp *nf_ts; + const struct nfgenmsg *ng; + const struct nfattr *attr; + uint64_t ts_sec = 0, ts_usec = 0; + const char *pkt; + int plen = 0; + int alen; + + if (nh->nlmsg_type == NLMSG_ERROR) + error(1, 0, "netlink error"); + if (nh->nlmsg_type == NLMSG_NOOP) + error(1, 0, "netlink noop"); + if (nh->nlmsg_len < sizeof(*nh) || len < nh->nlmsg_len) { + fprintf(stderr, "message truncated\n"); + continue; + } + + ng = NLMSG_DATA(nh); + if (config_debug_lvl) + fprintf(stderr, "P family=%s version=%d group=%d len=%d type=%hu\n", + ng->nfgen_family == AF_INET ? "INET" : "other", + ng->version, + ntohs(ng->res_id), + nh->nlmsg_len, + nh->nlmsg_type); + + attr = NFM_NFA(ng); + alen = nh->nlmsg_len - NLMSG_LENGTH(NLMSG_ALIGN(sizeof(*ng))); + while (NFA_OK(attr, alen)) { + switch (NFA_TYPE(attr)) { + case NFULA_PAYLOAD: + pkt = NFA_DATA(attr); + plen = NFA_PAYLOAD(attr); + break; + case NFULA_TIMESTAMP: + nf_ts = NFA_DATA(attr); + ts_sec = be64toh(nf_ts->sec); + ts_usec = be64toh(nf_ts->usec); + break; + case NFULA_GID: + case NFULA_PACKET_HDR: + case NFULA_PREFIX: + case NFULA_IFINDEX_OUTDEV: + case NFULA_UID: + default: + if (config_debug_lvl) + fprintf(stderr, " attr @%lu other type=%d\n", + ((unsigned long) attr) - (unsigned long) ng, + NFA_TYPE(attr)); + } + attr = NFA_NEXT(attr, alen); + } + + if (nh->nlmsg_type == NLMSG_DONE) + break; + + fn(pkt, plen, ts_sec, ts_usec); + } +} + +int nflog_read(int fd, log_fn fn) +{ + static char data[IOVLEN][PKTLEN]; + struct mmsghdr msgs[IOVLEN]; + struct iovec iovecs[IOVLEN]; + int i, len; + + memset(msgs, 0, sizeof(msgs)); + for (i = 0; i < IOVLEN; i++) { + iovecs[i].iov_base = data[i]; + iovecs[i].iov_len = PKTLEN; + msgs[i].msg_hdr.msg_iov = &iovecs[i]; + msgs[i].msg_hdr.msg_iovlen = 1; + } + + len = recvmmsg(fd, msgs, IOVLEN, MSG_DONTWAIT, NULL); + if (len == -1) { + if (errno == EAGAIN || errno == EINTR) + return 0; + if (errno == ENOBUFS) { + static int report_overflow; + if (!report_overflow) { + report_overflow = 1; + fprintf(stderr, "nflog: socket overflow detected. some packets will be lost (only warning once).\n"); + } + return 0; + } + error(1, errno, "recvmsg"); + } + + if (config_debug_lvl > 1) + fprintf(stderr, "recvmmsg len=%u\n", len); + + for (i = 0; i < len; i++) + nflog_parse(data[i], msgs[i].msg_len, fn); + + return 1; +} + +static int nflog_wait(int fd) +{ + struct pollfd pollfd[2]; + int len; + + do { + memset(&pollfd, 0, sizeof(pollfd)); + + pollfd[0].events = POLLIN; + pollfd[0].fd = 0; + + pollfd[1].events = POLLIN; + pollfd[1].fd = fd; + + len = poll(pollfd, 2, 50); + if (len == -1) { + if (errno == EINTR) + continue; + error(1, errno, "poll"); + } + if (len && pollfd[0].revents) + return 0; + } while (!len); + + return 1; +} + +void nflog_loop(int fd, log_fn fn) +{ + while (nflog_wait(fd)) { + while (nflog_read(fd, fn)) {} + } +} + +void nflog_all(log_fn fn, unsigned int snaplen) +{ + int fd; + + fd = nflog_init(snaplen); + nflog_loop(fd, fn); + nflog_exit(fd); +} + diff --git a/tools/tcplate/libnflog.h b/tools/tcplate/libnflog.h new file mode 100644 index 0000000..d1cdbb7 --- /dev/null +++ b/tools/tcplate/libnflog.h @@ -0,0 +1,42 @@ +/* + * Copyright 2014 Google Inc. + * Author: willemb@google.com (Willem de Bruijn) + * + * Netfilter LOG support library + * + * Only reads outgoing packets + * + * License (GPLv2): + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. * See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#ifndef _LIBNFLOG_H_ +#define _LIBNFLOG_H_ + +#include + +/* can be called with len 0 */ +typedef void (*log_fn)(const void *pkt, unsigned int len, + uint64_t ts_sec, uint64_t ts_usec); + +void nflog_all(log_fn fn, unsigned int snaplen); + +int nflog_init(unsigned int snaplen); +int nflog_read(int fd, log_fn fn); +void nflog_loop(int fd, log_fn fn); +void nflog_exit(int fd); + +#endif // _LIBNFLOG_H_ + diff --git a/tools/tcplate/libpsock.c b/tools/tcplate/libpsock.c new file mode 100644 index 0000000..c92cb78 --- /dev/null +++ b/tools/tcplate/libpsock.c @@ -0,0 +1,200 @@ +/* + * Copyright 2014 Google Inc. + * Author: willemb@google.com (Willem de Bruijn) + * + * Packet socket support library + * + * License (GPLv2): + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. * See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "libpsock.h" + +static void +psock_init_ring(struct psock *ps) +{ + struct tpacket_req tp; + int frames_per_block; + + if (ps->frame_size & (TPACKET_ALIGNMENT - 1)) + error(1, 0, "illegal frame size"); + + tp.tp_frame_size = ps->frame_size; + tp.tp_frame_nr = ps->frame_count; + + frames_per_block = getpagesize() / ps->frame_size; + tp.tp_block_size = getpagesize(); + tp.tp_block_nr = ps->frame_count / frames_per_block; + + if (setsockopt(ps->fd, SOL_PACKET, PACKET_RX_RING, (void*) &tp, sizeof(tp))) + error(1, errno, "setsockopt() ring"); + + ps->ring = mmap(0, tp.tp_block_size * tp.tp_block_nr, + PROT_READ | PROT_WRITE, MAP_SHARED, ps->fd, 0); + if (!ps->ring) + error(1, 0, "mmap()"); +} + +struct sock_filter egress_filter[] = { + { BPF_LD|BPF_B|BPF_ABS, 0, 0, SKF_AD_OFF + SKF_AD_PKTTYPE }, + { BPF_JMP|BPF_JEQ, 1, 0, PACKET_OUTGOING }, + { BPF_RET, 0, 0, 0x00000000 }, + { BPF_RET, 0, 0, 0x0000ffff } +}; + +struct sock_fprog egress_fprog = { + .len = sizeof(egress_filter) / sizeof(egress_filter[0]), + .filter = egress_filter, +}; + +void +psock_init(struct psock *ps) +{ + int val; + + ps->fd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL)); + if (ps->fd < 0) + error(1, errno, "socket()"); + + val = TPACKET_V2; + if (setsockopt(ps->fd, SOL_PACKET, PACKET_VERSION, &val, sizeof(val))) + error(1, errno, "setsockopt() version"); + val = 1; + if (setsockopt(ps->fd, SOL_PACKET, PACKET_TIMESTAMP, &val, sizeof(val))) + error(1, errno, "setsockopt() tstamp"); + if (setsockopt(ps->fd, SOL_SOCKET, SO_ATTACH_FILTER, + &egress_fprog, sizeof(egress_fprog))) + error(1, errno, "setsockopt() filter"); + + if (ps->dev) { + struct sockaddr_ll laddr; + + memset(&laddr, 0, sizeof(laddr)); + laddr.sll_family = AF_PACKET; + laddr.sll_protocol = htons(ETH_P_ALL); /* must be on ptype_all to sniff egress */ + laddr.sll_ifindex = if_nametoindex(ps->dev); + if (!laddr.sll_ifindex) + error(1, errno, "no such device: %s", ps->dev); + if (bind(ps->fd, (void *) &laddr, sizeof(laddr))) + error(1, errno, "bind device: %s (%d)", ps->dev, laddr.sll_ifindex); + } + + psock_init_ring(ps); +} + +static int +psock_wait(struct psock *ps) +{ + struct pollfd pollset[2]; + int ret; + + pollset[0].fd = 0; + pollset[0].events = POLLIN; + pollset[0].revents = 0; + + pollset[1].fd = ps->fd; + pollset[1].events = POLLIN; + pollset[1].revents = 0; + + ret = poll(pollset, 2, 100); + if (ret < 0 && errno != EINTR && errno != EAGAIN) + error(1, errno, "poll()"); + + if (ret > 0 && pollset[0].revents) + return 0; + + return 1; +} + +int +psock_read(struct psock *ps, psock_fn fn) +{ + struct tpacket2_hdr *header; + + header = (void *) ps->ring + (ps->idx_reader * ps->frame_size); + + if (!(header->tp_status & TP_STATUS_USER)) + return 0; + if (header->tp_status & TP_STATUS_COPY) + error(1, 0, "detected incomplete packed"); + if (header->tp_status & TP_STATUS_LOSING) { + static int report_overflow; + if (!report_overflow) { + report_overflow = 1; + fprintf(stderr, "psock: socket overflow detected. some packets will be lost (only warning once).\n"); + } + } + + fn(header, ((void *) header) + header->tp_mac); + + header->tp_status = TP_STATUS_KERNEL; + ps->idx_reader = (ps->idx_reader + 1) & (ps->frame_count - 1); + return 1; +} + +void +psock_loop(struct psock *ps, psock_fn fn) +{ + while (psock_wait(ps)) { + while (psock_read(ps, fn)) {} + } +} + +void +psock_exit(struct psock *ps) +{ + if (munmap(ps->ring, ps->frame_count * ps->frame_size)) + error(1, errno, "munmap"); + + if (close(ps->fd)) + error(1, errno, "close"); +} + +void +psock_all(int frame_count, int frame_size, const char *dev, psock_fn fn) +{ + struct psock ps; + + memset(&ps, 0, sizeof(ps)); + + ps.frame_count = frame_count; + ps.frame_size = frame_size; + if (dev) + ps.dev = dev; + + psock_init(&ps); + psock_loop(&ps, fn); + psock_exit(&ps); +} + diff --git a/tools/tcplate/libpsock.h b/tools/tcplate/libpsock.h new file mode 100644 index 0000000..922f607 --- /dev/null +++ b/tools/tcplate/libpsock.h @@ -0,0 +1,51 @@ +/* + * Copyright 2014 Google Inc. + * Author: willemb@google.com (Willem de Bruijn) + * + * Packet socket support library + * + * Only reads outgoing packets + * + * License (GPLv2): + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. * See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#ifndef _LIBPSOCK_H_ +#define _LIBPSOCK_H_ + +#include + +struct psock { + int frame_size; + int frame_count; + const char *dev; /* (optional) device to bind to */ + + /* internal */ + int fd; + char *ring; + int idx_reader; +}; + +typedef void (*psock_fn)(struct tpacket2_hdr *tp, void *pkt); + +void psock_all(int frame_count, int frame_size, const char *dev, psock_fn fn); + +void psock_init(struct psock *ps); +int psock_read(struct psock *ps, psock_fn fn); +void psock_loop(struct psock *ps, psock_fn fn); +void psock_exit(struct psock *ps); + +#endif // _LIBPSOCK_H_ + diff --git a/tools/tcplate/tcplate.c b/tools/tcplate/tcplate.c new file mode 100644 index 0000000..b6064f7 --- /dev/null +++ b/tools/tcplate/tcplate.c @@ -0,0 +1,604 @@ +/* + * Copyright 2014 Google Inc. + * Author: willemb@google.com (Willem de Bruijn) + * + * Measure tcp latency through the kernel using pcap and nflog + * + * Read TCP/IP packets using pcap and nflog and calculate the + * latency spent within traffic shaping by subtracting timestamp + * of the first occurrence (iptables) from the timestamp of the + * second occurrence (packet socket). + * + * It has two modes: + * normal: latency of traffic shaping from protocol layer to dev: + * this subtracts a tstamp in packetsock on dev (eth0) + * from a tstamp in the ip layer at iptables NFLOG + * bonding: latency of traffic shaping on bonding slaves: + * this reads packets on every device, sees the same + * on both master (e.g., bond0) and slaves. + * + * Testing: + * verified correctness by adding delay at the relevant traffic + * shaping layer with + * `tc qdisc add dev $ETH root est 1sec 4sec netem limit 40000 delay 20ms` + * + * Implementation: + * tcplate uses two datastructures: + * - table: a hashtable to store new TCP segments and their timestamp + * - logs: a circular buffer to store tstamp diff on second viewing + * Logs is double buffered to allow sorting results offline. + * + * + * License (GPLv2): + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. * See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "libnflog.h" +#include "libpsock.h" + +static int log_len = 10000; +static int table_len = 57251; /* prime */ +static int ival = 1; +static int frame_count = (1 << 14); +static int frame_size = 128; +static int bond_mode; +static int debug_mode; +static int show_extended; +static int verbose; +static char dev[IFNAMSIZ + 1] = "eth0"; +static uint8_t tos_mask = UCHAR_MAX; +static bool tos_filter = false; + +/* race condition. TODO: protect */ +static uint64_t collisions; +static uint64_t pktcount; +static uint64_t count_nflog; +static uint64_t count_psock; + +/* double buffered list of observations */ +static int64_t *logs[2]; +static int log_selector; +static int log_head; +static int exit_hard; + +struct table_key_full { + __be32 ip_src; + __be32 ip_dst; + __be16 tcp_src; + __be16 tcp_dst; + __be32 seqno; +} __attribute__((packed)); + +union table_key { + struct table_key_full full; + __int128 cmp; +}; + +struct table_elem { + union table_key key; + int64_t tstamp; +}; + +/* not thread safe */ +struct table_elem *table; + +/* Show how many table elements are in use */ +static int +table_scan(void) +{ + int i, used = 0; + + for (i = 0; i < table_len; i++) + if (table[i].key.cmp) + used++; + + return used; +} + +static void +log_record(int64_t val) +{ + /* do not wrap log_head, to discern a partial from full log */ + logs[log_selector][log_head % log_len] = val; + log_head++; +} + +/* switch between double buffered logs, return number of recorded events */ +static int +log_rotate(void) +{ + int old_head; + + log_selector = (log_selector + 1) & 0x1; + old_head = log_head; + log_head = 0; + + return old_head; +} + +/* qsort comparison callback */ +static int +log_compar(const void *_a, const void *_b) +{ + const int64_t *a = _a, *b = _b; + return *a < *b ? -1 : (*a > *b ? 1 : 0); +} + +static void +log_show(void) +{ + int len, matches, selector; + + matches = log_rotate(); + len = matches < log_len ? matches : log_len; + selector = (log_selector + 1) & 0x1; + + qsort(logs[selector], len, sizeof(logs[0][0]), log_compar); + if (len >= 100) { + fprintf(stderr, " %8ld %8ld %8ld", + logs[selector][len / 2], + logs[selector][(len * 9) / 10], + logs[selector][(len * 99) / 100]); + + if (show_extended) + fprintf(stderr, " %10lu %10u %10lu %10d", + pktcount, matches, collisions, + table_scan()); + if (show_extended && verbose > 0) + fprintf(stderr, " %10lu %10lu", + count_nflog, count_psock); + write(2, "\n", 1); + } else { + write(2, ".\n", 2); + } + + collisions = 0; + pktcount = 0; + count_nflog = 0; + count_psock = 0; +} + +/* From "The Practice of Programming" via + * PERL_HASH in Perl 5.005, which is GPL */ +static int hash_compute(void *_key, int klen) +{ + const unsigned int multiplier = 37; + unsigned char *cur, *key = _key; + unsigned int h = 0; + + for (cur = key; cur - key < klen; cur++) + h = (h * multiplier) + *cur; + return h + (h >> 5); +} + +static void +packet_process(__be32 ip_src, __be32 ip_dst, + __be16 tcp_src, __be16 tcp_dst, + __be32 seqno, int64_t tstamp, + int caller_type) +{ + union table_key key; + unsigned int idx; + + key.full.ip_src = ip_src; + key.full.ip_dst = ip_dst; + key.full.tcp_src = tcp_src; + key.full.tcp_dst = tcp_dst; + key.full.seqno = seqno; + + idx = hash_compute(&key, sizeof(key)); + idx %= table_len; + + /* if key is new, insert new tstamp */ + if (!table[idx].key.cmp) { +insert: + table[idx].key.cmp = key.cmp; + table[idx].tstamp = tstamp; + pktcount++; + } + /* if collision, record and insert */ + else if (table[idx].key.cmp != key.cmp) { + collisions++; + goto insert; + } + /* else log the diff and clear the key */ + else { + tstamp = tstamp - table[idx].tstamp; + if (tstamp < 0) + tstamp = -tstamp; + log_record(tstamp); + table[idx].key.cmp = 0; + } + + if (debug_mode) + fprintf(stderr, "%s %u:%hu > %u:%hu seqno=%u time=%lu\n", + caller_type == 0 ? "nflog" : "psock", + ntohl(ip_src), ntohs(tcp_src), + ntohl(ip_dst), ntohs(tcp_dst), + ntohl(seqno), tstamp); +} + +static bool +tos_match(uint8_t tos) +{ + if (!tos_filter) + return true; + + if (tos & tos_mask) + return true; + + if (tos == tos_mask) + return true; + + return false; +} + +static void +packet_callback(struct tpacket2_hdr *tp, void *pkt) +{ + struct ethhdr *eth; + struct iphdr *iph; + struct tcphdr *tcph; + eth = pkt; + + if (eth->h_proto != htons(ETH_P_IP)) + return; + + iph = pkt + sizeof(*eth); + + /* TODO: support IPv6 */ + if (iph->version != 4) + error(1, 0, "bug in parsing ip header"); + + if (iph->protocol != IPPROTO_TCP) + return; + if (!tos_match(iph->tos)) + return; + + tcph = ((void *) iph) + (iph->ihl << 2); + packet_process(iph->saddr, iph->daddr, + tcph->source, tcph->dest, + tcph->seq, + (1000LL * 1000 * tp->tp_sec) + tp->tp_nsec / 1000, + 1); + + count_psock++; +} + +static void +nflog_callback(const void *data, unsigned int len, + uint64_t ts_sec, uint64_t ts_usec) +{ + const struct iphdr *iph = data; + const struct tcphdr *tcph; + + if (!len) + return; + + if (iph->version != 4) + error(1, 0, "bug in parsing ip header"); + if ((iph->ihl << 2) + sizeof(*tcph) > len) + error(1, 0, "nflog snaplen too small"); + + if (iph->protocol != IPPROTO_TCP) + return; + if (!tos_match(iph->tos)) + return; + + tcph = ((void *) iph) + (iph->ihl << 2); + packet_process(iph->saddr, iph->daddr, + tcph->source, tcph->dest, + tcph->seq, + (1000LL * 1000 * ts_sec) + ts_usec, + 0); + + count_nflog++; +} + +static void +sigalrm_handler(int signum) +{ + log_show(); + alarm(ival); +} + +static void +sigint_handler(int signum) +{ + + if (exit_hard) + exit(1); + + /* first try to exit gracefully based in EINTR in poll */ + exit_hard = 1; +} + +static void +__init(void) +{ + logs[0] = malloc(log_len * sizeof(logs[0][0])); + logs[1] = malloc(log_len * sizeof(logs[0][0])); + table = calloc(table_len, sizeof(struct table_elem)); + if (!logs[0] || !logs[1] || !table) + error(1, 0, "alloc"); +} + +static void +__exit(void) +{ + free(table); + free(logs[1]); + free(logs[0]); +} + +static void __attribute__((noreturn)) +usage(const char *filepath) +{ + fprintf(stderr, "usage: %s [-bdfFhqvx] [-c count] [-i iface] [-l loglen] [-L tbllen] [-t ival]\n" + "\n" + "where\n" + " -b sets bonded mode: latency in slave device tc\n" + " -c sets capture queue length (in packets)\n" + " -d debug mode, displays individual records\n" + " -f filter by TOS bits (pass as base 10 or 16)\n" + " -h to show this message and exits\n" + " -i interface (default: eth0)\n" + " -l sets the timestamp log length\n" + " -L sets the tcp segment hashtable length\n" + " -q quiet: suppresses more output\n" + " -t sets the display interval (secs)\n" + " -v sets the verbose option\n" + " -x show extended stats: #matched, collisions, ..\n", + filepath); + exit(1); +} + +static void +parse_opt(int argc, char **argv) +{ + int c; + + while ((c = getopt (argc, argv, "bc:df:hi:l:L:qt:vx")) != -1) + { + switch (c) { + case 'b': + bond_mode = 1; + break; + case 'c': + frame_count = strtoul(optarg, NULL, 10); + break; + case 'd': + debug_mode = 1; + break; + case 'f': + tos_mask = strtoul(optarg, NULL, 0); + tos_filter = true; + break; + case 'h': + usage(argv[0]); + break; + case 'i': + strncpy(dev, optarg, IFNAMSIZ); + break; + case 'l': + log_len = strtoul(optarg, NULL, 10); + break; + case 'L': + table_len = strtoul(optarg, NULL, 10); + break; + case 'q': + if (verbose > 0) + error(1, 0, "pass -q or -v"); + verbose = -1; + break; + case 't': + ival = strtoul(optarg, NULL, 10); + break; + case 'v': + if (verbose < 0) + error(1, 0, "pass -q or -v"); + verbose = 1; + break; + case 'x': + show_extended = 1; + break; + } + } + + if (verbose > 0) { + fprintf(stderr, "mode: %s\n", bond_mode ? "bond" : dev); + fprintf(stderr, "log_len: %u\n", log_len); + fprintf(stderr, "table_len: %u\n", table_len); + fprintf(stderr, "frame_count: %u\n", frame_count); + fprintf(stderr, "frame_size: %u\n", frame_size); + fprintf(stderr, "interval: %u\n", ival); + if (tos_filter) + fprintf(stderr, "tos mask: 0x%x\n", tos_mask); + } +} + +/* @return 1 if data ready, 0 to exit */ +static int do_wait(int fd1, int fd2) +{ + struct pollfd pollset[3]; + int ret; + + pollset[0].fd = 0; + pollset[0].events = POLLIN; + pollset[0].revents = 0; + + pollset[1].fd = fd1; + pollset[1].events = POLLIN; + pollset[1].revents = 0; + + pollset[2].fd = fd2; + pollset[2].events = POLLIN; + pollset[2].revents = 0; + + /* minor race with entering poll(), below */ + if (exit_hard) + return 0; + + ret = poll(pollset, fd2 >= 0 ? 3 : 2, 100); + if (ret < 0 && errno != EINTR) + error(1, errno, "poll()"); + + if (ret > 0 && pollset[0].revents) + return 0; + + return 1; +} + +#define IPT_RULE " -m time -j NFLOG --nflog-group=10 --nflog-threshold=1" +static void __exit_nflog(void) +{ + if (verbose > 0) + system("iptables -v -nL OUTPUT | grep NFLOG"); + if (system("iptables -D OUTPUT " IPT_RULE)) { + error(1, 0, "error while removing log module"); + } +} + +/* + * System configuration change: insert an iptables rule. + * Ensure rollback with atexit() (though this fails with SIGINT, ..) + */ +static void __init_nflog(void) +{ + int ret; + + ret = system("iptables -L OUTPUT | grep -q NFLOG"); + if (ret == -1) + error(1, 0, "read iptables"); + if (WEXITSTATUS(ret) == 0) + error(1, 0, "log module still loaded? try iptables -L"); + + if (system("iptables -A OUTPUT" IPT_RULE)) { + __exit_nflog(); + error(1, 0, "load log module"); + } + atexit(__exit_nflog); +} + +static void __main(void) +{ + struct psock ps; + int logfd; + + memset(&ps, 0, sizeof(ps)); + ps.frame_count = frame_count; + ps.frame_size = frame_size; + /* + * in normal mode, get timestamp at ip layer and eth0 dequeue + * in bond mode, get timestamp at bond0 and eth0 dequeue. + * + * filter psock on eth0 if calculating latency from ip to eth0. + * else, do not filter to read packet on both master and slave, + * but disable nflog. + */ + if (bond_mode) { + logfd = -1; + } else { + /* + * snaplen must be smaller than PKTLEN in nflog_read + * or packets that are > PKTLEN && <= snaplen are dropped + */ + const int snaplen = 60; + + logfd = nflog_init(snaplen); + ps.dev = dev; + } + + psock_init(&ps); + + while (do_wait(ps.fd, logfd)) { + if (logfd != -1) + while (nflog_read(logfd, nflog_callback)) {} + while (psock_read(&ps, packet_callback)) {} + } + + psock_exit(&ps); + if (logfd != -1) + nflog_exit(logfd); +} + +static void +print_header(void) +{ +#define MAIN_HEADER "latency: 50 90 99 (%% us)" +#define EXTRA_HEADER " #total #matches #collis. #tblkeys" + + fprintf(stderr, "\npress Enter to exit\n" + "\n. indicates insufficient data\n" + "\n"); + + if (show_extended) + fprintf(stderr, MAIN_HEADER EXTRA_HEADER "\n"); + else + fprintf(stderr, MAIN_HEADER "\n"); +} + +int +main(int argc, char **argv) +{ + if (verbose >= 0) + fprintf(stderr, "tcplate v1.2: measure traffic shaping TCP latency\n"); + + parse_opt(argc, argv); + + if (verbose >= 0) + print_header(); + + __init(); + + signal(SIGALRM, &sigalrm_handler); + signal(SIGINT, &sigint_handler); + + alarm(ival); + + __init_nflog(); + __main(); + __exit(); + + return 0; +} +