NETWORKING MASTERY · PHASE 4 · MODULE 15 · WEEK 13
🔌 Socket Programming
POSIX sockets · TCP/UDP patterns · Non-blocking I/O · epoll · SO_REUSEPORT · Raw sockets · Socket options
Intermediate → Advanced Prerequisite: M14 Linux Stack POSIX.1-2017 C / Systems Programming 3 Labs

THE POSIX SOCKET API — THE GATEWAY TO THE NETWORK

🔌

Socket Fundamentals

BASICS
/* socket() — create a socket */
int fd = socket(domain, type, protocol);

domain:   AF_INET (IPv4), AF_INET6 (IPv6), AF_UNIX (local), AF_PACKET (raw L2)
type:     SOCK_STREAM (TCP), SOCK_DGRAM (UDP), SOCK_RAW (raw IP/L2)
protocol: Usually 0 (auto-select). IPPROTO_TCP, IPPROTO_UDP, IPPROTO_ICMP

/* Address structures */
struct sockaddr_in {               /* IPv4 */
    sa_family_t    sin_family;      /* AF_INET */
    in_port_t      sin_port;        /* htons(port) — network byte order! */
    struct in_addr sin_addr;        /* .s_addr = htonl(INADDR_ANY) or inet_addr("1.2.3.4") */
};

struct sockaddr_in6 {              /* IPv6 */
    sa_family_t     sin6_family;    /* AF_INET6 */
    in_port_t       sin6_port;      /* htons(port) */
    uint32_t        sin6_flowinfo;
    struct in6_addr sin6_addr;      /* IPv6 address (16 bytes) */
    uint32_t        sin6_scope_id;
};

/* Byte order — critical! */
htons(x):  host-to-network short (16-bit port numbers)
htonl(x):  host-to-network long  (32-bit IP addresses)
ntohs(x):  network-to-host short
ntohl(x):  network-to-host long
# Network byte order = big-endian
# x86 is little-endian → ALWAYS use htons/htonl for ports/IPs in structs

/* Dual-stack (IPv4+IPv6) */
int fd = socket(AF_INET6, SOCK_STREAM, 0);
int v6only = 0;
setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &v6only, sizeof(v6only));
/* Binding :: (IPv6 any) now accepts IPv4-mapped IPv6 addresses (::ffff:x.x.x.x) */

TCP SERVER PATTERNS

🖥️

Complete TCP Server Template

TCP SERVER
#include <arpa/inet.h>
#include <errno.h>
#include <netdb.h>
#include <netinet/in.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

void handle_client(int fd);   /* defined below — declare so this compiles standalone */

/* Dual-stack (IPv4+IPv6) TCP accept loop on `port`.
 * Returns -1 on setup failure; on success it never returns. */
int tcp_server(uint16_t port) {
    int lfd = socket(AF_INET6, SOCK_STREAM, 0);
    if (lfd < 0)
        return -1;

    int opt = 1;
    /* SO_REUSEADDR: allow bind while an old socket lingers in TIME_WAIT. */
    setsockopt(lfd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt));
    /* SO_REUSEPORT: multiple processes may bind the same port (kernel load-balances). */
    setsockopt(lfd, SOL_SOCKET, SO_REUSEPORT, &opt, sizeof(opt));
    /* Clear IPV6_V6ONLY so :: also accepts IPv4 clients as ::ffff:x.x.x.x
     * (matches the dual-stack pattern taught earlier in this module). */
    int v6only = 0;
    setsockopt(lfd, IPPROTO_IPV6, IPV6_V6ONLY, &v6only, sizeof(v6only));

    struct sockaddr_in6 addr = {0};
    addr.sin6_family = AF_INET6;
    addr.sin6_port   = htons(port);    /* network byte order! */
    addr.sin6_addr   = in6addr_any;    /* :: = any interface */

    if (bind(lfd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
        listen(lfd, 128) < 0) {        /* backlog: max pending connections in accept queue */
        close(lfd);
        return -1;
    }

    while (1) {
        struct sockaddr_in6 client;
        socklen_t clen = sizeof(client);
        int cfd = accept(lfd, (struct sockaddr *)&client, &clen);
        if (cfd < 0) {
            if (errno == EINTR)        /* interrupted by a signal: just retry */
                continue;
            close(lfd);                /* BUG FIX: original passed -1 on to handle_client/close */
            return -1;
        }
        /* cfd is a NEW socket for this connection; lfd still listens.
         * In production: fork(), a thread, or an epoll loop instead of serializing. */
        handle_client(cfd);
        close(cfd);
    }
}

/* Send exactly `len` bytes, looping over short writes.
 * Returns 0 on success, -1 on error (errno set by send). */
static int send_all(int fd, const char *buf, size_t len) {
    size_t sent = 0;
    while (sent < len) {
        ssize_t w = send(fd, buf + sent, len - sent, 0);
        if (w < 0) {
            if (errno == EINTR)
                continue;       /* interrupted before any byte moved: retry */
            return -1;
        }
        sent += (size_t)w;
    }
    return 0;
}

/* Echo loop: read until the peer closes, writing every byte back. */
void handle_client(int fd) {
    char buf[4096];
    ssize_t n;
    /* CRITICAL: recv may return LESS than requested — stream sockets have
     * no message boundaries, so loop until EOF or error. */
    while ((n = recv(fd, buf, sizeof(buf), 0)) > 0) {
        /* BUG FIX: send() can also write fewer than n bytes; the original
         * called send() once and could silently drop the tail of the echo. */
        if (send_all(fd, buf, (size_t)n) < 0)
            break;
    }
    /* n == 0: peer closed connection (FIN received) */
    /* n == -1: error (check errno: EAGAIN, ECONNRESET, etc.) */
}

/* TCP client */
/* Resolve host:port and connect over TCP (IPv4 or IPv6, whichever resolves).
 * Returns a connected socket fd, or -1 if resolution fails or every
 * candidate address refuses the connection. */
int tcp_connect(const char *host, uint16_t port) {
    struct addrinfo hints = {0}, *res, *ai;
    hints.ai_family   = AF_UNSPEC;     /* v4 or v6 */
    hints.ai_socktype = SOCK_STREAM;
    char portstr[8];
    snprintf(portstr, sizeof(portstr), "%u", port);

    /* BUG FIX: original ignored getaddrinfo failure and dereferenced res. */
    if (getaddrinfo(host, portstr, &hints, &res) != 0)
        return -1;

    int fd = -1;
    /* BUG FIX: original tried only the first result and never checked
     * connect() — it could return an unconnected fd. Try each address. */
    for (ai = res; ai != NULL; ai = ai->ai_next) {
        fd = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol);
        if (fd < 0)
            continue;
        if (connect(fd, ai->ai_addr, ai->ai_addrlen) == 0)
            break;                     /* connected */
        close(fd);                     /* this address failed: clean up, try next */
        fd = -1;
    }
    freeaddrinfo(res);
    return fd;                         /* -1 if no address worked */
}

UDP PROGRAMMING

📡

UDP Socket — sendto/recvfrom

UDP
/* UDP server — connectionless, per-datagram source address */
/* Connectionless UDP echo server: every datagram carries its own source
 * address, which recvfrom() hands back so the reply can be addressed. */
int udp_server(uint16_t port) {
    int fd = socket(AF_INET, SOCK_DGRAM, 0);

    struct sockaddr_in local = {0};
    local.sin_family      = AF_INET;
    local.sin_port        = htons(port);
    local.sin_addr.s_addr = INADDR_ANY;
    bind(fd, (struct sockaddr *)&local, sizeof(local));

    /* Big enough for any UDP datagram (max IPv4 payload is 65507 bytes). */
    char buf[65536];
    for (;;) {
        struct sockaddr_in peer;
        socklen_t peer_len = sizeof(peer);
        /* One recvfrom = one complete datagram — UDP preserves message
         * boundaries, unlike TCP's byte stream. */
        ssize_t got = recvfrom(fd, buf, sizeof(buf), 0,
                               (struct sockaddr *)&peer, &peer_len);
        /* peer now holds this packet's source IP:port — echo straight back. */
        sendto(fd, buf, got, 0, (struct sockaddr *)&peer, peer_len);
    }
}

/* UDP multicast sender */
/* Create a UDP socket and send one datagram to a multicast group.
 * Returns the socket fd, or -1 on setup failure.
 *
 * BUG FIX: the original called IP_ADD_MEMBERSHIP here. Joining a group is
 * only needed to RECEIVE multicast — a sender never joins. What a sender
 * configures is the outgoing interface (IP_MULTICAST_IF) and TTL. */
int udp_multicast_send(const char *group, uint16_t port) {
    int fd = socket(AF_INET, SOCK_DGRAM, 0);
    if (fd < 0)
        return -1;

    /* Choose the outgoing interface (INADDR_ANY = let routing decide). */
    struct in_addr iface = { .s_addr = htonl(INADDR_ANY) };
    setsockopt(fd, IPPROTO_IP, IP_MULTICAST_IF, &iface, sizeof(iface));

    /* TTL for multicast (default 1 = link-local only). */
    unsigned char ttl = 32;
    setsockopt(fd, IPPROTO_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl));

    struct sockaddr_in dst = {0};
    dst.sin_family = AF_INET;
    dst.sin_port   = htons(port);
    if (inet_aton(group, &dst.sin_addr) == 0) {   /* not a valid dotted quad */
        close(fd);
        return -1;
    }
    /* Best-effort send (unchecked, as in the original teaching snippet);
     * may fail with ENETUNREACH if no multicast route exists. */
    sendto(fd, "hello multicast", 15, 0,
           (struct sockaddr *)&dst, sizeof(dst));
    return fd;
}

NON-BLOCKING I/O AND epoll

epoll — Scalable I/O Multiplexing

EPOLL

The classic select() and poll() have O(n) scan overhead — with 10,000 fds, every call scans all 10,000 even if only 1 is ready. epoll maintains a kernel-side interest list, so the cost of each epoll_wait() call is O(k), where k is the number of events that are actually ready — independent of the total number of monitored fds.

#include <sys/epoll.h>
#include <fcntl.h>

/* Set fd to non-blocking */
/* Put fd into non-blocking mode. Best-effort, matching the original
 * contract (callers ignore failure). */
void set_nonblocking(int fd) {
    int flags = fcntl(fd, F_GETFL, 0);
    if (flags == -1)
        return;   /* BUG FIX: original would OR O_NONBLOCK into -1 on bad fds */
    fcntl(fd, F_SETFL, flags | O_NONBLOCK);
}

/* Create epoll instance and event loop */
/* epoll event loop over listening socket lfd: accepts connections and
 * drains readable clients (edge-triggered). Returns -1 only on fatal
 * epoll failure; otherwise loops forever. */
int epoll_server(int lfd) {
    int epfd = epoll_create1(0);
    if (epfd < 0)
        return -1;

    /* Add listener to epoll (level-triggered, so a single accept per wakeup is fine). */
    struct epoll_event ev = { .events = EPOLLIN, .data.fd = lfd };
    epoll_ctl(epfd, EPOLL_CTL_ADD, lfd, &ev);

    struct epoll_event events[1024];
    while (1) {
        int n = epoll_wait(epfd, events, 1024, -1);  /* -1 = block forever */
        if (n < 0) {
            if (errno == EINTR)
                continue;                 /* interrupted by a signal: retry */
            return -1;
        }
        for (int i = 0; i < n; i++) {
            int fd = events[i].data.fd;
            if (fd == lfd) {
                /* New connection */
                int cfd = accept(lfd, NULL, NULL);
                if (cfd < 0)
                    continue;             /* transient accept failure */
                set_nonblocking(cfd);
                struct epoll_event cev = {
                    .events = EPOLLIN | EPOLLET,  /* edge-triggered */
                    .data.fd = cfd
                };
                epoll_ctl(epfd, EPOLL_CTL_ADD, cfd, &cev);
            } else {
                /* Data ready on existing connection.
                 * Edge-triggered: MUST read until EAGAIN or the event
                 * will never be reported again. */
                char buf[4096];
                ssize_t nr;
                while ((nr = recv(fd, buf, sizeof(buf), 0)) > 0)
                    process(buf, nr);
                if (nr == 0 ||
                    (nr < 0 && errno != EAGAIN && errno != EWOULDBLOCK)) {
                    /* Peer closed (0) or fatal error — release the fd.
                     * BUG FIX: original only cleaned up on nr == 0, leaking
                     * the socket on real recv errors (e.g. ECONNRESET). */
                    epoll_ctl(epfd, EPOLL_CTL_DEL, fd, NULL);
                    close(fd);
                }
                /* nr == -1 && errno == EAGAIN: no more data right now */
            }
        }
    }
}

/* Edge-triggered vs Level-triggered */
# EPOLLET (edge): notify ONCE when state changes (unread→readable)
#   Must read ALL data immediately or it won't be reported again
#   Higher performance (fewer epoll_wait wakeups)
# Level (default): notify every time data is available
#   Easier to code correctly; acceptable for most applications

SOCKET OPTIONS — TUNING FOR PERFORMANCE

⚙️

Critical Socket Options

SOCKOPTS
OptionLevelEffectWhen to Use
SO_REUSEADDRSOL_SOCKETAllow bind to port in TIME_WAIT stateAlways on servers — prevents "address already in use" after restart
SO_REUSEPORTSOL_SOCKETMultiple sockets bind same IP:port; kernel load-balancesMulti-process/thread servers (Nginx, high-performance servers)
SO_KEEPALIVESOL_SOCKETSend TCP keepalive probes; detect dead connectionsLong-lived connections; detect peer disappear without data
SO_RCVBUFSOL_SOCKETSet receive buffer size (kernel doubles the value)High-bandwidth connections; increase for long fat networks
SO_SNDBUFSOL_SOCKETSet send buffer sizeHigh-throughput senders; typically let autotuning manage
TCP_NODELAYIPPROTO_TCPDisable Nagle's algorithm — send immediatelyLow-latency protocols (RPC, gaming, trading); penalises small writes
TCP_CORKIPPROTO_TCPBuffer all data until cork removed or MSS reachedHTTP/file transfers — batch headers+body into one segment
TCP_QUICKACKIPPROTO_TCPTemporarily disable delayed ACK (one-shot — the kernel may re-enable it, so re-arm after each receive)Latency-sensitive request-response protocols
IP_TOSIPPROTO_IPSet DSCP/TOS field in outgoing IP packetsQoS marking for VoIP, streaming, or traffic shaping
SO_TIMESTAMPSOL_SOCKETReceive hardware/kernel timestamp with each packet via cmsgLatency measurement, PTP, network monitoring
/* Setting socket options */
int opt = 1;
setsockopt(fd, SOL_SOCKET,   SO_REUSEADDR, &opt, sizeof(opt));
setsockopt(fd, SOL_SOCKET,   SO_REUSEPORT, &opt, sizeof(opt));
setsockopt(fd, IPPROTO_TCP,  TCP_NODELAY,  &opt, sizeof(opt));

int bufsize = 4 * 1024 * 1024;  /* 4MB */
setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &bufsize, sizeof(bufsize));
setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &bufsize, sizeof(bufsize));

/* TCP keepalive tuning */
int idle = 60, interval = 10, count = 3;
setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE,  &idle,     sizeof(idle));
setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &interval, sizeof(interval));
setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT,   &count,    sizeof(count));
/* After 60s idle: send probe every 10s, 3 times → declare dead after 30s */

/* Read back effective buffer size */
int actual; socklen_t alen = sizeof(actual);
getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &actual, &alen);
/* actual may be 2× requested (kernel doubles for overhead) */

RAW SOCKETS — CRAFTING ARBITRARY IP PACKETS

Raw IP Sockets

RAW SOCKETS
/* Raw socket — receive ALL IP packets of a given protocol */
/* Requires CAP_NET_RAW or root */
int fd = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP);  /* all ICMP */
int fd = socket(AF_INET, SOCK_RAW, IPPROTO_TCP);   /* all TCP (also received by TCP stack) */
int fd = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);   /* send-only; craft own IP header */
/* NOTE(review): the three lines above are illustrative ALTERNATIVES —
 * real code picks one; redefining fd would not compile. */

/* Send a custom ICMP echo request */
struct {
    struct icmphdr hdr;
    char           data[56];   /* classic 56-byte ping payload */
} pkt;
/* NOTE(review): pkt.data is never initialized, so the checksum below
 * covers garbage bytes — memset or fill it with a pattern first. */
pkt.hdr.type     = ICMP_ECHO;
pkt.hdr.code     = 0;
pkt.hdr.un.echo.id  = htons(getpid());   /* pid_t is wider than 16 bits; value is truncated */
pkt.hdr.un.echo.sequence = htons(1);
pkt.hdr.checksum = 0;                    /* checksum field must be zero while computing it */
pkt.hdr.checksum = checksum(&pkt, sizeof(pkt));  /* RFC 1071 Internet checksum — helper defined elsewhere */

struct sockaddr_in dst;
dst.sin_family = AF_INET;
inet_aton("8.8.8.8", &dst.sin_addr);

int raw = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP);
/* Kernel builds the IP header for us here (IP_HDRINCL not set). */
sendto(raw, &pkt, sizeof(pkt), 0, (struct sockaddr *)&dst, sizeof(dst));

/* Receive: kernel prepends IP header on recvfrom */
char rbuf[1024];
recv(raw, rbuf, sizeof(rbuf), 0);
struct iphdr   *ip   = (struct iphdr *)rbuf;
struct icmphdr *icmp = (struct icmphdr *)(rbuf + ip->ihl * 4);  /* ihl = header length in 32-bit words */

/* IP_HDRINCL — you supply your own IP header */
int opt = 1;
setsockopt(raw, IPPROTO_IP, IP_HDRINCL, &opt, sizeof(opt));
/* Now sendto() includes your crafted IP header in the buffer */

AF_PACKET — RAW LAYER 2 SOCKET

🔌

AF_PACKET — Capture and Inject Raw Frames

AF_PACKET
/* AF_PACKET — access raw Ethernet frames (Linux-only; needs CAP_NET_RAW) */
/* Foundation of tcpdump, Wireshark, and custom packet generators */
#include <linux/if_packet.h>
#include <net/ethernet.h>
#include <net/if.h>

/* Open raw L2 socket — receives ALL Ethernet frames */
/* htons(ETH_P_ALL): the protocol filter is in network byte order */
int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));

/* Bind to specific interface */
struct sockaddr_ll sll = {0};
sll.sll_family   = AF_PACKET;
sll.sll_ifindex  = if_nametoindex("eth0");   /* returns 0 if the interface doesn't exist — check it */
sll.sll_protocol = htons(ETH_P_ALL);
bind(fd, (struct sockaddr *)&sll, sizeof(sll));

/* Set promiscuous mode (receive frames not destined for us) */
/* PACKET_ADD_MEMBERSHIP is preferred over the legacy ioctl: the kernel
 * reference-counts it and reverts it when the socket closes. */
struct packet_mreq mreq = {0};
mreq.mr_ifindex = sll.sll_ifindex;
mreq.mr_type    = PACKET_MR_PROMISC;
setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mreq, sizeof(mreq));

/* Receive raw Ethernet frame */
unsigned char frame[2048];
ssize_t n = recv(fd, frame, sizeof(frame), 0);
struct ethhdr *eth = (struct ethhdr *)frame;
/* eth->h_dest, eth->h_source, eth->h_proto (h_proto is big-endian — use ntohs) */

/* PACKET_MMAP — zero-copy ring buffer for high-speed capture */
/* Maps a kernel-owned ring buffer shared with this socket into the process
 * address space — NOT the NIC's DMA ring, which stays inside the driver. */
/* Used by tcpdump/libpcap for high-performance capture */
struct tpacket_req req = {
    .tp_block_size = 4096,   /* must be a multiple of PAGE_SIZE */
    .tp_block_nr   = 64,
    .tp_frame_size = 2048,
    .tp_frame_nr   = 128     /* must equal (block_size / frame_size) * block_nr */
};
setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
void *ring = mmap(NULL, req.tp_block_size * req.tp_block_nr,
                  PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
/* Poll ring directly — no syscall per packet */

💡 AF_PACKET is how tcpdump/libpcap work. Every packet you've ever captured with Wireshark passed through an AF_PACKET socket. The PACKET_MMAP extension maps a kernel-owned ring buffer into userspace, eliminating the per-packet copy and syscall — this is how libpcap achieves high capture rates. Your DPDK knowledge directly informs why this is still slower than full kernel bypass: each packet still traverses the kernel network stack before landing in the ring.

LAB 1

Multi-Connection TCP Server with epoll

Objective: Build a fully functional non-blocking TCP echo server using epoll that handles 1000+ simultaneous connections without threads.

1
Implement the epoll event loop from the code in Tab 3. Use EPOLLET (edge-triggered) mode. Handle: new connections (accept), incoming data (read loop until EAGAIN), connection close (EPOLLHUP/recv returns 0), errors (EPOLLERR).
2
Set all accepted sockets to non-blocking with fcntl(fd, F_SETFL, O_NONBLOCK). Set TCP_NODELAY and SO_REUSEPORT. Add a per-connection state structure (track bytes received, connection ID).
3
Load test with ab -n 100000 -c 1000 http://localhost:8080/ or a custom C client. Measure: connections/second, max concurrent connections, memory per connection. Compare with a fork-per-connection server under the same load.
4
Add SO_REUSEPORT: run 4 instances of your server on the same port (set different process IDs). Verify with ss -tlnp | grep 8080 that all 4 are bound. Use ab to send 40,000 requests and verify even distribution across processes.
LAB 2

Raw Packet Craft and AF_PACKET Capture

Objective: Write a minimal packet sniffer using AF_PACKET, then craft custom ICMP packets with raw sockets.

1
Write a packet sniffer: AF_PACKET socket, promiscuous mode, read loop. For each received frame: print timestamp, Ethernet src/dst MACs, EtherType. If EtherType=0x0800 (IPv4), also parse the IP header (src/dst IP, protocol, TTL). If protocol=ICMP, print type/code.
2
Run your sniffer and generate traffic: ping, curl a website, start a TCP connection. Verify your sniffer correctly identifies all frame types. Compare output with tcpdump running in parallel on the same interface.
3
Write an ICMP ping with raw sockets (SOCK_RAW, IPPROTO_ICMP). Calculate the ICMP checksum. Send to 8.8.8.8 and receive the reply. Parse the reply to extract RTT (measure time between send and receive). Implement 5 pings and show min/avg/max RTT.
LAB 3

Socket Performance Benchmarking

Objective: Measure the impact of socket options on latency and throughput.

1
Write a benchmark that sends 1-byte request, receives 1-byte response, measures RTT. Test with TCP_NODELAY on vs off. Expected: TCP_NODELAY on = ~0.2ms; off = up to 40ms (Nagle delay). The 200× difference illustrates why TCP_NODELAY matters for latency-sensitive code.
2
Measure SO_RCVBUF impact on throughput: send 1GB over TCP with recv buffer at 4KB vs 256KB vs 4MB. Use iperf3 as reference. Explain why buffer size affects throughput on a high-latency link (BDP = bandwidth × delay).
3
Compare select vs poll vs epoll with 1000 file descriptors: open 1000 socket-pairs, use each API to wait for activity on all 1000. Measure time per call. Document the O(n) vs O(1) difference empirically.

M15 MASTERY CHECKLIST

When complete: Move to M16 - eBPF and XDP — the most exciting recent addition to the Linux networking toolkit, enabling programmable packet processing without kernel modifications.

← M14 Linux Stack 🗺️ Roadmap Next: M16 - eBPF/XDP →