THE POSIX SOCKET API — THE GATEWAY TO THE NETWORK
Socket Fundamentals
BASICS/* socket() — create a socket */ int fd = socket(domain, type, protocol); domain: AF_INET (IPv4), AF_INET6 (IPv6), AF_UNIX (local), AF_PACKET (raw L2) type: SOCK_STREAM (TCP), SOCK_DGRAM (UDP), SOCK_RAW (raw IP/L2) protocol: Usually 0 (auto-select). IPPROTO_TCP, IPPROTO_UDP, IPPROTO_ICMP /* Address structures */ struct sockaddr_in { /* IPv4 */ sa_family_t sin_family; /* AF_INET */ in_port_t sin_port; /* htons(port) — network byte order! */ struct in_addr sin_addr; /* .s_addr = htonl(INADDR_ANY) or inet_addr("1.2.3.4") */ }; struct sockaddr_in6 { /* IPv6 */ sa_family_t sin6_family; /* AF_INET6 */ in_port_t sin6_port; /* htons(port) */ uint32_t sin6_flowinfo; struct in6_addr sin6_addr; /* IPv6 address (16 bytes) */ uint32_t sin6_scope_id; }; /* Byte order — critical! */ htons(x): host-to-network short (16-bit port numbers) htonl(x): host-to-network long (32-bit IP addresses) ntohs(x): network-to-host short ntohl(x): network-to-host long # Network byte order = big-endian # x86 is little-endian → ALWAYS use htons/htonl for ports/IPs in structs /* Dual-stack (IPv4+IPv6) */ int fd = socket(AF_INET6, SOCK_STREAM, 0); int v6only = 0; setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &v6only, sizeof(v6only)); /* Binding :: (IPv6 any) now accepts IPv4-mapped IPv6 addresses (::ffff:x.x.x.x) */
TCP SERVER PATTERNS
Complete TCP Server Template
TCP SERVER#include <sys/socket.h>
#include <netinet/in.h>
#include <netdb.h>
#include <unistd.h>
#include <string.h>
#include <stdio.h>
#include <stdint.h>

/* TCP server + client templates (IPv6 listener, dual-stack on Linux).
 *
 * Fixes over the original sketch:
 *  - handle_client is forward-declared (it was called before its definition)
 *  - send() is looped via send_all(): a short send no longer drops bytes
 *  - getaddrinfo/bind/listen/connect results are checked
 *  - tcp_connect() walks the whole getaddrinfo result list
 */
void handle_client(int fd);

/* Loop send() until all len bytes are written. Returns 0, or -1 on error. */
static int send_all(int fd, const void *buf, size_t len)
{
    const char *p = buf;
    while (len > 0) {
        ssize_t n = send(fd, p, len, 0);
        if (n < 0)
            return -1;
        p += n;
        len -= (size_t)n;
    }
    return 0;
}

/* Create a listener on `port` and serve clients one at a time.
 * Returns -1 on setup failure; otherwise loops forever. */
int tcp_server(uint16_t port)
{
    int lfd = socket(AF_INET6, SOCK_STREAM, 0);
    if (lfd < 0)
        return -1;

    /* SO_REUSEADDR: allow bind to a port still in TIME_WAIT.
     * SO_REUSEPORT: let multiple processes bind the same port
     * (kernel load-balances incoming connections between them). */
    int opt = 1;
    setsockopt(lfd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt));
    setsockopt(lfd, SOL_SOCKET, SO_REUSEPORT, &opt, sizeof(opt));

    struct sockaddr_in6 addr = {0};
    addr.sin6_family = AF_INET6;
    addr.sin6_port   = htons(port);   /* network byte order! */
    addr.sin6_addr   = in6addr_any;   /* :: = any interface */

    if (bind(lfd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
        listen(lfd, 128) < 0) {       /* 128 = accept-queue backlog */
        close(lfd);
        return -1;
    }

    for (;;) {
        struct sockaddr_in6 client;
        socklen_t clen = sizeof(client);
        int cfd = accept(lfd, (struct sockaddr *)&client, &clen);
        if (cfd < 0)
            continue;                 /* e.g. EINTR — keep serving */
        /* cfd is a NEW socket for this connection; lfd keeps listening.
         * In production: hand cfd to a fork()ed child or a thread. */
        handle_client(cfd);
        close(cfd);
    }
}

/* Echo every byte back to the peer until it closes or errors. */
void handle_client(int fd)
{
    char buf[4096];
    ssize_t n;
    /* CRITICAL: recv on a stream may return LESS than requested — loop. */
    while ((n = recv(fd, buf, sizeof(buf), 0)) > 0) {
        /* send may also write less than n — send_all() loops it */
        if (send_all(fd, buf, (size_t)n) < 0)
            break;
    }
    /* n == 0: peer closed (FIN received) */
    /* n == -1: error (check errno: EAGAIN, ECONNRESET, etc.) */
}

/* TCP client: connect to host:port (IPv4 or IPv6, whichever resolves).
 * Returns a connected fd, or -1 on failure. */
int tcp_connect(const char *host, uint16_t port)
{
    struct addrinfo hints = {0}, *res, *ai;
    hints.ai_family   = AF_UNSPEC;    /* try both v4 and v6 */
    hints.ai_socktype = SOCK_STREAM;

    char portstr[8];
    snprintf(portstr, sizeof(portstr), "%u", port);
    if (getaddrinfo(host, portstr, &hints, &res) != 0)
        return -1;

    int fd = -1;
    for (ai = res; ai != NULL; ai = ai->ai_next) {  /* try each address */
        fd = socket(ai->ai_family, ai->ai_socktype, ai->ai_protocol);
        if (fd < 0)
            continue;
        if (connect(fd, ai->ai_addr, ai->ai_addrlen) == 0)
            break;                    /* connected */
        close(fd);                    /* this address failed; try the next */
        fd = -1;
    }
    freeaddrinfo(res);
    return fd;
}
UDP PROGRAMMING
UDP Socket — sendto/recvfrom
UDP/* UDP server — connectionless, per-datagram source address.
 *
 * Fix over the original sketch: a multicast SENDER does not join the group
 * (IP_ADD_MEMBERSHIP is the receiver-side join); it selects the outgoing
 * interface with IP_MULTICAST_IF and sets the multicast TTL.
 */
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <unistd.h>
#include <stdint.h>

/* Bind a UDP socket on `port` and echo each datagram back to its sender.
 * Returns -1 on setup failure; otherwise loops forever. */
int udp_server(uint16_t port)
{
    int fd = socket(AF_INET, SOCK_DGRAM, 0);
    if (fd < 0)
        return -1;

    struct sockaddr_in addr = {0};
    addr.sin_family = AF_INET;
    addr.sin_port = htons(port);
    addr.sin_addr.s_addr = INADDR_ANY;
    if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
        close(fd);
        return -1;
    }

    char buf[65536];   /* larger than the max UDP payload (65507 bytes) */
    for (;;) {
        struct sockaddr_in client;
        socklen_t clen = sizeof(client);
        ssize_t n = recvfrom(fd, buf, sizeof(buf), 0,
                             (struct sockaddr *)&client, &clen);
        if (n < 0)
            continue;
        /* n == complete datagram size — UDP preserves message boundaries.
         * client holds the source IP+port for this packet. */
        sendto(fd, buf, (size_t)n, 0, (struct sockaddr *)&client, clen);
    }
}

/* UDP multicast sender: send one datagram to group:port.
 * Returns the socket fd (caller closes), or -1 on failure. */
int udp_multicast_send(const char *group, uint16_t port)
{
    int fd = socket(AF_INET, SOCK_DGRAM, 0);
    if (fd < 0)
        return -1;

    /* Select the outgoing interface for multicast (INADDR_ANY lets the
     * kernel pick via the routing table). NOTE: IP_ADD_MEMBERSHIP is only
     * needed to RECEIVE multicast — not to send. */
    struct in_addr ifaddr = { .s_addr = htonl(INADDR_ANY) };
    setsockopt(fd, IPPROTO_IP, IP_MULTICAST_IF, &ifaddr, sizeof(ifaddr));

    /* TTL for multicast (default 1 = link-local only) */
    unsigned char ttl = 32;
    setsockopt(fd, IPPROTO_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl));

    struct sockaddr_in dst = {0};
    dst.sin_family = AF_INET;
    dst.sin_port = htons(port);
    if (inet_aton(group, &dst.sin_addr) == 0) {  /* invalid group address */
        close(fd);
        return -1;
    }

    const char msg[] = "hello multicast";
    /* sizeof(msg) - 1 instead of a hard-coded byte count */
    sendto(fd, msg, sizeof(msg) - 1, 0, (struct sockaddr *)&dst, sizeof(dst));
    return fd;
}
NON-BLOCKING I/O AND epoll
epoll — Scalable I/O Multiplexing
EPOLLThe classic select() and poll() have O(n) scan overhead — with 10,000 fds, every call scans all 10,000 even if only 1 is ready. epoll keeps the interest set in a kernel-side data structure and returns only the fds that are actually ready — each epoll_wait costs O(k) in the number k of ready events, independent of the total number of watched fds.
#include <sys/epoll.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>

/* Put fd into non-blocking mode, preserving its existing file flags
 * (F_SETFL with a bare O_NONBLOCK would clobber them). */
void set_nonblocking(int fd)
{
    int flags = fcntl(fd, F_GETFL, 0);
    if (flags >= 0)
        fcntl(fd, F_SETFL, flags | O_NONBLOCK);
}

/* Event loop serving the listening socket lfd with epoll.
 *
 * Fix over the original sketch: on a recv() error other than EAGAIN
 * (e.g. ECONNRESET) the connection is now removed from epoll and closed —
 * the original leaked the fd in that case.
 */
int epoll_server(int lfd)
{
    int epfd = epoll_create1(0);
    if (epfd < 0)
        return -1;

    /* Add listener to epoll (level-triggered) */
    struct epoll_event ev = { .events = EPOLLIN, .data.fd = lfd };
    epoll_ctl(epfd, EPOLL_CTL_ADD, lfd, &ev);

    struct epoll_event events[1024];
    for (;;) {
        int n = epoll_wait(epfd, events, 1024, -1);  /* -1 = block forever */
        for (int i = 0; i < n; i++) {
            if (events[i].data.fd == lfd) {
                /* New connection. Listener is level-triggered, so one
                 * accept per wakeup is sufficient. */
                int cfd = accept(lfd, NULL, NULL);
                if (cfd < 0)
                    continue;
                set_nonblocking(cfd);
                struct epoll_event cev = {
                    .events = EPOLLIN | EPOLLET,     /* edge-triggered */
                    .data.fd = cfd
                };
                epoll_ctl(epfd, EPOLL_CTL_ADD, cfd, &cev);
            } else {
                /* Data ready on an existing connection */
                int cfd = events[i].data.fd;
                char buf[4096];
                ssize_t nr;
                /* Edge-triggered: MUST drain until EAGAIN */
                while ((nr = recv(cfd, buf, sizeof(buf), 0)) > 0)
                    process(buf, nr);
                if (nr == 0 ||
                    (nr < 0 && errno != EAGAIN && errno != EWOULDBLOCK)) {
                    /* peer closed (0) or hard error — clean up the fd */
                    epoll_ctl(epfd, EPOLL_CTL_DEL, cfd, NULL);
                    close(cfd);
                }
                /* nr == -1 && errno == EAGAIN: drained; wait for next edge */
            }
        }
    }
}

/* Edge-triggered vs level-triggered:
 *   EPOLLET (edge): notify ONCE when state changes (unreadable → readable).
 *     Must read ALL data immediately or it won't be reported again.
 *     Higher performance (fewer epoll_wait wakeups).
 *   Level (default): notify every time data is available.
 *     Easier to code correctly; acceptable for most applications. */
SOCKET OPTIONS — TUNING FOR PERFORMANCE
Critical Socket Options
SOCKOPTS| Option | Level | Effect | When to Use |
|---|---|---|---|
SO_REUSEADDR | SOL_SOCKET | Allow bind to port in TIME_WAIT state | Always on servers — prevents "address already in use" after restart |
SO_REUSEPORT | SOL_SOCKET | Multiple sockets bind same IP:port; kernel load-balances | Multi-process/thread servers (Nginx, high-performance servers) |
SO_KEEPALIVE | SOL_SOCKET | Send TCP keepalive probes; detect dead connections | Long-lived connections; detect peer disappear without data |
SO_RCVBUF | SOL_SOCKET | Set receive buffer size (kernel doubles the value) | High-bandwidth connections; increase for long fat networks |
SO_SNDBUF | SOL_SOCKET | Set send buffer size | High-throughput senders; typically let autotuning manage |
TCP_NODELAY | IPPROTO_TCP | Disable Nagle's algorithm — send immediately | Low-latency protocols (RPC, gaming, trading); penalises small writes |
TCP_CORK | IPPROTO_TCP | Buffer all data until cork removed or MSS reached | HTTP/file transfers — batch headers+body into one segment |
TCP_QUICKACK | IPPROTO_TCP | Temporarily disable delayed ACK (not sticky — the kernel may re-enable it, so reset after receives) | Latency-sensitive request-response protocols |
IP_TOS | IPPROTO_IP | Set DSCP/TOS field in outgoing IP packets | QoS marking for VoIP, streaming, or traffic shaping |
SO_TIMESTAMP | SOL_SOCKET | Receive hardware/kernel timestamp with each packet via cmsg | Latency measurement, PTP, network monitoring |
/* Setting socket options */
/* NOTE(review): fragment assumes `fd` is an already-created socket. */
int opt = 1;
setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt));
setsockopt(fd, SOL_SOCKET, SO_REUSEPORT, &opt, sizeof(opt));
setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &opt, sizeof(opt));

/* Explicit buffer sizing disables kernel autotuning for this socket —
 * set before connect/listen so window scaling is negotiated accordingly. */
int bufsize = 4 * 1024 * 1024; /* 4MB */
setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &bufsize, sizeof(bufsize));
setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &bufsize, sizeof(bufsize));

/* TCP keepalive tuning (per-socket overrides of the system defaults;
 * takes effect only if SO_KEEPALIVE is also enabled on the socket) */
int idle = 60, interval = 10, count = 3;
setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle));
setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &interval, sizeof(interval));
setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &count, sizeof(count));
/* After 60s idle: send probe every 10s, 3 times → declare dead after 30s */

/* Read back effective buffer size */
int actual;
socklen_t alen = sizeof(actual);
getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &actual, &alen);
/* actual may be 2× requested (kernel doubles for overhead) */
RAW SOCKETS — CRAFTING ARBITRARY IP PACKETS
Raw IP Sockets
RAW SOCKETS/* Raw socket — receive ALL IP packets of a given protocol */
/* Requires CAP_NET_RAW or root */
/* NOTE(review): the next three socket() lines are ALTERNATIVES (each
 * redeclares fd) — use exactly one in real code. */
int fd = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP); /* all ICMP */
int fd = socket(AF_INET, SOCK_RAW, IPPROTO_TCP); /* all TCP (also received by TCP stack) */
int fd = socket(AF_INET, SOCK_RAW, IPPROTO_RAW); /* send-only; craft own IP header */

/* Send a custom ICMP echo request */
struct { struct icmphdr hdr; char data[56]; } pkt;
pkt.hdr.type = ICMP_ECHO;
pkt.hdr.code = 0;
pkt.hdr.un.echo.id = htons(getpid());
pkt.hdr.un.echo.sequence = htons(1);
/* Checksum is computed over the whole message with the checksum field
 * zeroed first. checksum() is presumably a local helper implementing the
 * RFC 1071 ones'-complement sum — TODO confirm; also note pkt.data is
 * left uninitialized here. */
pkt.hdr.checksum = 0;
pkt.hdr.checksum = checksum(&pkt, sizeof(pkt));

struct sockaddr_in dst;
dst.sin_family = AF_INET;
inet_aton("8.8.8.8", &dst.sin_addr);
int raw = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP);
sendto(raw, &pkt, sizeof(pkt), 0, (struct sockaddr *)&dst, sizeof(dst));

/* Receive: kernel prepends IP header on recvfrom */
char rbuf[1024];
recv(raw, rbuf, sizeof(rbuf), 0);
struct iphdr *ip = (struct iphdr *)rbuf;
/* ihl counts 32-bit words, so the ICMP header starts at byte ihl * 4 */
struct icmphdr *icmp = (struct icmphdr *)(rbuf + ip->ihl * 4);

/* IP_HDRINCL — you supply your own IP header */
int opt = 1;
setsockopt(raw, IPPROTO_IP, IP_HDRINCL, &opt, sizeof(opt));
/* Now sendto() includes your crafted IP header in the buffer */
AF_PACKET — RAW LAYER 2 SOCKET
AF_PACKET — Capture and Inject Raw Frames
AF_PACKET/* AF_PACKET — access raw Ethernet frames */
/* Foundation of tcpdump, Wireshark, and custom packet generators */
#include <linux/if_packet.h>
#include <net/ethernet.h>
#include <net/if.h>

/* Open raw L2 socket — receives ALL Ethernet frames */
/* (the protocol argument must be in network byte order, hence htons) */
int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));

/* Bind to specific interface */
struct sockaddr_ll sll = {0};
sll.sll_family = AF_PACKET;
sll.sll_ifindex = if_nametoindex("eth0");
sll.sll_protocol = htons(ETH_P_ALL);
bind(fd, (struct sockaddr *)&sll, sizeof(sll));

/* Set promiscuous mode (receive frames not destined for us) */
struct packet_mreq mreq = {0};
mreq.mr_ifindex = sll.sll_ifindex;
mreq.mr_type = PACKET_MR_PROMISC;
setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mreq, sizeof(mreq));

/* Receive raw Ethernet frame */
unsigned char frame[2048];
ssize_t n = recv(fd, frame, sizeof(frame), 0);
struct ethhdr *eth = (struct ethhdr *)frame;
/* eth->h_dest, eth->h_source, eth->h_proto */

/* PACKET_MMAP — zero-copy ring buffer for high-speed capture */
/* Maps a kernel-owned packet ring buffer into the process address space,
 * so frames are read without a per-packet syscall or copy to userspace
 * (it is NOT a direct mapping of NIC DMA memory — the kernel driver still
 * fills the ring). Used by tcpdump/libpcap for fast capture. */
/* Constraint: tp_frame_nr == (tp_block_size / tp_frame_size) * tp_block_nr
 * — here (4096 / 2048) * 64 = 128 */
struct tpacket_req req = {
    .tp_block_size = 4096,
    .tp_block_nr = 64,
    .tp_frame_size = 2048,
    .tp_frame_nr = 128
};
setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
void *ring = mmap(NULL, req.tp_block_size * req.tp_block_nr,
                  PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
/* Poll ring directly — no syscall per packet */
💡 AF_PACKET is how tcpdump/libpcap work. Every packet you've ever captured with Wireshark passed through an AF_PACKET socket. The PACKET_MMAP extension maps a kernel-owned packet ring buffer into userspace, eliminating the per-packet syscall and copy — this is how Wireshark achieves high capture rates. Your DPDK knowledge directly informs why this is still slower than full kernel bypass: the kernel network stack and driver still touch every packet.
Multi-Connection TCP Server with epoll
Objective: Build a fully functional non-blocking TCP echo server using epoll that handles 1000+ simultaneous connections without threads.
Steps: (1) Make every socket non-blocking with fcntl(fd, F_SETFL, flags | O_NONBLOCK) (read the current flags with F_GETFL first). Set TCP_NODELAY and SO_REUSEPORT. Add a per-connection state structure (track bytes received, connection ID). (2) Load-test with `ab -n 100000 -c 1000 http://localhost:8080/` or a custom C client. Measure: connections/second, max concurrent connections, memory per connection. Compare with a fork-per-connection server under the same load. (3) Start 4 server processes bound to the same port via SO_REUSEPORT; verify with `ss -tlnp | grep 8080` that all 4 are bound. Use ab to send 40,000 requests and verify even distribution across processes.

Raw Packet Craft and AF_PACKET Capture
Objective: Write a minimal packet sniffer using AF_PACKET, then craft custom ICMP packets with raw sockets.
Socket Performance Benchmarking
Objective: Measure the impact of socket options on latency and throughput.
M15 MASTERY CHECKLIST
- Know socket() parameters: AF_INET/INET6/UNIX/PACKET, SOCK_STREAM/DGRAM/RAW, protocol values
- Know byte order: network = big-endian; always use htons/htonl for ports/IPs in structs
- Know sockaddr_in fields: sin_family, sin_port (htons), sin_addr.s_addr
- Know TCP server sequence: socket → setsockopt(REUSEADDR) → bind → listen → accept loop
- Know that recv may return less than requested — always loop recv for stream sockets
- Know TCP client sequence: socket → getaddrinfo → connect → send/recv
- Know recv returns 0 on EOF (peer closed), -1 on error (check errno)
- Know UDP differences: sendto/recvfrom, message boundaries preserved, no connect needed
- Know why epoll outperforms select/poll at scale: O(1) per ready event vs O(n) scan
- Know epoll API: epoll_create1, epoll_ctl (ADD/MOD/DEL), epoll_wait
- Know edge-triggered vs level-triggered: ET fires once on state change (must read to EAGAIN), LT fires while data present
- Know SO_REUSEADDR: allow bind in TIME_WAIT — always set on servers
- Know SO_REUSEPORT: multiple processes bind same port, kernel load-balances
- Know TCP_NODELAY: disables Nagle's algorithm, reduces latency for small writes
- Know TCP keepalive options: TCP_KEEPIDLE, TCP_KEEPINTVL, TCP_KEEPCNT
- Know raw sockets require CAP_NET_RAW; receive all packets of specified IP protocol
- Know IP_HDRINCL: allows crafting custom IP header in SOCK_RAW
- Know AF_PACKET: raw Ethernet frame access; foundation of tcpdump/libpcap/Wireshark
- Know PACKET_MMAP: zero-copy ring buffer for high-speed capture in AF_PACKET
- Completed Lab 1: built epoll-based non-blocking server, benchmarked under 1000 concurrent connections
- Completed Lab 2: wrote packet sniffer with AF_PACKET, crafted ICMP with raw sockets
- Completed Lab 3: benchmarked TCP_NODELAY, buffer size, and select/poll/epoll performance
✅ When complete: Move to M16 - eBPF and XDP — the most exciting recent addition to the Linux networking toolkit, enabling programmable packet processing without kernel modifications.