// https://elixir.bootlin.com/linux/v4.15/source/samples/bpf/bpf_load.h#L41
/* parses elf file compiled by llvm .c->.o
 * . parses 'maps' section and creates maps via BPF syscall
 *   (this is the step discussed here)
 * . parses 'license' section and passes it to syscall
 * . parses elf relocations for BPF maps and adjusts BPF_LD_IMM64 insns by
 *   storing map_fd into insn->imm and marking such insns as BPF_PSEUDO_MAP_FD
 * . loads eBPF programs via BPF syscall
 *
 * One ELF file can contain multiple BPF programs which will be loaded
 * and their FDs stored in prog_fd array
 *
 * returns zero on success
 */
int load_bpf_file(char *path);
/* Protocol dispatch routine. It tail-calls next BPF program depending
 * on eth proto. Note, we could have used ...
 *
 *   bpf_tail_call(skb, &jmp_table, proto);
 *
 * ... but it would need large prog_array and cannot be optimised given
 * the map key is not static.
 *
 * skb:   packet context handed on to the tail-called program
 * proto: ethernet protocol value used to select the jmp_table slot
 */
static inline void parse_eth_proto(struct __sk_buff *skb, u32 proto)
{
	switch (proto) {
	case ETH_P_8021Q:
	case ETH_P_8021AD:
		/* both VLAN ethertypes share one parser slot */
		bpf_tail_call(skb, &jmp_table, PARSE_VLAN);
		break;
	case ETH_P_MPLS_UC:
	case ETH_P_MPLS_MC:
		/* unicast and multicast MPLS share one parser slot */
		bpf_tail_call(skb, &jmp_table, PARSE_MPLS);
		break;
	case ETH_P_IP:
		bpf_tail_call(skb, &jmp_table, PARSE_IP);
		break;
	case ETH_P_IPV6:
		bpf_tail_call(skb, &jmp_table, PARSE_IPV6);
		break;
	default:
		/* any other protocol is not dispatched */
		break;
	}
}
constchar* outer_map_name = "outer_map"; structbpf_map* outer_map = bpf_object__find_map_by_name(obj, outer_map_name); int inner_map_fd = bpf_create_map( BPF_MAP_TYPE_HASH, // type sizeof(__u32), // key_size sizeof(__u32), // value_size 8, // max_entries 0); // flag bpf_map__set_inner_map_fd(outer_map, inner_map_fd); bpf_object__load(obj); close(inner_map_fd); // Important
Insert
Insert Into Outer Map
插入到 outer map 步骤如下:
创建一个新的 inner map
将创建的 inner map 的 fd 作为 value 插入到 outer map
关闭 inner map fd
1 2 3 4 5 6 7 8 9 10
int inner_map_fd = bpf_create_map_name( BPF_MAP_TYPE_HASH, // type "hechaol_inner_map", // name sizeof(__u32), // key_size sizeof(__u32), // value_size 8, // max_entries 0); // flag __u32 outer_key = 42; bpf_map_update_elem(outer_map_fd, &outer_key, &inner_map_fd, 0/* flag */); close(inner_map_fd); // Important!
注意:
outer map 的每一项 entry 的 value 是 the id of an inner map,但是调用 bpf_map_update_elem API 时给的参数是 the fd of the inner map
在插入之后你必须关闭 inner map fd 以避免内存泄漏。
Insert Into Inner Map
如前所述,outer map 的每一项 entry 的 value 是 the id of an inner map,而不是 the fd of the inner map。即使我们在调用 bpf_map_update_elem 时传递的参数是 inner map fd,使用 bpf_map_lookup_elem 的时候我们得到的 value 也是 inner map id。为了获得 inner map fd,可以调用 bpf_map_get_fd_by_id。拿到 inner map fd 之后,就可以像之前一样操作 inner map 了。
staticint ifindex = 6; // target network interface to attach, you can find it via `ip a` static __u32 xdp_flags = 0;
// unlink the xdp program and exit staticvoidint_exit(int sig) { printf("stopping\n"); set_link_xdp_fd(ifindex, -1, xdp_flags); exit(0); }
// An XDP program which track packets with IP address // Usage: ./xdp_ip_tracker intmain(int argc, char **argv) { char *filename = "xdp_ip_tracker_kern.o"; // change limits structrlimitr = {RLIM_INFINITY, RLIM_INFINITY}; if (setrlimit(RLIMIT_MEMLOCK, &r)) { perror("setrlimit(RLIMIT_MEMLOCK, RLIM_INFINITY)"); return1; }
// load the kernel bpf object file if (load_bpf_file(filename)) { printf("error - bpf_log_buf: %s", bpf_log_buf); return1; }
// confirm the bpf prog fd is available if (!prog_fd[0]) { printf("load_bpf_file: %s\n", strerror(errno)); return1; }
// add signal handlers signal(SIGINT, int_exit); signal(SIGTERM, int_exit);
// link the xdp program to the network interface if (set_link_xdp_fd(ifindex, prog_fd[0], xdp_flags) < 0) { printf("link set xdp fd failed\n"); return1; }
int result; structpairnext_key, lookup_key = {0, 0}; structstatsvalue = {}; while (1) { sleep(2); // retrieve the bpf map of statistics while (bpf_map_get_next_key(map_fd[0], &lookup_key, &next_key) != -1) { //printf("The local ip of next key in the map is: '%d'\n", next_key.src_ip); //printf("The remote ip of next key in the map is: '%d'\n", next_key.dest_ip); structin_addrlocal = {next_key.src_ip}; structin_addrremote = {next_key.dest_ip}; printf("The local ip of next key in the map is: '%s'\n", inet_ntoa(local)); printf("The remote ip of next key in the map is: '%s'\n", inet_ntoa(remote)); // get the value via the key // TODO: change to assert // assert(bpf_map_lookup_elem(map_fd[0], &next_key, &value) == 0) result = bpf_map_lookup_elem(map_fd[0], &next_key, &value); if (result == 0) { // print the value printf("rx_cnt value read from the map: '%llu'\n", value.rx_cnt); printf("rx_bytes value read from the map: '%llu'\n", value.rx_bytes); } else { printf("Failed to read value from the map: %d (%s)\n", result, strerror(errno)); } lookup_key = next_key; printf("\n\n"); } printf("start a new loop...\n"); // reset the lookup key for a fresh start lookup_key.src_ip = 0; lookup_key.dest_ip = 0; }
printf("end\n"); // unlink the xdp program set_link_xdp_fd(ifindex, -1, xdp_flags); return0; }
err = security_bpf_map_alloc(map); if (err) goto free_map_nouncharge;
err = bpf_map_charge_memlock(map); if (err) goto free_map_sec;
err = bpf_map_alloc_id(map); if (err) goto free_map;
// assign a fd for bpf map err = bpf_map_new_fd(map, f_flags); if (err < 0) { /* failed to allocate fd. * bpf_map_put() is needed because the above * bpf_map_alloc_id() has published the map * to the userspace and the userspace may * have refcnt-ed it through BPF_MAP_GET_FD_BY_ID. */ bpf_map_put(map); return err; }
// https://elixir.bootlin.com/linux/v4.15/source/kernel/bpf/syscall.c#L327 intbpf_map_new_fd(struct bpf_map *map, int flags) { int ret;
ret = security_bpf_map(map, OPEN_FMODE(flags)); if (ret < 0) return ret; /** * anon_inode_getfd - creates a new file instance by hooking it up to an * anonymous inode, and a dentry that describe the "class" * of the file * * @name: [in] name of the "class" of the new file * @fops: [in] file operations for the new file * @priv: [in] private data for the new file (will be file's private_data) * @flags: [in] flags * * Creates a new file by hooking it on a single inode. This is useful for files * that do not need to have a full-fledged inode in order to operate correctly. * All the files created with anon_inode_getfd() will share a single inode, * hence saving memory and avoiding code duplication for the file/inode/dentry * setup. Returns new descriptor or an error code. */ return anon_inode_getfd("bpf-map", &bpf_map_fops, map, flags | O_CLOEXEC); }
# command #1: list all the BPF maps on the current node
# the output shows the map id, map type, map name, key size, value size, maximum number of entries and locked memory
> bpftool map
29: hash  name tracker_map  flags 0x0
	key 8B  value 32B  max_entries 2048  memlock 217088B
# command #2: show the BPF map details, including keys and values in hex format
# the map id can be found in the output of command #1
# the number of elements is reported on the last line of the output
> bpftool map dump id [map id]
key: c0 a8 3a 01 ac 11 00 02  value: 00 00 00 00 00 00 00 00 0a 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 e4 02 00 00 00 00 00 00
key: ac 11 00 01 ac 11 00 02  value: 00 00 00 00 00 00 00 00 07 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 06 02 00 00 00 00 00 00
Found 2 elements