77 files changed, 4265 insertions, 399 deletions
diff --git a/drivers/infiniband/hw/hfi1/file_ops.c b/drivers/infiniband/hw/hfi1/file_ops.c
index 41fafebe3b0d..da4aa1a95b11 100644
--- a/drivers/infiniband/hw/hfi1/file_ops.c
+++ b/drivers/infiniband/hw/hfi1/file_ops.c
@@ -1153,7 +1153,7 @@ static int get_ctxt_info(struct hfi1_filedata *fd, unsigned long arg, u32 len)
 	cinfo.sdma_ring_size = fd->cq->nentries;
 	cinfo.rcvegr_size = uctxt->egrbufs.rcvtid_size;
 
-	trace_hfi1_ctxt_info(uctxt->dd, uctxt->ctxt, fd->subctxt, cinfo);
+	trace_hfi1_ctxt_info(uctxt->dd, uctxt->ctxt, fd->subctxt, &cinfo);
 	if (copy_to_user((void __user *)arg, &cinfo, len))
 		return -EFAULT;
diff --git a/drivers/infiniband/hw/hfi1/trace_ctxts.h b/drivers/infiniband/hw/hfi1/trace_ctxts.h
index 4eb4cc798035..e00c8a7d559c 100644
--- a/drivers/infiniband/hw/hfi1/trace_ctxts.h
+++ b/drivers/infiniband/hw/hfi1/trace_ctxts.h
@@ -106,7 +106,7 @@ TRACE_EVENT(hfi1_uctxtdata,
 TRACE_EVENT(hfi1_ctxt_info,
 	    TP_PROTO(struct hfi1_devdata *dd, unsigned int ctxt,
 		     unsigned int subctxt,
-		     struct hfi1_ctxt_info cinfo),
+		     struct hfi1_ctxt_info *cinfo),
 	    TP_ARGS(dd, ctxt, subctxt, cinfo),
 	    TP_STRUCT__entry(DD_DEV_ENTRY(dd)
			     __field(unsigned int, ctxt)
@@ -120,11 +120,11 @@ TRACE_EVENT(hfi1_ctxt_info,
 	    TP_fast_assign(DD_DEV_ASSIGN(dd);
			   __entry->ctxt = ctxt;
			   __entry->subctxt = subctxt;
-			   __entry->egrtids = cinfo.egrtids;
-			   __entry->rcvhdrq_cnt = cinfo.rcvhdrq_cnt;
-			   __entry->rcvhdrq_size = cinfo.rcvhdrq_entsize;
-			   __entry->sdma_ring_size = cinfo.sdma_ring_size;
-			   __entry->rcvegr_size = cinfo.rcvegr_size;
+			   __entry->egrtids = cinfo->egrtids;
+			   __entry->rcvhdrq_cnt = cinfo->rcvhdrq_cnt;
+			   __entry->rcvhdrq_size = cinfo->rcvhdrq_entsize;
+			   __entry->sdma_ring_size = cinfo->sdma_ring_size;
+			   __entry->rcvegr_size = cinfo->rcvegr_size;
			   ),
 	    TP_printk("[%s] ctxt %u:%u " CINFO_FMT,
		      __get_str(dev),
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c b/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c
index 80d3aa0fc9d3..7e298148ca26 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/cmsg.c
@@ -218,17 +218,17 @@ nfp_bpf_cmsg_communicate(struct nfp_app_bpf *bpf, struct sk_buff *skb,
 		return skb;
 
 	hdr = (struct cmsg_hdr *)skb->data;
-	/* 0 reply_size means caller will do the validation */
-	if (reply_size && skb->len != reply_size) {
-		cmsg_warn(bpf, "cmsg drop - wrong size %d != %d!\n",
-			  skb->len, reply_size);
-		goto err_free;
-	}
 	if (hdr->type != __CMSG_REPLY(type)) {
 		cmsg_warn(bpf, "cmsg drop - wrong type 0x%02x != 0x%02lx!\n",
 			  hdr->type, __CMSG_REPLY(type));
 		goto err_free;
 	}
+	/* 0 reply_size means caller will do the validation */
+	if (reply_size && skb->len != reply_size) {
+		cmsg_warn(bpf, "cmsg drop - type 0x%02x wrong size %d != %d!\n",
+			  type, skb->len, reply_size);
+		goto err_free;
+	}
 
 	return skb;
 err_free:
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/fw.h b/drivers/net/ethernet/netronome/nfp/bpf/fw.h
index cfcc7bcb2c67..39639ac28b01 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/fw.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/fw.h
@@ -41,6 +41,7 @@ enum bpf_cap_tlv_type {
 	NFP_BPF_CAP_TYPE_FUNC		= 1,
 	NFP_BPF_CAP_TYPE_ADJUST_HEAD	= 2,
 	NFP_BPF_CAP_TYPE_MAPS		= 3,
+	NFP_BPF_CAP_TYPE_RANDOM		= 4,
 };
 
 struct nfp_bpf_cap_tlv_func {
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index 56451edf01c2..4b631e26f199 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -103,23 +103,18 @@ nfp_prog_confirm_current_offset(struct nfp_prog *nfp_prog, unsigned int off)
 /* --- Emitters --- */
 static void
 __emit_cmd(struct nfp_prog *nfp_prog, enum cmd_tgt_map op,
-	   u8 mode, u8 xfer, u8 areg, u8 breg, u8 size, bool sync, bool indir)
+	   u8 mode, u8 xfer, u8 areg, u8 breg, u8 size, enum cmd_ctx_swap ctx,
+	   bool indir)
 {
-	enum cmd_ctx_swap ctx;
 	u64 insn;
 
-	if (sync)
-		ctx = CMD_CTX_SWAP;
-	else
-		ctx = CMD_CTX_NO_SWAP;
-
 	insn =	FIELD_PREP(OP_CMD_A_SRC, areg) |
 		FIELD_PREP(OP_CMD_CTX, ctx) |
 		FIELD_PREP(OP_CMD_B_SRC, breg) |
 		FIELD_PREP(OP_CMD_TOKEN, cmd_tgt_act[op].token) |
 		FIELD_PREP(OP_CMD_XFER, xfer) |
 		FIELD_PREP(OP_CMD_CNT, size) |
-		FIELD_PREP(OP_CMD_SIG, sync) |
+		FIELD_PREP(OP_CMD_SIG, ctx != CMD_CTX_NO_SWAP) |
 		FIELD_PREP(OP_CMD_TGT_CMD, cmd_tgt_act[op].tgt_cmd) |
 		FIELD_PREP(OP_CMD_INDIR, indir) |
 		FIELD_PREP(OP_CMD_MODE, mode);
@@ -129,7 +124,7 @@ __emit_cmd(struct nfp_prog *nfp_prog, enum cmd_tgt_map op,
 
 static void
 emit_cmd_any(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, u8 mode, u8 xfer,
-	     swreg lreg, swreg rreg, u8 size, bool sync, bool indir)
+	     swreg lreg, swreg rreg, u8 size, enum cmd_ctx_swap ctx, bool indir)
 {
 	struct nfp_insn_re_regs reg;
 	int err;
@@ -150,22 +145,22 @@ emit_cmd_any(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, u8 mode, u8 xfer,
 		return;
 	}
 
-	__emit_cmd(nfp_prog, op, mode, xfer, reg.areg, reg.breg, size, sync,
+	__emit_cmd(nfp_prog, op, mode, xfer, reg.areg, reg.breg, size, ctx,
 		   indir);
 }
 
 static void
 emit_cmd(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, u8 mode, u8 xfer,
-	 swreg lreg, swreg rreg, u8 size, bool sync)
+	 swreg lreg, swreg rreg, u8 size, enum cmd_ctx_swap ctx)
 {
-	emit_cmd_any(nfp_prog, op, mode, xfer, lreg, rreg, size, sync, false);
+	emit_cmd_any(nfp_prog, op, mode, xfer, lreg, rreg, size, ctx, false);
 }
 
 static void
 emit_cmd_indir(struct nfp_prog *nfp_prog, enum cmd_tgt_map op, u8 mode, u8 xfer,
-	       swreg lreg, swreg rreg, u8 size, bool sync)
+	       swreg lreg, swreg rreg, u8 size, enum cmd_ctx_swap ctx)
 {
-	emit_cmd_any(nfp_prog, op, mode, xfer, lreg, rreg, size, sync, true);
+	emit_cmd_any(nfp_prog, op, mode, xfer, lreg, rreg, size, ctx, true);
 }
 
 static void
@@ -410,7 +405,7 @@ __emit_lcsr(struct nfp_prog *nfp_prog, u16 areg, u16 breg, bool wr, u16 addr,
 		FIELD_PREP(OP_LCSR_A_SRC, areg) |
 		FIELD_PREP(OP_LCSR_B_SRC, breg) |
 		FIELD_PREP(OP_LCSR_WRITE, wr) |
-		FIELD_PREP(OP_LCSR_ADDR, addr) |
+		FIELD_PREP(OP_LCSR_ADDR, addr / 4) |
 		FIELD_PREP(OP_LCSR_SRC_LMEXTN, src_lmextn) |
 		FIELD_PREP(OP_LCSR_DST_LMEXTN, dst_lmextn);
 
@@ -438,10 +433,16 @@ static void emit_csr_wr(struct nfp_prog *nfp_prog, swreg src, u16 addr)
 		return;
 	}
 
-	__emit_lcsr(nfp_prog, reg.areg, reg.breg, true, addr / 4,
+	__emit_lcsr(nfp_prog, reg.areg, reg.breg, true, addr,
 		    false, reg.src_lmextn);
 }
 
+/* CSR value is read in following immed[gpr, 0] */
+static void __emit_csr_rd(struct nfp_prog *nfp_prog, u16 addr)
+{
+	__emit_lcsr(nfp_prog, 0, 0, false, addr, false, false);
+}
+
 static void emit_nop(struct nfp_prog *nfp_prog)
 {
 	__emit_immed(nfp_prog, UR_REG_IMM, UR_REG_IMM, 0, 0, 0, 0, 0, 0, 0);
@@ -553,6 +554,19 @@ wrp_reg_subpart(struct nfp_prog *nfp_prog, swreg dst, swreg src, u8 field_len,
 	emit_ld_field_any(nfp_prog, dst, mask, src, sc, offset * 8, true);
 }
 
+/* wrp_reg_or_subpart() - load @field_len bytes from low end of @src, or the
+ * result to @dst from offset, there is no change on the other bits of @dst.
+ */
+static void
+wrp_reg_or_subpart(struct nfp_prog *nfp_prog, swreg dst, swreg src,
+		   u8 field_len, u8 offset)
+{
+	enum shf_sc sc = offset ? SHF_SC_L_SHF : SHF_SC_NONE;
+	u8 mask = ((1 << field_len) - 1) << offset;
+
+	emit_ld_field(nfp_prog, dst, mask, src, sc, 32 - offset * 8);
+}
+
 static void
 addr40_offset(struct nfp_prog *nfp_prog, u8 src_gpr, swreg offset,
	      swreg *rega, swreg *regb)
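A worked example of the field arithmetic above (editorial illustration, not part of the patch): for wrp_reg_or_subpart() with field_len = 2 and offset = 2,

	mask  = ((1 << 2) - 1) << 2 = 0xc;	/* selects bytes 2-3 of @dst */
	sc    = SHF_SC_L_SHF;			/* offset != 0 */
	shift = 32 - 2 * 8 = 16;		/* lifts @src's low bytes up to byte 2 */

so the two low bytes of @src are OR-merged into bytes 2-3 of @dst while bytes 0-1 keep their old value.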
@@ -597,7 +611,7 @@ static int nfp_cpp_memcpy(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 	/* Memory read from source addr into transfer-in registers. */
 	emit_cmd_any(nfp_prog, CMD_TGT_READ32_SWAP,
		     src_40bit_addr ? CMD_MODE_40b_BA : CMD_MODE_32b, 0,
-		     src_base, off, xfer_num - 1, true, len > 32);
+		     src_base, off, xfer_num - 1, CMD_CTX_SWAP, len > 32);
 
 	/* Move from transfer-in to transfer-out. */
 	for (i = 0; i < xfer_num; i++)
@@ -609,39 +623,39 @@ static int nfp_cpp_memcpy(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 		/* Use single direct_ref write8. */
 		emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0,
			 reg_a(meta->paired_st->dst_reg * 2), off, len - 1,
-			 true);
+			 CMD_CTX_SWAP);
 	} else if (len <= 32 && IS_ALIGNED(len, 4)) {
 		/* Use single direct_ref write32. */
 		emit_cmd(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0,
			 reg_a(meta->paired_st->dst_reg * 2), off, xfer_num - 1,
-			 true);
+			 CMD_CTX_SWAP);
 	} else if (len <= 32) {
 		/* Use single indirect_ref write8. */
 		wrp_immed(nfp_prog, reg_none(),
			  CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, len - 1));
 		emit_cmd_indir(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0,
			       reg_a(meta->paired_st->dst_reg * 2), off,
-			       len - 1, true);
+			       len - 1, CMD_CTX_SWAP);
 	} else if (IS_ALIGNED(len, 4)) {
 		/* Use single indirect_ref write32. */
 		wrp_immed(nfp_prog, reg_none(),
			  CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, xfer_num - 1));
 		emit_cmd_indir(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0,
			       reg_a(meta->paired_st->dst_reg * 2), off,
-			       xfer_num - 1, true);
+			       xfer_num - 1, CMD_CTX_SWAP);
 	} else if (len <= 40) {
 		/* Use one direct_ref write32 to write the first 32-bytes, then
		 * another direct_ref write8 to write the remaining bytes.
		 */
 		emit_cmd(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0,
			 reg_a(meta->paired_st->dst_reg * 2), off, 7,
-			 true);
+			 CMD_CTX_SWAP);
 
 		off = re_load_imm_any(nfp_prog, meta->paired_st->off + 32,
				      imm_b(nfp_prog));
 		emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 8,
			 reg_a(meta->paired_st->dst_reg * 2), off, len - 33,
-			 true);
+			 CMD_CTX_SWAP);
 	} else {
 		/* Use one indirect_ref write32 to write 4-bytes aligned length,
		 * then another direct_ref write8 to write the remaining bytes.
@@ -652,12 +666,12 @@ static int nfp_cpp_memcpy(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
			  CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, xfer_num - 2));
 		emit_cmd_indir(nfp_prog, CMD_TGT_WRITE32_SWAP, CMD_MODE_32b, 0,
			       reg_a(meta->paired_st->dst_reg * 2), off,
-			       xfer_num - 2, true);
+			       xfer_num - 2, CMD_CTX_SWAP);
 		new_off = meta->paired_st->off + (xfer_num - 1) * 4;
 		off = re_load_imm_any(nfp_prog, new_off, imm_b(nfp_prog));
 		emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b,
			 xfer_num - 1, reg_a(meta->paired_st->dst_reg * 2), off,
-			 (len & 0x3) - 1, true);
+			 (len & 0x3) - 1, CMD_CTX_SWAP);
 	}
 
 	/* TODO: The following extra load is to make sure data flow be identical
@@ -718,7 +732,7 @@ data_ld(struct nfp_prog *nfp_prog, swreg offset, u8 dst_gpr, int size)
 	shift = size < 4 ? 4 - size : 0;
 
 	emit_cmd(nfp_prog, CMD_TGT_READ8, CMD_MODE_32b, 0,
-		 pptr_reg(nfp_prog), offset, sz - 1, true);
+		 pptr_reg(nfp_prog), offset, sz - 1, CMD_CTX_SWAP);
 
 	i = 0;
 	if (shift)
@@ -748,7 +762,7 @@ data_ld_host_order(struct nfp_prog *nfp_prog, u8 dst_gpr,
 	mask = size < 4 ? GENMASK(size - 1, 0) : 0;
 
 	emit_cmd(nfp_prog, CMD_TGT_READ32_SWAP, mode, 0,
-		 lreg, rreg, sz / 4 - 1, true);
+		 lreg, rreg, sz / 4 - 1, CMD_CTX_SWAP);
 
 	i = 0;
 	if (mask)
@@ -828,7 +842,7 @@ data_stx_host_order(struct nfp_prog *nfp_prog, u8 dst_gpr, swreg offset,
 		wrp_mov(nfp_prog, reg_xfer(i), reg_a(src_gpr + i));
 
 	emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0,
-		 reg_a(dst_gpr), offset, size - 1, true);
+		 reg_a(dst_gpr), offset, size - 1, CMD_CTX_SWAP);
 
 	return 0;
 }
@@ -842,7 +856,7 @@ data_st_host_order(struct nfp_prog *nfp_prog, u8 dst_gpr, swreg offset,
 		wrp_immed(nfp_prog, reg_xfer(1), imm >> 32);
 
 	emit_cmd(nfp_prog, CMD_TGT_WRITE8_SWAP, CMD_MODE_32b, 0,
-		 reg_a(dst_gpr), offset, size - 1, true);
+		 reg_a(dst_gpr), offset, size - 1, CMD_CTX_SWAP);
 
 	return 0;
 }
@@ -1339,7 +1353,7 @@ static int adjust_head(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 }
 
 static int
-map_lookup_stack(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+map_call_stack_common(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
 	struct bpf_offloaded_map *offmap;
 	struct nfp_bpf_map *nfp_map;
@@ -1353,19 +1367,21 @@ map_lookup_stack(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 
 	/* We only have to reload LM0 if the key is not at start of stack */
 	lm_off = nfp_prog->stack_depth;
-	lm_off += meta->arg2.var_off.value + meta->arg2.off;
-	load_lm_ptr = meta->arg2_var_off || lm_off;
+	lm_off += meta->arg2.reg.var_off.value + meta->arg2.reg.off;
+	load_lm_ptr = meta->arg2.var_off || lm_off;
 
 	/* Set LM0 to start of key */
 	if (load_lm_ptr)
 		emit_csr_wr(nfp_prog, reg_b(2 * 2), NFP_CSR_ACT_LM_ADDR0);
+	if (meta->func_id == BPF_FUNC_map_update_elem)
+		emit_csr_wr(nfp_prog, reg_b(3 * 2), NFP_CSR_ACT_LM_ADDR2);
 
 	/* Load map ID into a register, it should actually fit as an immediate
	 * but in case it doesn't deal with it here, not in the delay slots.
	 */
 	tid = ur_load_imm_any(nfp_prog, nfp_map->tid, imm_a(nfp_prog));
 
-	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO + BPF_FUNC_map_lookup_elem,
+	emit_br_relo(nfp_prog, BR_UNC, BR_OFF_RELO + meta->func_id,
		     2, RELO_BR_HELPER);
 	ret_tgt = nfp_prog_current_offset(nfp_prog) + 2;
 
@@ -1388,6 +1404,18 @@ map_lookup_stack(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 	return 0;
 }
 
+static int
+nfp_get_prandom_u32(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+{
+	__emit_csr_rd(nfp_prog, NFP_CSR_PSEUDO_RND_NUM);
+	/* CSR value is read in following immed[gpr, 0] */
+	emit_immed(nfp_prog, reg_both(0), 0,
+		   IMMED_WIDTH_ALL, false, IMMED_SHIFT_0B);
+	emit_immed(nfp_prog, reg_both(1), 0,
+		   IMMED_WIDTH_ALL, false, IMMED_SHIFT_0B);
+	return 0;
+}
+
 /* --- Callbacks --- */
 static int mov_reg64(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
@@ -1838,6 +1866,128 @@ mem_ldx_emem(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
			    tmp_reg, meta->insn.dst_reg * 2, size);
 }
 
+static void
+mem_ldx_data_init_pktcache(struct nfp_prog *nfp_prog,
+			   struct nfp_insn_meta *meta)
+{
+	s16 range_start = meta->pkt_cache.range_start;
+	s16 range_end = meta->pkt_cache.range_end;
+	swreg src_base, off;
+	u8 xfer_num, len;
+	bool indir;
+
+	off = re_load_imm_any(nfp_prog, range_start, imm_b(nfp_prog));
+	src_base = reg_a(meta->insn.src_reg * 2);
+	len = range_end - range_start;
+	xfer_num = round_up(len, REG_WIDTH) / REG_WIDTH;
+
+	indir = len > 8 * REG_WIDTH;
+	/* Setup PREV_ALU for indirect mode. */
+	if (indir)
+		wrp_immed(nfp_prog, reg_none(),
+			  CMD_OVE_LEN | FIELD_PREP(CMD_OV_LEN, xfer_num - 1));
+
+	/* Cache memory into transfer-in registers. */
+	emit_cmd_any(nfp_prog, CMD_TGT_READ32_SWAP, CMD_MODE_32b, 0, src_base,
+		     off, xfer_num - 1, CMD_CTX_SWAP, indir);
+}
+
+static int
+mem_ldx_data_from_pktcache_unaligned(struct nfp_prog *nfp_prog,
+				     struct nfp_insn_meta *meta,
+				     unsigned int size)
+{
+	s16 range_start = meta->pkt_cache.range_start;
+	s16 insn_off = meta->insn.off - range_start;
+	swreg dst_lo, dst_hi, src_lo, src_mid;
+	u8 dst_gpr = meta->insn.dst_reg * 2;
+	u8 len_lo = size, len_mid = 0;
+	u8 idx = insn_off / REG_WIDTH;
+	u8 off = insn_off % REG_WIDTH;
+
+	dst_hi = reg_both(dst_gpr + 1);
+	dst_lo = reg_both(dst_gpr);
+	src_lo = reg_xfer(idx);
+
+	/* The read length could involve as many as three registers. */
+	if (size > REG_WIDTH - off) {
+		/* Calculate the part in the second register. */
+		len_lo = REG_WIDTH - off;
+		len_mid = size - len_lo;
+
+		/* Calculate the part in the third register. */
+		if (size > 2 * REG_WIDTH - off)
+			len_mid = REG_WIDTH;
+	}
+
+	wrp_reg_subpart(nfp_prog, dst_lo, src_lo, len_lo, off);
+
+	if (!len_mid) {
+		wrp_immed(nfp_prog, dst_hi, 0);
+		return 0;
+	}
+
+	src_mid = reg_xfer(idx + 1);
+
+	if (size <= REG_WIDTH) {
+		wrp_reg_or_subpart(nfp_prog, dst_lo, src_mid, len_mid, len_lo);
+		wrp_immed(nfp_prog, dst_hi, 0);
+	} else {
+		swreg src_hi = reg_xfer(idx + 2);
+
+		wrp_reg_or_subpart(nfp_prog, dst_lo, src_mid,
+				   REG_WIDTH - len_lo, len_lo);
+		wrp_reg_subpart(nfp_prog, dst_hi, src_mid, len_lo,
+				REG_WIDTH - len_lo);
+		wrp_reg_or_subpart(nfp_prog, dst_hi, src_hi, REG_WIDTH - len_lo,
+				   len_lo);
+	}
+
+	return 0;
+}
+
+static int
+mem_ldx_data_from_pktcache_aligned(struct nfp_prog *nfp_prog,
+				   struct nfp_insn_meta *meta,
+				   unsigned int size)
+{
+	swreg dst_lo, dst_hi, src_lo;
+	u8 dst_gpr, idx;
+
+	idx = (meta->insn.off - meta->pkt_cache.range_start) / REG_WIDTH;
+	dst_gpr = meta->insn.dst_reg * 2;
+	dst_hi = reg_both(dst_gpr + 1);
+	dst_lo = reg_both(dst_gpr);
+	src_lo = reg_xfer(idx);
+
+	if (size < REG_WIDTH) {
+		wrp_reg_subpart(nfp_prog, dst_lo, src_lo, size, 0);
+		wrp_immed(nfp_prog, dst_hi, 0);
+	} else if (size == REG_WIDTH) {
+		wrp_mov(nfp_prog, dst_lo, src_lo);
+		wrp_immed(nfp_prog, dst_hi, 0);
+	} else {
+		swreg src_hi = reg_xfer(idx + 1);
+
+		wrp_mov(nfp_prog, dst_lo, src_lo);
+		wrp_mov(nfp_prog, dst_hi, src_hi);
+	}
+
+	return 0;
+}
+
+static int
+mem_ldx_data_from_pktcache(struct nfp_prog *nfp_prog,
+			   struct nfp_insn_meta *meta, unsigned int size)
+{
+	u8 off = meta->insn.off - meta->pkt_cache.range_start;
+
+	if (IS_ALIGNED(off, REG_WIDTH))
+		return mem_ldx_data_from_pktcache_aligned(nfp_prog, meta, size);
+
+	return mem_ldx_data_from_pktcache_unaligned(nfp_prog, meta, size);
+}
+
 static int
 mem_ldx(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
	unsigned int size)
@@ -1852,8 +2002,16 @@ mem_ldx(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
 		return mem_ldx_skb(nfp_prog, meta, size);
 	}
 
-	if (meta->ptr.type == PTR_TO_PACKET)
-		return mem_ldx_data(nfp_prog, meta, size);
+	if (meta->ptr.type == PTR_TO_PACKET) {
+		if (meta->pkt_cache.range_end) {
+			if (meta->pkt_cache.do_init)
+				mem_ldx_data_init_pktcache(nfp_prog, meta);
+
+			return mem_ldx_data_from_pktcache(nfp_prog, meta, size);
+		} else {
+			return mem_ldx_data(nfp_prog, meta, size);
+		}
+	}
 
 	if (meta->ptr.type == PTR_TO_STACK)
 		return mem_ldx_stack(nfp_prog, meta, size,
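To see the program shape the packet-read cache targets, here is a minimal sketch in restricted C for eBPF (editorial; the headers, section name and libbpf macros are assumptions, not part of the patch):

#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

SEC("xdp")
int parse_eth(struct xdp_md *ctx)
{
	void *data = (void *)(long)ctx->data;
	void *data_end = (void *)(long)ctx->data_end;
	struct ethhdr *eth = data;

	if (data + sizeof(*eth) > data_end)
		return XDP_DROP;
	/* These narrow loads all fall within one 64-byte range off the
	 * same packet pointer; the pass added later in this patch groups
	 * them so only the first load issues a read into the transfer-in
	 * registers and the rest hit the cached copy. */
	if (eth->h_proto == bpf_htons(ETH_P_IP) && (eth->h_dest[0] & 1))
		return XDP_PASS;
	return XDP_DROP;
}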
@@ -1982,6 +2140,111 @@ static int mem_stx8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 	return mem_stx(nfp_prog, meta, 8);
 }
 
+static int
+mem_xadd(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta, bool is64)
+{
+	u8 dst_gpr = meta->insn.dst_reg * 2;
+	u8 src_gpr = meta->insn.src_reg * 2;
+	unsigned int full_add, out;
+	swreg addra, addrb, off;
+
+	off = ur_load_imm_any(nfp_prog, meta->insn.off, imm_b(nfp_prog));
+
+	/* We can fit 16 bits into command immediate, if we know the immediate
+	 * is guaranteed to either always or never fit into 16 bit we only
+	 * generate code to handle that particular case, otherwise generate
+	 * code for both.
+	 */
+	out = nfp_prog_current_offset(nfp_prog);
+	full_add = nfp_prog_current_offset(nfp_prog);
+
+	if (meta->insn.off) {
+		out += 2;
+		full_add += 2;
+	}
+	if (meta->xadd_maybe_16bit) {
+		out += 3;
+		full_add += 3;
+	}
+	if (meta->xadd_over_16bit)
+		out += 2 + is64;
+	if (meta->xadd_maybe_16bit && meta->xadd_over_16bit) {
+		out += 5;
+		full_add += 5;
+	}
+
+	/* Generate the branch for choosing add_imm vs add */
+	if (meta->xadd_maybe_16bit && meta->xadd_over_16bit) {
+		swreg max_imm = imm_a(nfp_prog);
+
+		wrp_immed(nfp_prog, max_imm, 0xffff);
+		emit_alu(nfp_prog, reg_none(),
+			 max_imm, ALU_OP_SUB, reg_b(src_gpr));
+		emit_alu(nfp_prog, reg_none(),
+			 reg_imm(0), ALU_OP_SUB_C, reg_b(src_gpr + 1));
+		emit_br(nfp_prog, BR_BLO, full_add, meta->insn.off ? 2 : 0);
+		/* defer for add */
+	}
+
+	/* If insn has an offset add to the address */
+	if (!meta->insn.off) {
+		addra = reg_a(dst_gpr);
+		addrb = reg_b(dst_gpr + 1);
+	} else {
+		emit_alu(nfp_prog, imma_a(nfp_prog),
+			 reg_a(dst_gpr), ALU_OP_ADD, off);
+		emit_alu(nfp_prog, imma_b(nfp_prog),
+			 reg_a(dst_gpr + 1), ALU_OP_ADD_C, reg_imm(0));
+		addra = imma_a(nfp_prog);
+		addrb = imma_b(nfp_prog);
+	}
+
+	/* Generate the add_imm if 16 bits are possible */
+	if (meta->xadd_maybe_16bit) {
+		swreg prev_alu = imm_a(nfp_prog);
+
+		wrp_immed(nfp_prog, prev_alu,
+			  FIELD_PREP(CMD_OVE_DATA, 2) |
+			  CMD_OVE_LEN |
+			  FIELD_PREP(CMD_OV_LEN, 0x8 | is64 << 2));
+		wrp_reg_or_subpart(nfp_prog, prev_alu, reg_b(src_gpr), 2, 2);
+		emit_cmd_indir(nfp_prog, CMD_TGT_ADD_IMM, CMD_MODE_40b_BA, 0,
+			       addra, addrb, 0, CMD_CTX_NO_SWAP);
+
+		if (meta->xadd_over_16bit)
+			emit_br(nfp_prog, BR_UNC, out, 0);
+	}
+
+	if (!nfp_prog_confirm_current_offset(nfp_prog, full_add))
+		return -EINVAL;
+
+	/* Generate the add if 16 bits are not guaranteed */
+	if (meta->xadd_over_16bit) {
+		emit_cmd(nfp_prog, CMD_TGT_ADD, CMD_MODE_40b_BA, 0,
+			 addra, addrb, is64 << 2,
+			 is64 ? CMD_CTX_SWAP_DEFER2 : CMD_CTX_SWAP_DEFER1);
+
+		wrp_mov(nfp_prog, reg_xfer(0), reg_a(src_gpr));
+		if (is64)
+			wrp_mov(nfp_prog, reg_xfer(1), reg_a(src_gpr + 1));
+	}
+
+	if (!nfp_prog_confirm_current_offset(nfp_prog, out))
+		return -EINVAL;
+
+	return 0;
+}
+
+static int mem_xadd4(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+{
+	return mem_xadd(nfp_prog, meta, false);
+}
+
+static int mem_xadd8(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
+{
+	return mem_xadd(nfp_prog, meta, true);
+}
+
 static int jump(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 {
 	emit_br(nfp_prog, BR_UNC, meta->insn.off, 0);
@@ -2183,7 +2446,11 @@ static int call(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta)
 	case BPF_FUNC_xdp_adjust_head:
 		return adjust_head(nfp_prog, meta);
 	case BPF_FUNC_map_lookup_elem:
-		return map_lookup_stack(nfp_prog, meta);
+	case BPF_FUNC_map_update_elem:
+	case BPF_FUNC_map_delete_elem:
+		return map_call_stack_common(nfp_prog, meta);
+	case BPF_FUNC_get_prandom_u32:
+		return nfp_get_prandom_u32(nfp_prog, meta);
 	default:
 		WARN_ONCE(1, "verifier allowed unsupported function\n");
 		return -EOPNOTSUPP;
@@ -2243,6 +2510,8 @@ static const instr_cb_t instr_cb[256] = {
 	[BPF_STX | BPF_MEM | BPF_H] =	mem_stx2,
 	[BPF_STX | BPF_MEM | BPF_W] =	mem_stx4,
 	[BPF_STX | BPF_MEM | BPF_DW] =	mem_stx8,
+	[BPF_STX | BPF_XADD | BPF_W] =	mem_xadd4,
+	[BPF_STX | BPF_XADD | BPF_DW] =	mem_xadd8,
 	[BPF_ST | BPF_MEM | BPF_B] =	mem_st1,
 	[BPF_ST | BPF_MEM | BPF_H] =	mem_st2,
 	[BPF_ST | BPF_MEM | BPF_W] =	mem_st4,
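The new mem_xadd4()/mem_xadd8() callbacks are what BPF_STX | BPF_XADD instructions land in. A sketch of the source-level construct that compiles to them (restricted C for eBPF; the map definition style and names are assumptions, not part of the patch):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct bpf_map_def SEC("maps") stats = {
	.type = BPF_MAP_TYPE_ARRAY,
	.key_size = sizeof(__u32),
	.value_size = sizeof(__u64),
	.max_entries = 1,
};

SEC("xdp")
int count(struct xdp_md *ctx)
{
	__u32 key = 0;
	__u64 *cnt = bpf_map_lookup_elem(&stats, &key);

	if (cnt)
		/* LLVM emits BPF_STX | BPF_XADD | BPF_DW -> mem_xadd8() */
		__sync_fetch_and_add(cnt, 1);
	return XDP_PASS;
}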
@@ -2821,6 +3090,120 @@ static void nfp_bpf_opt_ldst_gather(struct nfp_prog *nfp_prog)
 	}
 }
 
+static void nfp_bpf_opt_pkt_cache(struct nfp_prog *nfp_prog)
+{
+	struct nfp_insn_meta *meta, *range_node = NULL;
+	s16 range_start = 0, range_end = 0;
+	bool cache_avail = false;
+	struct bpf_insn *insn;
+	s32 range_ptr_off = 0;
+	u32 range_ptr_id = 0;
+
+	list_for_each_entry(meta, &nfp_prog->insns, l) {
+		if (meta->flags & FLAG_INSN_IS_JUMP_DST)
+			cache_avail = false;
+
+		if (meta->skip)
+			continue;
+
+		insn = &meta->insn;
+
+		if (is_mbpf_store_pkt(meta) ||
+		    insn->code == (BPF_JMP | BPF_CALL) ||
+		    is_mbpf_classic_store_pkt(meta) ||
+		    is_mbpf_classic_load(meta)) {
+			cache_avail = false;
+			continue;
+		}
+
+		if (!is_mbpf_load(meta))
+			continue;
+
+		if (meta->ptr.type != PTR_TO_PACKET || meta->ldst_gather_len) {
+			cache_avail = false;
+			continue;
+		}
+
+		if (!cache_avail) {
+			cache_avail = true;
+			if (range_node)
+				goto end_current_then_start_new;
+			goto start_new;
+		}
+
+		/* Check ID to make sure two reads share the same
+		 * variable offset against PTR_TO_PACKET, and check OFF
+		 * to make sure they also share the same constant
+		 * offset.
+		 *
+		 * OFFs don't really need to be the same, because they
+		 * are the constant offsets against PTR_TO_PACKET, so
+		 * for different OFFs, we could canonicalize them to
+		 * offsets against original packet pointer. We don't
+		 * support this.
+		 */
+		if (meta->ptr.id == range_ptr_id &&
+		    meta->ptr.off == range_ptr_off) {
+			s16 new_start = range_start;
+			s16 end, off = insn->off;
+			s16 new_end = range_end;
+			bool changed = false;
+
+			if (off < range_start) {
+				new_start = off;
+				changed = true;
+			}
+
+			end = off + BPF_LDST_BYTES(insn);
+			if (end > range_end) {
+				new_end = end;
+				changed = true;
+			}
+
+			if (!changed)
+				continue;
+
+			if (new_end - new_start <= 64) {
+				/* Install new range. */
+				range_start = new_start;
+				range_end = new_end;
+				continue;
+			}
+		}
+
+end_current_then_start_new:
+		range_node->pkt_cache.range_start = range_start;
+		range_node->pkt_cache.range_end = range_end;
+start_new:
+		range_node = meta;
+		range_node->pkt_cache.do_init = true;
+		range_ptr_id = range_node->ptr.id;
+		range_ptr_off = range_node->ptr.off;
+		range_start = insn->off;
+		range_end = insn->off + BPF_LDST_BYTES(insn);
+	}
+
+	if (range_node) {
+		range_node->pkt_cache.range_start = range_start;
+		range_node->pkt_cache.range_end = range_end;
+	}
+
+	list_for_each_entry(meta, &nfp_prog->insns, l) {
+		if (meta->skip)
+			continue;
+
+		if (is_mbpf_load_pkt(meta) && !meta->ldst_gather_len) {
+			if (meta->pkt_cache.do_init) {
+				range_start = meta->pkt_cache.range_start;
+				range_end = meta->pkt_cache.range_end;
+			} else {
+				meta->pkt_cache.range_start = range_start;
+				meta->pkt_cache.range_end = range_end;
+			}
+		}
+	}
+}
+
 static int nfp_bpf_optimize(struct nfp_prog *nfp_prog)
 {
 	nfp_bpf_opt_reg_init(nfp_prog);
@@ -2828,6 +3211,7 @@ static int nfp_bpf_optimize(struct nfp_prog *nfp_prog)
 	nfp_bpf_opt_ld_mask(nfp_prog);
 	nfp_bpf_opt_ld_shift(nfp_prog);
 	nfp_bpf_opt_ldst_gather(nfp_prog);
+	nfp_bpf_opt_pkt_cache(nfp_prog);
 
 	return 0;
 }
@@ -2952,6 +3336,12 @@ void *nfp_bpf_relo_for_vnic(struct nfp_prog *nfp_prog, struct nfp_bpf_vnic *bv)
 		case BPF_FUNC_map_lookup_elem:
 			val = nfp_prog->bpf->helpers.map_lookup;
 			break;
+		case BPF_FUNC_map_update_elem:
+			val = nfp_prog->bpf->helpers.map_update;
+			break;
+		case BPF_FUNC_map_delete_elem:
+			val = nfp_prog->bpf->helpers.map_delete;
+			break;
 		default:
 			pr_err("relocation of unknown helper %d\n", val);
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.c b/drivers/net/ethernet/netronome/nfp/bpf/main.c
index 752c45763ed9..1dc424685f4e 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.c
@@ -284,6 +284,12 @@ nfp_bpf_parse_cap_func(struct nfp_app_bpf *bpf, void __iomem *value, u32 length)
 	case BPF_FUNC_map_lookup_elem:
 		bpf->helpers.map_lookup = readl(&cap->func_addr);
 		break;
+	case BPF_FUNC_map_update_elem:
+		bpf->helpers.map_update = readl(&cap->func_addr);
+		break;
+	case BPF_FUNC_map_delete_elem:
+		bpf->helpers.map_delete = readl(&cap->func_addr);
+		break;
 	}
 
 	return 0;
@@ -309,6 +315,14 @@ nfp_bpf_parse_cap_maps(struct nfp_app_bpf *bpf, void __iomem *value, u32 length)
 	return 0;
 }
 
+static int
+nfp_bpf_parse_cap_random(struct nfp_app_bpf *bpf, void __iomem *value,
+			 u32 length)
+{
+	bpf->pseudo_random = true;
+	return 0;
+}
+
 static int nfp_bpf_parse_capabilities(struct nfp_app *app)
 {
 	struct nfp_cpp *cpp = app->pf->cpp;
@@ -347,6 +361,10 @@ static int nfp_bpf_parse_capabilities(struct nfp_app *app)
 			if (nfp_bpf_parse_cap_maps(app->priv, value, length))
 				goto err_release_free;
 			break;
+		case NFP_BPF_CAP_TYPE_RANDOM:
+			if (nfp_bpf_parse_cap_random(app->priv, value, length))
+				goto err_release_free;
+			break;
 		default:
 			nfp_dbg(cpp, "unknown BPF capability: %d\n", type);
 			break;
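With NFP_BPF_CAP_TYPE_RANDOM parsed above, bpf_get_prandom_u32() becomes offloadable. A minimal sketch of a consumer of the helper (restricted C for eBPF; the section name and sampling ratio are illustrative, not from the patch):

SEC("xdp")
int sample_packets(struct xdp_md *ctx)
{
	/* JITed via nfp_get_prandom_u32(), which reads the FW-seeded
	 * pseudo-random CSR; this keeps roughly one packet in four. */
	if (bpf_get_prandom_u32() & 0x3)
		return XDP_DROP;
	return XDP_PASS;
}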
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/main.h b/drivers/net/ethernet/netronome/nfp/bpf/main.h
index 054df3dc0698..4981c8944ca3 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/main.h
+++ b/drivers/net/ethernet/netronome/nfp/bpf/main.h
@@ -72,6 +72,7 @@ enum nfp_relo_type {
 #define BR_OFF_RELO		15000
 
 enum static_regs {
+	STATIC_REG_IMMA		= 20, /* Bank AB */
 	STATIC_REG_IMM		= 21, /* Bank AB */
 	STATIC_REG_STACK	= 22, /* Bank A */
 	STATIC_REG_PKT_LEN	= 22, /* Bank B */
@@ -91,6 +92,8 @@ enum pkt_vec {
 #define pptr_reg(np)	pv_ctm_ptr(np)
 #define imm_a(np)	reg_a(STATIC_REG_IMM)
 #define imm_b(np)	reg_b(STATIC_REG_IMM)
+#define imma_a(np)	reg_a(STATIC_REG_IMMA)
+#define imma_b(np)	reg_b(STATIC_REG_IMMA)
 #define imm_both(np)	reg_both(STATIC_REG_IMM)
 
 #define NFP_BPF_ABI_FLAGS	reg_imm(0)
@@ -128,6 +131,10 @@ enum pkt_vec {
 *
 * @helpers:		helper addressess for various calls
 * @helpers.map_lookup:	map lookup helper address
+ * @helpers.map_update:	map update helper address
+ * @helpers.map_delete:	map delete helper address
+ *
+ * @pseudo_random:	FW initialized the pseudo-random machinery (CSRs)
 */
 struct nfp_app_bpf {
 	struct nfp_app *app;
@@ -162,7 +169,18 @@ struct nfp_app_bpf {
 
 	struct {
 		u32 map_lookup;
+		u32 map_update;
+		u32 map_delete;
 	} helpers;
+
+	bool pseudo_random;
+};
+
+enum nfp_bpf_map_use {
+	NFP_MAP_UNUSED = 0,
+	NFP_MAP_USE_READ,
+	NFP_MAP_USE_WRITE,
+	NFP_MAP_USE_ATOMIC_CNT,
 };
 
 /**
@@ -171,12 +189,14 @@ struct nfp_app_bpf {
 * @bpf:	back pointer to bpf app private structure
 * @tid:	table id identifying map on datapath
 * @l:		link on the nfp_app_bpf->map_list list
+ * @use_map:	map of how the value is used (in 4B chunks)
 */
 struct nfp_bpf_map {
 	struct bpf_offloaded_map *offmap;
 	struct nfp_app_bpf *bpf;
 	u32 tid;
 	struct list_head l;
+	enum nfp_bpf_map_use use_map[];
 };
 
 struct nfp_prog;
@@ -190,6 +210,16 @@ typedef int (*instr_cb_t)(struct nfp_prog *, struct nfp_insn_meta *);
 #define nfp_meta_next(meta)	list_next_entry(meta, l)
 #define nfp_meta_prev(meta)	list_prev_entry(meta, l)
 
+/**
+ * struct nfp_bpf_reg_state - register state for calls
+ * @reg: BPF register state from latest path
+ * @var_off: for stack arg - changes stack offset on different paths
+ */
+struct nfp_bpf_reg_state {
+	struct bpf_reg_state reg;
+	bool var_off;
+};
+
 #define FLAG_INSN_IS_JUMP_DST	BIT(0)
 
 /**
@@ -199,11 +229,16 @@ typedef int (*instr_cb_t)(struct nfp_prog *, struct nfp_insn_meta *);
 * @ldst_gather_len: memcpy length gathered from load/store sequence
 * @paired_st: the paired store insn at the head of the sequence
 * @ptr_not_const: pointer is not always constant
+ * @pkt_cache: packet data cache information
+ * @pkt_cache.range_start: start offset for associated packet data cache
+ * @pkt_cache.range_end: end offset for associated packet data cache
+ * @pkt_cache.do_init: this read needs to initialize packet data cache
+ * @xadd_over_16bit: 16bit immediate is not guaranteed
+ * @xadd_maybe_16bit: 16bit immediate is possible
 * @jmp_dst: destination info for jump instructions
 * @func_id: function id for call instructions
 * @arg1: arg1 for call instructions
 * @arg2: arg2 for call instructions
- * @arg2_var_off: arg2 changes stack offset on different paths
 * @off: index of first generated machine instruction (in nfp_prog.prog)
 * @n: eBPF instruction number
 * @flags: eBPF instruction extra optimization flags
@@ -214,18 +249,27 @@ typedef int (*instr_cb_t)(struct nfp_prog *, struct nfp_insn_meta *);
 struct nfp_insn_meta {
 	struct bpf_insn insn;
 	union {
+		/* pointer ops (ld/st/xadd) */
 		struct {
 			struct bpf_reg_state ptr;
 			struct bpf_insn *paired_st;
 			s16 ldst_gather_len;
 			bool ptr_not_const;
+			struct {
+				s16 range_start;
+				s16 range_end;
+				bool do_init;
+			} pkt_cache;
+			bool xadd_over_16bit;
+			bool xadd_maybe_16bit;
 		};
+		/* jump */
 		struct nfp_insn_meta *jmp_dst;
+		/* function calls */
 		struct {
 			u32 func_id;
 			struct bpf_reg_state arg1;
-			struct bpf_reg_state arg2;
-			bool arg2_var_off;
+			struct nfp_bpf_reg_state arg2;
 		};
 	};
 	unsigned int off;
@@ -269,6 +313,41 @@ static inline bool is_mbpf_store(const struct nfp_insn_meta *meta)
 	return (meta->insn.code & ~BPF_SIZE_MASK) == (BPF_STX | BPF_MEM);
 }
 
+static inline bool is_mbpf_load_pkt(const struct nfp_insn_meta *meta)
+{
+	return is_mbpf_load(meta) && meta->ptr.type == PTR_TO_PACKET;
+}
+
+static inline bool is_mbpf_store_pkt(const struct nfp_insn_meta *meta)
+{
+	return is_mbpf_store(meta) && meta->ptr.type == PTR_TO_PACKET;
+}
+
+static inline bool is_mbpf_classic_load(const struct nfp_insn_meta *meta)
+{
+	u8 code = meta->insn.code;
+
+	return BPF_CLASS(code) == BPF_LD &&
+	       (BPF_MODE(code) == BPF_ABS || BPF_MODE(code) == BPF_IND);
+}
+
+static inline bool is_mbpf_classic_store(const struct nfp_insn_meta *meta)
+{
+	u8 code = meta->insn.code;
+
+	return BPF_CLASS(code) == BPF_ST && BPF_MODE(code) == BPF_MEM;
+}
+
+static inline bool is_mbpf_classic_store_pkt(const struct nfp_insn_meta *meta)
+{
+	return is_mbpf_classic_store(meta) && meta->ptr.type == PTR_TO_PACKET;
+}
+
+static inline bool is_mbpf_xadd(const struct nfp_insn_meta *meta)
+{
+	return (meta->insn.code & ~BPF_SIZE_MASK) == (BPF_STX | BPF_XADD);
+}
+
 /**
 * struct nfp_prog - nfp BPF program
 * @bpf: backpointer to the bpf app priv structure
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/offload.c b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
index 0a7732385469..42d98792bd25 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/offload.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/offload.c
@@ -164,6 +164,41 @@ static int nfp_bpf_destroy(struct nfp_net *nn, struct bpf_prog *prog)
 	return 0;
 }
 
+/* Atomic engine requires values to be in big endian, we need to byte swap
+ * the value words used with xadd.
+ */
+static void nfp_map_bpf_byte_swap(struct nfp_bpf_map *nfp_map, void *value)
+{
+	u32 *word = value;
+	unsigned int i;
+
+	for (i = 0; i < DIV_ROUND_UP(nfp_map->offmap->map.value_size, 4); i++)
+		if (nfp_map->use_map[i] == NFP_MAP_USE_ATOMIC_CNT)
+			word[i] = (__force u32)cpu_to_be32(word[i]);
+}
+
+static int
+nfp_bpf_map_lookup_entry(struct bpf_offloaded_map *offmap,
+			 void *key, void *value)
+{
+	int err;
+
+	err = nfp_bpf_ctrl_lookup_entry(offmap, key, value);
+	if (err)
+		return err;
+
+	nfp_map_bpf_byte_swap(offmap->dev_priv, value);
+	return 0;
+}
+
+static int
+nfp_bpf_map_update_entry(struct bpf_offloaded_map *offmap,
+			 void *key, void *value, u64 flags)
+{
+	nfp_map_bpf_byte_swap(offmap->dev_priv, value);
+	return nfp_bpf_ctrl_update_entry(offmap, key, value, flags);
+}
+
 static int
 nfp_bpf_map_get_next_key(struct bpf_offloaded_map *offmap,
			 void *key, void *next_key)
@@ -183,8 +218,8 @@ nfp_bpf_map_delete_elem(struct bpf_offloaded_map *offmap, void *key)
 
 static const struct bpf_map_dev_ops nfp_bpf_map_ops = {
 	.map_get_next_key	= nfp_bpf_map_get_next_key,
-	.map_lookup_elem	= nfp_bpf_ctrl_lookup_entry,
-	.map_update_elem	= nfp_bpf_ctrl_update_entry,
+	.map_lookup_elem	= nfp_bpf_map_lookup_entry,
+	.map_update_elem	= nfp_bpf_map_update_entry,
 	.map_delete_elem	= nfp_bpf_map_delete_elem,
 };
 
@@ -192,6 +227,7 @@ static int
 nfp_bpf_map_alloc(struct nfp_app_bpf *bpf, struct bpf_offloaded_map *offmap)
 {
 	struct nfp_bpf_map *nfp_map;
+	unsigned int use_map_size;
 	long long int res;
 
 	if (!bpf->maps.types)
@@ -226,7 +262,10 @@ nfp_bpf_map_alloc(struct nfp_app_bpf *bpf, struct bpf_offloaded_map *offmap)
 		return -ENOMEM;
 	}
 
-	nfp_map = kzalloc(sizeof(*nfp_map), GFP_USER);
+	use_map_size = DIV_ROUND_UP(offmap->map.value_size, 4) *
+		       FIELD_SIZEOF(struct nfp_bpf_map, use_map[0]);
+
+	nfp_map = kzalloc(sizeof(*nfp_map) + use_map_size, GFP_USER);
 	if (!nfp_map)
 		return -ENOMEM;
 
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
index 479f602887e9..06ad53ce4ad9 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/verifier.c
@@ -97,7 +97,7 @@ nfp_record_adjust_head(struct nfp_app_bpf *bpf, struct nfp_prog *nfp_prog,
 		if (nfp_prog->adjust_head_location != meta->n)
 			goto exit_set_location;
 
-		if (meta->arg2.var_off.value != imm)
+		if (meta->arg2.reg.var_off.value != imm)
 			goto exit_set_location;
 	}
 
@@ -107,14 +107,69 @@ exit_set_location:
 }
 
 static int
+nfp_bpf_stack_arg_ok(const char *fname, struct bpf_verifier_env *env,
+		     const struct bpf_reg_state *reg,
+		     struct nfp_bpf_reg_state *old_arg)
+{
+	s64 off, old_off;
+
+	if (reg->type != PTR_TO_STACK) {
+		pr_vlog(env, "%s: unsupported ptr type %d\n",
+			fname, reg->type);
+		return false;
+	}
+	if (!tnum_is_const(reg->var_off)) {
+		pr_vlog(env, "%s: variable pointer\n", fname);
+		return false;
+	}
+
+	off = reg->var_off.value + reg->off;
+	if (-off % 4) {
+		pr_vlog(env, "%s: unaligned stack pointer %lld\n", fname, -off);
+		return false;
+	}
+
+	/* Rest of the checks is only if we re-parse the same insn */
+	if (!old_arg)
+		return true;
+
+	old_off = old_arg->reg.var_off.value + old_arg->reg.off;
+	old_arg->var_off |= off != old_off;
+
+	return true;
+}
+
+static bool
+nfp_bpf_map_call_ok(const char *fname, struct bpf_verifier_env *env,
+		    struct nfp_insn_meta *meta,
+		    u32 helper_tgt, const struct bpf_reg_state *reg1)
+{
+	if (!helper_tgt) {
+		pr_vlog(env, "%s: not supported by FW\n", fname);
+		return false;
+	}
+
+	/* Rest of the checks is only if we re-parse the same insn */
+	if (!meta->func_id)
+		return true;
+
+	if (meta->arg1.map_ptr != reg1->map_ptr) {
+		pr_vlog(env, "%s: called for different map\n", fname);
+		return false;
+	}
+
+	return true;
+}
+
+static int
 nfp_bpf_check_call(struct nfp_prog *nfp_prog, struct bpf_verifier_env *env,
		   struct nfp_insn_meta *meta)
 {
 	const struct bpf_reg_state *reg1 = cur_regs(env) + BPF_REG_1;
 	const struct bpf_reg_state *reg2 = cur_regs(env) + BPF_REG_2;
+	const struct bpf_reg_state *reg3 = cur_regs(env) + BPF_REG_3;
 	struct nfp_app_bpf *bpf = nfp_prog->bpf;
 	u32 func_id = meta->insn.imm;
-	s64 off, old_off;
 
 	switch (func_id) {
 	case BPF_FUNC_xdp_adjust_head:
@@ -131,41 +186,36 @@ nfp_bpf_check_call(struct nfp_prog *nfp_prog, struct bpf_verifier_env *env,
 		break;
 
 	case BPF_FUNC_map_lookup_elem:
-		if (!bpf->helpers.map_lookup) {
-			pr_vlog(env, "map_lookup: not supported by FW\n");
+		if (!nfp_bpf_map_call_ok("map_lookup", env, meta,
+					 bpf->helpers.map_lookup, reg1) ||
+		    !nfp_bpf_stack_arg_ok("map_lookup", env, reg2,
+					  meta->func_id ? &meta->arg2 : NULL))
 			return -EOPNOTSUPP;
-		}
-		if (reg2->type != PTR_TO_STACK) {
-			pr_vlog(env,
-				"map_lookup: unsupported key ptr type %d\n",
-				reg2->type);
-			return -EOPNOTSUPP;
-		}
-		if (!tnum_is_const(reg2->var_off)) {
-			pr_vlog(env, "map_lookup: variable key pointer\n");
+		break;
+
+	case BPF_FUNC_map_update_elem:
+		if (!nfp_bpf_map_call_ok("map_update", env, meta,
+					 bpf->helpers.map_update, reg1) ||
+		    !nfp_bpf_stack_arg_ok("map_update", env, reg2,
+					  meta->func_id ? &meta->arg2 : NULL) ||
+		    !nfp_bpf_stack_arg_ok("map_update", env, reg3, NULL))
 			return -EOPNOTSUPP;
-		}
+		break;
 
-		off = reg2->var_off.value + reg2->off;
-		if (-off % 4) {
-			pr_vlog(env,
-				"map_lookup: unaligned stack pointer %lld\n",
-				-off);
+	case BPF_FUNC_map_delete_elem:
+		if (!nfp_bpf_map_call_ok("map_delete", env, meta,
+					 bpf->helpers.map_delete, reg1) ||
+		    !nfp_bpf_stack_arg_ok("map_delete", env, reg2,
+					  meta->func_id ? &meta->arg2 : NULL))
 			return -EOPNOTSUPP;
-		}
+		break;
 
-		/* Rest of the checks is only if we re-parse the same insn */
-		if (!meta->func_id)
+	case BPF_FUNC_get_prandom_u32:
+		if (bpf->pseudo_random)
 			break;
+		pr_vlog(env, "bpf_get_prandom_u32(): FW doesn't support random number generation\n");
+		return -EOPNOTSUPP;
 
-		old_off = meta->arg2.var_off.value + meta->arg2.off;
-		meta->arg2_var_off |= off != old_off;
-
-		if (meta->arg1.map_ptr != reg1->map_ptr) {
-			pr_vlog(env, "map_lookup: called for different map\n");
-			return -EOPNOTSUPP;
-		}
-		break;
 	default:
 		pr_vlog(env, "unsupported function id: %d\n", func_id);
 		return -EOPNOTSUPP;
@@ -173,7 +223,7 @@ nfp_bpf_check_call(struct nfp_prog *nfp_prog, struct bpf_verifier_env *env,
 
 	meta->func_id = func_id;
 	meta->arg1 = *reg1;
-	meta->arg2 = *reg2;
+	meta->arg2.reg = *reg2;
 
 	return 0;
 }
@@ -242,6 +292,72 @@ nfp_bpf_check_stack_access(struct nfp_prog *nfp_prog,
 	return -EINVAL;
 }
 
+static const char *nfp_bpf_map_use_name(enum nfp_bpf_map_use use)
+{
+	static const char * const names[] = {
+		[NFP_MAP_UNUSED]	= "unused",
+		[NFP_MAP_USE_READ]	= "read",
+		[NFP_MAP_USE_WRITE]	= "write",
+		[NFP_MAP_USE_ATOMIC_CNT] = "atomic",
+	};
+
+	if (use >= ARRAY_SIZE(names) || !names[use])
+		return "unknown";
+	return names[use];
+}
+
+static int
+nfp_bpf_map_mark_used_one(struct bpf_verifier_env *env,
+			  struct nfp_bpf_map *nfp_map,
+			  unsigned int off, enum nfp_bpf_map_use use)
+{
+	if (nfp_map->use_map[off / 4] != NFP_MAP_UNUSED &&
+	    nfp_map->use_map[off / 4] != use) {
+		pr_vlog(env, "map value use type conflict %s vs %s off: %u\n",
+			nfp_bpf_map_use_name(nfp_map->use_map[off / 4]),
+			nfp_bpf_map_use_name(use), off);
+		return -EOPNOTSUPP;
+	}
+
+	nfp_map->use_map[off / 4] = use;
+
+	return 0;
+}
+
+static int
+nfp_bpf_map_mark_used(struct bpf_verifier_env *env, struct nfp_insn_meta *meta,
+		      const struct bpf_reg_state *reg,
+		      enum nfp_bpf_map_use use)
+{
+	struct bpf_offloaded_map *offmap;
+	struct nfp_bpf_map *nfp_map;
+	unsigned int size, off;
+	int i, err;
+
+	if (!tnum_is_const(reg->var_off)) {
+		pr_vlog(env, "map value offset is variable\n");
+		return -EOPNOTSUPP;
+	}
+
+	off = reg->var_off.value + meta->insn.off + reg->off;
+	size = BPF_LDST_BYTES(&meta->insn);
+	offmap = map_to_offmap(reg->map_ptr);
+	nfp_map = offmap->dev_priv;
+
+	if (off + size > offmap->map.value_size) {
+		pr_vlog(env, "map value access out-of-bounds\n");
+		return -EINVAL;
+	}
+
+	for (i = 0; i < size; i += 4 - (off + i) % 4) {
+		err = nfp_bpf_map_mark_used_one(env, nfp_map, off + i, use);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
 static int
 nfp_bpf_check_ptr(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
		  struct bpf_verifier_env *env, u8 reg_no)
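Because use_map[] records one use kind per 4-byte value word, mixing access types on the same word is now refused at verification time. An editorial sketch of a program that would trip the "use type conflict" check (restricted C for eBPF; map as in the earlier sketch):

SEC("xdp")
int mixed_use(struct xdp_md *ctx)
{
	__u32 key = 0;
	__u64 *cnt = bpf_map_lookup_elem(&stats, &key);

	if (!cnt)
		return XDP_PASS;
	__sync_fetch_and_add(cnt, 1);	/* words 0-1 -> NFP_MAP_USE_ATOMIC_CNT */
	if (*cnt > 1000)		/* same words -> NFP_MAP_USE_READ: conflict */
		return XDP_DROP;
	return XDP_PASS;
}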
@@ -264,10 +380,22 @@ nfp_bpf_check_ptr(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
 	}
 
 	if (reg->type == PTR_TO_MAP_VALUE) {
+		if (is_mbpf_load(meta)) {
+			err = nfp_bpf_map_mark_used(env, meta, reg,
+						    NFP_MAP_USE_READ);
+			if (err)
+				return err;
+		}
 		if (is_mbpf_store(meta)) {
 			pr_vlog(env, "map writes not supported\n");
 			return -EOPNOTSUPP;
 		}
+		if (is_mbpf_xadd(meta)) {
+			err = nfp_bpf_map_mark_used(env, meta, reg,
+						    NFP_MAP_USE_ATOMIC_CNT);
+			if (err)
+				return err;
+		}
 	}
 
 	if (meta->ptr.type != NOT_INIT && meta->ptr.type != reg->type) {
@@ -282,6 +410,31 @@ nfp_bpf_check_ptr(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
 }
 
 static int
+nfp_bpf_check_xadd(struct nfp_prog *nfp_prog, struct nfp_insn_meta *meta,
+		   struct bpf_verifier_env *env)
+{
+	const struct bpf_reg_state *sreg = cur_regs(env) + meta->insn.src_reg;
+	const struct bpf_reg_state *dreg = cur_regs(env) + meta->insn.dst_reg;
+
+	if (dreg->type != PTR_TO_MAP_VALUE) {
+		pr_vlog(env, "atomic add not to a map value pointer: %d\n",
+			dreg->type);
+		return -EOPNOTSUPP;
+	}
+	if (sreg->type != SCALAR_VALUE) {
+		pr_vlog(env, "atomic add not of a scalar: %d\n", sreg->type);
+		return -EOPNOTSUPP;
+	}
+
+	meta->xadd_over_16bit |=
+		sreg->var_off.value > 0xffff || sreg->var_off.mask > 0xffff;
+	meta->xadd_maybe_16bit |=
+		(sreg->var_off.value & ~sreg->var_off.mask) <= 0xffff;
+
+	return nfp_bpf_check_ptr(nfp_prog, meta, env, meta->insn.dst_reg);
+}
+
+static int
 nfp_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx)
 {
 	struct nfp_prog *nfp_prog = env->prog->aux->offload->dev_priv;
@@ -313,6 +466,8 @@ nfp_verify_insn(struct bpf_verifier_env *env, int insn_idx, int prev_insn_idx)
 	if (is_mbpf_store(meta))
 		return nfp_bpf_check_ptr(nfp_prog, meta, env,
					 meta->insn.dst_reg);
+	if (is_mbpf_xadd(meta))
+		return nfp_bpf_check_xadd(nfp_prog, meta, env);
 
 	return 0;
 }
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_asm.c b/drivers/net/ethernet/netronome/nfp/nfp_asm.c
index 1e597600c693..cc6ace2be8a9 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_asm.c
+++ b/drivers/net/ethernet/netronome/nfp/nfp_asm.c
@@ -48,6 +48,8 @@ const struct cmd_tgt_act cmd_tgt_act[__CMD_TGT_MAP_SIZE] = {
 	[CMD_TGT_READ32_SWAP] =		{ 0x02, 0x5c },
 	[CMD_TGT_READ_LE] =		{ 0x01, 0x40 },
 	[CMD_TGT_READ_SWAP_LE] =	{ 0x03, 0x40 },
+	[CMD_TGT_ADD] =			{ 0x00, 0x47 },
+	[CMD_TGT_ADD_IMM] =		{ 0x02, 0x47 },
 };
 
 static bool unreg_is_imm(u16 reg)
diff --git a/drivers/net/ethernet/netronome/nfp/nfp_asm.h b/drivers/net/ethernet/netronome/nfp/nfp_asm.h
index 5f9291db98e0..5f2b2f24f4fa 100644
--- a/drivers/net/ethernet/netronome/nfp/nfp_asm.h
+++ b/drivers/net/ethernet/netronome/nfp/nfp_asm.h
@@ -39,6 +39,7 @@
 #include <linux/types.h>
 
 #define REG_NONE	0
+#define REG_WIDTH	4
 
 #define RE_REG_NO_DST	0x020
 #define RE_REG_IMM	0x020
@@ -237,6 +238,8 @@ enum cmd_tgt_map {
 	CMD_TGT_READ32_SWAP,
 	CMD_TGT_READ_LE,
 	CMD_TGT_READ_SWAP_LE,
+	CMD_TGT_ADD,
+	CMD_TGT_ADD_IMM,
 	__CMD_TGT_MAP_SIZE,
 };
 
@@ -250,9 +253,12 @@ enum cmd_mode {
 
 enum cmd_ctx_swap {
 	CMD_CTX_SWAP = 0,
+	CMD_CTX_SWAP_DEFER1 = 1,
+	CMD_CTX_SWAP_DEFER2 = 2,
 	CMD_CTX_NO_SWAP = 3,
 };
 
+#define CMD_OVE_DATA	GENMASK(5, 3)
 #define CMD_OVE_LEN	BIT(7)
 #define CMD_OV_LEN	GENMASK(12, 8)
 
@@ -278,6 +284,7 @@ enum lcsr_wr_src {
 #define NFP_CSR_ACT_LM_ADDR1	0x6c
 #define NFP_CSR_ACT_LM_ADDR2	0x94
 #define NFP_CSR_ACT_LM_ADDR3	0x9c
+#define NFP_CSR_PSEUDO_RND_NUM	0x148
 
 /* Software register representation, independent of operand type */
 #define NN_REG_TYPE	GENMASK(31, 24)
diff --git a/drivers/net/wireless/intel/iwlwifi/dvm/main.c b/drivers/net/wireless/intel/iwlwifi/dvm/main.c
index d11d72615de2..e68254e12764 100644
--- a/drivers/net/wireless/intel/iwlwifi/dvm/main.c
+++ b/drivers/net/wireless/intel/iwlwifi/dvm/main.c
@@ -1651,12 +1651,7 @@ static void iwl_dump_nic_error_log(struct iwl_priv *priv)
			priv->status, table.valid);
 	}
 
-	trace_iwlwifi_dev_ucode_error(trans->dev, table.error_id, table.tsf_low,
-				      table.data1, table.data2, table.line,
-				      table.blink2, table.ilink1, table.ilink2,
-				      table.bcon_time, table.gp1, table.gp2,
-				      table.gp3, table.ucode_ver, table.hw_ver,
-				      0, table.brd_ver);
+	trace_iwlwifi_dev_ucode_error(trans->dev, &table, 0, table.brd_ver);
 	IWL_ERR(priv, "0x%08X | %-28s\n", table.error_id,
 		desc_lookup(table.error_id));
 	IWL_ERR(priv, "0x%08X | uPc\n", table.pc);
diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-devtrace-iwlwifi.h b/drivers/net/wireless/intel/iwlwifi/iwl-devtrace-iwlwifi.h
index 9518a82f44c2..27e3e4e96aa2 100644
--- a/drivers/net/wireless/intel/iwlwifi/iwl-devtrace-iwlwifi.h
+++ b/drivers/net/wireless/intel/iwlwifi/iwl-devtrace-iwlwifi.h
@@ -126,14 +126,11 @@ TRACE_EVENT(iwlwifi_dev_tx,
		  __entry->framelen, __entry->skbaddr)
 );
 
+struct iwl_error_event_table;
 TRACE_EVENT(iwlwifi_dev_ucode_error,
-	TP_PROTO(const struct device *dev, u32 desc, u32 tsf_low,
-		 u32 data1, u32 data2, u32 line, u32 blink2, u32 ilink1,
-		 u32 ilink2, u32 bcon_time, u32 gp1, u32 gp2, u32 rev_type,
-		 u32 major, u32 minor, u32 hw_ver, u32 brd_ver),
-	TP_ARGS(dev, desc, tsf_low, data1, data2, line,
-		blink2, ilink1, ilink2, bcon_time, gp1, gp2,
-		rev_type, major, minor, hw_ver, brd_ver),
+	TP_PROTO(const struct device *dev, const struct iwl_error_event_table *table,
+		 u32 hw_ver, u32 brd_ver),
+	TP_ARGS(dev, table, hw_ver, brd_ver),
 	TP_STRUCT__entry(
 		DEV_ENTRY
 		__field(u32, desc)
@@ -155,20 +152,20 @@ TRACE_EVENT(iwlwifi_dev_ucode_error,
 	),
 	TP_fast_assign(
 		DEV_ASSIGN;
-		__entry->desc = desc;
-		__entry->tsf_low = tsf_low;
-		__entry->data1 = data1;
-		__entry->data2 = data2;
-		__entry->line = line;
-		__entry->blink2 = blink2;
-		__entry->ilink1 = ilink1;
-		__entry->ilink2 = ilink2;
-		__entry->bcon_time = bcon_time;
-		__entry->gp1 = gp1;
-		__entry->gp2 = gp2;
-		__entry->rev_type = rev_type;
-		__entry->major = major;
-		__entry->minor = minor;
+		__entry->desc = table->error_id;
+		__entry->tsf_low = table->tsf_low;
+		__entry->data1 = table->data1;
+		__entry->data2 = table->data2;
+		__entry->line = table->line;
+		__entry->blink2 = table->blink2;
+		__entry->ilink1 = table->ilink1;
+		__entry->ilink2 = table->ilink2;
+		__entry->bcon_time = table->bcon_time;
+		__entry->gp1 = table->gp1;
+		__entry->gp2 = table->gp2;
+		__entry->rev_type = table->gp3;
+		__entry->major = table->ucode_ver;
+		__entry->minor = table->hw_ver;
 		__entry->hw_ver = hw_ver;
 		__entry->brd_ver = brd_ver;
 	),
diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-devtrace.c b/drivers/net/wireless/intel/iwlwifi/iwl-devtrace.c
index 50510fb6ab8c..6aa719865a58 100644
--- a/drivers/net/wireless/intel/iwlwifi/iwl-devtrace.c
+++ b/drivers/net/wireless/intel/iwlwifi/iwl-devtrace.c
@@ -30,6 +30,7 @@
 #ifndef __CHECKER__
 #include "iwl-trans.h"
 
+#include "dvm/commands.h"
 #define CREATE_TRACE_POINTS
 #include "iwl-devtrace.h"
diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/utils.c b/drivers/net/wireless/intel/iwlwifi/mvm/utils.c
index bebcfb44c8c2..d99d9ea78e4c 100644
--- a/drivers/net/wireless/intel/iwlwifi/mvm/utils.c
+++ b/drivers/net/wireless/intel/iwlwifi/mvm/utils.c
@@ -549,12 +549,7 @@ static void iwl_mvm_dump_lmac_error_log(struct iwl_mvm *mvm, u32 base)
 
 	IWL_ERR(mvm, "Loaded firmware version: %s\n", mvm->fw->fw_version);
 
-	trace_iwlwifi_dev_ucode_error(trans->dev, table.error_id, table.tsf_low,
-				      table.data1, table.data2, table.data3,
-				      table.blink2, table.ilink1,
-				      table.ilink2, table.bcon_time, table.gp1,
-				      table.gp2, table.fw_rev_type, table.major,
-				      table.minor, table.hw_ver, table.brd_ver);
+	trace_iwlwifi_dev_ucode_error(trans->dev, &table, table.hw_ver, table.brd_ver);
 
 	IWL_ERR(mvm, "0x%08X | %-28s\n", table.error_id,
 		desc_lookup(table.error_id));
 	IWL_ERR(mvm, "0x%08X | trm_hw_status0\n", table.trm_hw_status0);
diff --git a/drivers/net/wireless/mediatek/mt7601u/trace.h b/drivers/net/wireless/mediatek/mt7601u/trace.h
index 289897300ef0..82c8898b9076 100644
--- a/drivers/net/wireless/mediatek/mt7601u/trace.h
+++ b/drivers/net/wireless/mediatek/mt7601u/trace.h
@@ -34,7 +34,7 @@
 #define REG_PR_FMT	"%04x=%08x"
 #define REG_PR_ARG	__entry->reg, __entry->val
 
-DECLARE_EVENT_CLASS(dev_reg_evt,
+DECLARE_EVENT_CLASS(dev_reg_evtu,
 	TP_PROTO(struct mt7601u_dev *dev, u32 reg, u32 val),
 	TP_ARGS(dev, reg, val),
 	TP_STRUCT__entry(
@@ -51,12 +51,12 @@ DECLARE_EVENT_CLASS(dev_reg_evt,
 	)
 );
 
-DEFINE_EVENT(dev_reg_evt, reg_read,
+DEFINE_EVENT(dev_reg_evtu, reg_read,
 	TP_PROTO(struct mt7601u_dev *dev, u32 reg, u32 val),
 	TP_ARGS(dev, reg, val)
 );
 
-DEFINE_EVENT(dev_reg_evt, reg_write,
+DEFINE_EVENT(dev_reg_evtu, reg_write,
 	TP_PROTO(struct mt7601u_dev *dev, u32 reg, u32 val),
 	TP_ARGS(dev, reg, val)
 );
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 1ab0e520d6fc..8add3493a202 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -178,6 +178,15 @@
 #define TRACE_SYSCALLS()
 #endif
 
+#ifdef CONFIG_BPF_EVENTS
+#define BPF_RAW_TP() STRUCT_ALIGN();					\
+			 VMLINUX_SYMBOL(__start__bpf_raw_tp) = .;	\
+			 KEEP(*(__bpf_raw_tp_map))			\
+			 VMLINUX_SYMBOL(__stop__bpf_raw_tp) = .;
+#else
+#define BPF_RAW_TP()
+#endif
+
 #ifdef CONFIG_SERIAL_EARLYCON
 #define EARLYCON_TABLE() STRUCT_ALIGN();			\
			 VMLINUX_SYMBOL(__earlycon_table) = .;	\
@@ -249,6 +258,7 @@
 	LIKELY_PROFILE()						\
 	BRANCH_PROFILE()						\
 	TRACE_PRINTKS()							\
+	BPF_RAW_TP()							\
 	TRACEPOINT_STR()
 
 /*
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 8a4566691c8f..30d15e64b993 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -6,6 +6,7 @@
 #include <uapi/linux/bpf.h>
 
 struct sock;
+struct sockaddr;
 struct cgroup;
 struct sk_buff;
 struct bpf_sock_ops_kern;
@@ -63,6 +64,10 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
 int __cgroup_bpf_run_filter_sk(struct sock *sk,
			       enum bpf_attach_type type);
 
+int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
+				      struct sockaddr *uaddr,
+				      enum bpf_attach_type type);
+
 int __cgroup_bpf_run_filter_sock_ops(struct sock *sk,
				     struct bpf_sock_ops_kern *sock_ops,
				     enum bpf_attach_type type);
@@ -93,16 +98,64 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
 	__ret;							\
 })
 
-#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk)			\
+#define BPF_CGROUP_RUN_SK_PROG(sk, type)			\
 ({								\
 	int __ret = 0;						\
 	if (cgroup_bpf_enabled) {				\
-		__ret = __cgroup_bpf_run_filter_sk(sk,		\
-				 BPF_CGROUP_INET_SOCK_CREATE);	\
+		__ret = __cgroup_bpf_run_filter_sk(sk, type);	\
 	}							\
 	__ret;							\
 })
 
+#define BPF_CGROUP_RUN_PROG_INET_SOCK(sk)			\
+	BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET_SOCK_CREATE)
+
+#define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk)			\
+	BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET4_POST_BIND)
+
+#define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk)			\
+	BPF_CGROUP_RUN_SK_PROG(sk, BPF_CGROUP_INET6_POST_BIND)
+
+#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, type)			\
+({								\
+	int __ret = 0;						\
+	if (cgroup_bpf_enabled)					\
+		__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type); \
+	__ret;							\
+})
+
+#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, type)		\
+({								\
+	int __ret = 0;						\
+	if (cgroup_bpf_enabled)	{				\
+		lock_sock(sk);					\
+		__ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, type); \
+		release_sock(sk);				\
+	}							\
+	__ret;							\
+})
+
+#define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr)		\
+	BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET4_BIND)
+
+#define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr)		\
+	BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_BIND)
+
+#define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (cgroup_bpf_enabled && \
					    sk->sk_prot->pre_connect)
+
+#define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr)		\
+	BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET4_CONNECT)
+
+#define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr)		\
+	BPF_CGROUP_RUN_SA_PROG(sk, uaddr, BPF_CGROUP_INET6_CONNECT)
+
+#define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr)	\
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET4_CONNECT)
+
+#define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr)	\
+	BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, BPF_CGROUP_INET6_CONNECT)
+
 #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops)			\
 ({								\
 	int __ret = 0;						\
@@ -132,9 +185,18 @@ struct cgroup_bpf {};
 static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
 static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; }
 
+#define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0)
 #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr) ({ 0; })
+#define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; })
 #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(type,major,minor,access) ({ 0; })
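A sketch of how the connect-time hooks are meant to be consumed by a protocol's connect path (editorial; the function and call placement below are illustrative, not part of this hunk):

/* Hypothetical AF_INET connect path: a non-zero return from the
 * attached BPF_CGROUP_INET4_CONNECT program aborts the connect
 * before any packet is emitted. */
static int example_pre_connect(struct sock *sk, struct sockaddr *uaddr)
{
	int err;

	err = BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr);
	if (err)
		return err;

	/* ... continue with the regular connect ... */
	return 0;
}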
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 819229c80eca..95a7abd0ee92 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -208,12 +208,15 @@ struct bpf_prog_ops {
 
 struct bpf_verifier_ops {
 	/* return eBPF function prototype for verification */
-	const struct bpf_func_proto *(*get_func_proto)(enum bpf_func_id func_id);
+	const struct bpf_func_proto *
+	(*get_func_proto)(enum bpf_func_id func_id,
+			  const struct bpf_prog *prog);
 
 	/* return true if 'size' wide access at offset 'off' within bpf_context
	 * with 'type' (read or write) is allowed
	 */
 	bool (*is_valid_access)(int off, int size, enum bpf_access_type type,
+				const struct bpf_prog *prog,
				struct bpf_insn_access_aux *info);
 	int (*gen_prologue)(struct bpf_insn *insn, bool direct_write,
			    const struct bpf_prog *prog);
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 5e2e8a49fb21..2b28fcf6f6ae 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -8,6 +8,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_ACT, tc_cls_act)
 BPF_PROG_TYPE(BPF_PROG_TYPE_XDP, xdp)
 BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SKB, cg_skb)
 BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK, cg_sock)
+BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, cg_sock_addr)
 BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_inout)
 BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_inout)
 BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit)
@@ -19,6 +20,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_SK_MSG, sk_msg)
 BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe)
 BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint)
 BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event)
+BPF_PROG_TYPE(BPF_PROG_TYPE_RAW_TRACEPOINT, raw_tracepoint)
 #endif
 #ifdef CONFIG_CGROUP_BPF
 BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_DEVICE, cg_dev)
diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 6b66cd1aa0b9..7e61c395fddf 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -153,7 +153,7 @@ struct bpf_insn_aux_data {
 
 #define BPF_VERIFIER_TMP_LOG_SIZE	1024
 
-struct bpf_verifer_log {
+struct bpf_verifier_log {
 	u32 level;
 	char kbuf[BPF_VERIFIER_TMP_LOG_SIZE];
 	char __user *ubuf;
@@ -161,11 +161,16 @@ struct bpf_verifer_log {
 	u32 len_total;
 };
 
-static inline bool bpf_verifier_log_full(const struct bpf_verifer_log *log)
+static inline bool bpf_verifier_log_full(const struct bpf_verifier_log *log)
 {
 	return log->len_used >= log->len_total - 1;
 }
 
+static inline bool bpf_verifier_log_needed(const struct bpf_verifier_log *log)
+{
+	return log->level && log->ubuf && !bpf_verifier_log_full(log);
+}
+
 #define BPF_MAX_SUBPROGS 256
 
 /* single container for all structs
@@ -185,13 +190,15 @@ struct bpf_verifier_env {
 	bool allow_ptr_leaks;
 	bool seen_direct_write;
 	struct bpf_insn_aux_data *insn_aux_data; /* array of per-insn state */
-	struct bpf_verifer_log log;
+	struct bpf_verifier_log log;
 	u32 subprog_starts[BPF_MAX_SUBPROGS];
 	/* computes the stack depth of each bpf function */
 	u16 subprog_stack_depth[BPF_MAX_SUBPROGS + 1];
 	u32 subprog_cnt;
 };
 
+void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt,
+		       va_list args);
 __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env,
					   const char *fmt, ...);
 
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 109d05ccea9a..fc4e8f91b03d 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -372,7 +372,7 @@ struct xdp_rxq_info;
 
 #define BPF_LDST_BYTES(insn)					\
 	({							\
-		const int __size = bpf_size_to_bytes(BPF_SIZE(insn->code)); \
+		const int __size = bpf_size_to_bytes(BPF_SIZE((insn)->code)); \
		WARN_ON(__size < 0);				\
		__size;						\
 	})
@@ -469,6 +469,7 @@ struct bpf_prog {
				is_func:1,	/* program is a bpf function */
				kprobe_override:1; /* Do we override a kprobe? */
 	enum bpf_prog_type	type;		/* Type of BPF program */
+	enum bpf_attach_type	expected_attach_type; /* For some prog types */
 	u32			len;		/* Number of filter blocks */
 	u32			jited_len;	/* Size of jited insns in bytes */
 	u8			tag[BPF_TAG_SIZE];
@@ -521,6 +522,8 @@ struct sk_msg_buff {
 	__u32 key;
 	__u32 flags;
 	struct bpf_map *map;
+	struct sk_buff *skb;
+	struct list_head list;
 };
 
 /* Compute the linear packet data range [data, data_end) which
@@ -1018,6 +1021,16 @@ static inline int bpf_tell_extensions(void)
 	return SKF_AD_MAX;
 }
 
+struct bpf_sock_addr_kern {
+	struct sock *sk;
+	struct sockaddr *uaddr;
+	/* Temporary "register" to make indirect stores to nested structures
+	 * defined above. We need three registers to make such a store, but
+	 * only two (src and dst) are available at convert_ctx_access time
+	 */
+	u64 tmp_reg;
+};
+
 struct bpf_sock_ops_kern {
 	struct sock *sk;
 	u32 op;
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 3fd291503576..293fa0677fba 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -919,6 +919,13 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { }
 #define swap(a, b) \
 	do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0)
 
+/* This counts to 12. Any more, it will return 13th argument. */
+#define __COUNT_ARGS(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _n, X...) _n
+#define COUNT_ARGS(X...) __COUNT_ARGS(, ##X, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)
+
+#define __CONCAT(a, b) a ## b
+#define CONCATENATE(a, b) __CONCAT(a, b)
+
 /**
 * container_of - cast a member of a structure out to the containing structure
 * @ptr:	the pointer to the member.
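A worked expansion of the new macros (editorial; the bpf_trace_run pairing mirrors how the raw-tracepoint glue added elsewhere in this series selects a runner at preprocessing time):

	COUNT_ARGS()		/* -> 0 */
	COUNT_ARGS(a, b, c)	/* -> 3: the three named args shift "3" into the _n slot */
	CONCATENATE(bpf_trace_run, COUNT_ARGS(a, b, c))(prog, a, b, c)
				/* -> bpf_trace_run3(prog, a, b, c) */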
__COUNT_ARGS(, ##X, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) + +#define __CONCAT(a, b) a ## b +#define CONCATENATE(a, b) __CONCAT(a, b) + /** * container_of - cast a member of a structure out to the containing structure * @ptr: the pointer to the member. diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index 22b2131bcdcd..aa5d4eb725f5 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -248,6 +248,24 @@ static inline void *sg_virt(struct scatterlist *sg) return page_address(sg_page(sg)) + sg->offset; } +/** + * sg_init_marker - Initialize markers in sg table + * @sgl: The SG table + * @nents: Number of entries in table + * + * Unlike sg_init_table(), this only sets the end marker (and the per-entry + * SG_MAGIC under CONFIG_DEBUG_SG) without zeroing the entries. + */ +static inline void sg_init_marker(struct scatterlist *sgl, + unsigned int nents) +{ +#ifdef CONFIG_DEBUG_SG + unsigned int i; + + for (i = 0; i < nents; i++) + sgl[i].sg_magic = SG_MAGIC; +#endif + sg_mark_end(&sgl[nents - 1]); +} + int sg_nents(struct scatterlist *sg); int sg_nents_for_len(struct scatterlist *sg, u64 len); struct scatterlist *sg_next(struct scatterlist *); diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 8a1442c4e513..b0357cd198b0 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -468,6 +468,9 @@ unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx); int perf_event_attach_bpf_prog(struct perf_event *event, struct bpf_prog *prog); void perf_event_detach_bpf_prog(struct perf_event *event); int perf_event_query_prog_array(struct perf_event *event, void __user *info); +int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog); +int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog); +struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name); #else static inline unsigned int trace_call_bpf(struct trace_event_call *call, void *ctx) { @@ -487,6 +490,18 @@ perf_event_query_prog_array(struct perf_event *event, void __user *info) { return -EOPNOTSUPP; } +static inline int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *p) +{ + return -EOPNOTSUPP; +} +static inline int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *p) +{ + return -EOPNOTSUPP; +} +static inline struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name) +{ + return NULL; +} #endif enum { @@ -546,6 +561,33 @@ extern void ftrace_profile_free_filter(struct perf_event *event); void perf_trace_buf_update(void *record, u16 type); void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp); +void bpf_trace_run1(struct bpf_prog *prog, u64 arg1); +void bpf_trace_run2(struct bpf_prog *prog, u64 arg1, u64 arg2); +void bpf_trace_run3(struct bpf_prog *prog, u64 arg1, u64 arg2, + u64 arg3); +void bpf_trace_run4(struct bpf_prog *prog, u64 arg1, u64 arg2, + u64 arg3, u64 arg4); +void bpf_trace_run5(struct bpf_prog *prog, u64 arg1, u64 arg2, + u64 arg3, u64 arg4, u64 arg5); +void bpf_trace_run6(struct bpf_prog *prog, u64 arg1, u64 arg2, + u64 arg3, u64 arg4, u64 arg5, u64 arg6); +void bpf_trace_run7(struct bpf_prog *prog, u64 arg1, u64 arg2, + u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7); +void bpf_trace_run8(struct bpf_prog *prog, u64 arg1, u64 arg2, + u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7, + u64 arg8); +void bpf_trace_run9(struct bpf_prog *prog, u64 arg1, u64 arg2, + u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7, + u64 arg8, u64 arg9); +void bpf_trace_run10(struct bpf_prog *prog, u64 arg1, u64 arg2, + u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7, +
u64 arg8, u64 arg9, u64 arg10); +void bpf_trace_run11(struct bpf_prog *prog, u64 arg1, u64 arg2, + u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7, + u64 arg8, u64 arg9, u64 arg10, u64 arg11); +void bpf_trace_run12(struct bpf_prog *prog, u64 arg1, u64 arg2, + u64 arg3, u64 arg4, u64 arg5, u64 arg6, u64 arg7, + u64 arg8, u64 arg9, u64 arg10, u64 arg11, u64 arg12); void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx, struct trace_event_call *call, u64 count, struct pt_regs *regs, struct hlist_head *head, diff --git a/include/linux/tracepoint-defs.h b/include/linux/tracepoint-defs.h index 64ed7064f1fa..22c5a46e9693 100644 --- a/include/linux/tracepoint-defs.h +++ b/include/linux/tracepoint-defs.h @@ -35,4 +35,10 @@ struct tracepoint { struct tracepoint_func __rcu *funcs; }; +struct bpf_raw_event_map { + struct tracepoint *tp; + void *bpf_func; + u32 num_args; +} __aligned(32); + #endif diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 132e5b95167a..378d601258be 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -231,6 +231,13 @@ struct ipv6_stub { }; extern const struct ipv6_stub *ipv6_stub __read_mostly; +/* A stub used by bpf helpers. Just as ugly as ipv6_stub */ +struct ipv6_bpf_stub { + int (*inet6_bind)(struct sock *sk, struct sockaddr *uaddr, int addr_len, + bool force_bind_address_no_port, bool with_lock); +}; +extern const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly; + /* * identify MLD packets for MLD filter exceptions */ diff --git a/include/net/inet_common.h b/include/net/inet_common.h index 500f81375200..384b90c62c0b 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -32,6 +32,8 @@ int inet_shutdown(struct socket *sock, int how); int inet_listen(struct socket *sock, int backlog); void inet_sock_destruct(struct sock *sk); int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); +int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, + bool force_bind_address_no_port, bool with_lock); int inet_getname(struct socket *sock, struct sockaddr *uaddr, int peer); int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 37455e840347..9b6e7f51b1d4 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -1043,6 +1043,8 @@ void ipv6_local_error(struct sock *sk, int err, struct flowi6 *fl6, u32 info); void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu); int inet6_release(struct socket *sock); +int __inet6_bind(struct sock *sock, struct sockaddr *uaddr, int addr_len, + bool force_bind_address_no_port, bool with_lock); int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); int inet6_getname(struct socket *sock, struct sockaddr *uaddr, int peer); diff --git a/include/net/sock.h b/include/net/sock.h index 709311132d4c..49bd2c1796b0 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1026,6 +1026,9 @@ static inline void sk_prot_clear_nulls(struct sock *sk, int size) struct proto { void (*close)(struct sock *sk, long timeout); + int (*pre_connect)(struct sock *sk, + struct sockaddr *uaddr, + int addr_len); int (*connect)(struct sock *sk, struct sockaddr *uaddr, int addr_len); @@ -1085,6 +1088,7 @@ struct proto { #endif bool (*stream_memory_free)(const struct sock *sk); + bool (*stream_memory_read)(const struct sock *sk); /* Memory pressure */ void (*enter_memory_pressure)(struct sock *sk); void (*leave_memory_pressure)(struct sock *sk);
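The new proto->pre_connect callback is exactly what BPF_CGROUP_PRE_CONNECT_ENABLED() tests for: a protocol that fills it in gives the attached BPF_CGROUP_INET{4,6}_CONNECT programs a chance to run (and possibly rewrite the destination or call bpf_bind()) before the regular connect path. As a rough sketch of the wiring on the UDP side — udp_pre_connect() is declared in the udp.h hunk below; the body shown here is illustrative and may differ in detail from the patch:

int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	/* Replicate the addr_len check done by __ip4_datagram_connect() so
	 * the BPF program run below cannot read past what user space passed.
	 */
	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	return BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr);
}
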
diff --git a/include/net/udp.h b/include/net/udp.h index 850a8e581cce..0676b272f6ac 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -273,6 +273,7 @@ void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst); int udp_rcv(struct sk_buff *skb); int udp_ioctl(struct sock *sk, int cmd, unsigned long arg); int udp_init_sock(struct sock *sk); +int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); int __udp_disconnect(struct sock *sk, int flags); int udp_disconnect(struct sock *sk, int flags); __poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait); diff --git a/include/trace/bpf_probe.h b/include/trace/bpf_probe.h new file mode 100644 index 000000000000..505dae0bed80 --- /dev/null +++ b/include/trace/bpf_probe.h @@ -0,0 +1,92 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#undef TRACE_SYSTEM_VAR + +#ifdef CONFIG_BPF_EVENTS + +#undef __entry +#define __entry entry + +#undef __get_dynamic_array +#define __get_dynamic_array(field) \ + ((void *)__entry + (__entry->__data_loc_##field & 0xffff)) + +#undef __get_dynamic_array_len +#define __get_dynamic_array_len(field) \ + ((__entry->__data_loc_##field >> 16) & 0xffff) + +#undef __get_str +#define __get_str(field) ((char *)__get_dynamic_array(field)) + +#undef __get_bitmask +#define __get_bitmask(field) (char *)__get_dynamic_array(field) + +#undef __perf_count +#define __perf_count(c) (c) + +#undef __perf_task +#define __perf_task(t) (t) + +/* cast any integer, pointer, or small struct to u64 */ +#define UINTTYPE(size) \ + __typeof__(__builtin_choose_expr(size == 1, (u8)1, \ + __builtin_choose_expr(size == 2, (u16)2, \ + __builtin_choose_expr(size == 4, (u32)3, \ + __builtin_choose_expr(size == 8, (u64)4, \ + (void)5))))) +#define __CAST_TO_U64(x) ({ \ + typeof(x) __src = (x); \ + UINTTYPE(sizeof(x)) __dst; \ + memcpy(&__dst, &__src, sizeof(__dst)); \ + (u64)__dst; }) + +#define __CAST1(a,...) __CAST_TO_U64(a) +#define __CAST2(a,...) __CAST_TO_U64(a), __CAST1(__VA_ARGS__) +#define __CAST3(a,...) __CAST_TO_U64(a), __CAST2(__VA_ARGS__) +#define __CAST4(a,...) __CAST_TO_U64(a), __CAST3(__VA_ARGS__) +#define __CAST5(a,...) __CAST_TO_U64(a), __CAST4(__VA_ARGS__) +#define __CAST6(a,...) __CAST_TO_U64(a), __CAST5(__VA_ARGS__) +#define __CAST7(a,...) __CAST_TO_U64(a), __CAST6(__VA_ARGS__) +#define __CAST8(a,...) __CAST_TO_U64(a), __CAST7(__VA_ARGS__) +#define __CAST9(a,...) __CAST_TO_U64(a), __CAST8(__VA_ARGS__) +#define __CAST10(a,...) __CAST_TO_U64(a), __CAST9(__VA_ARGS__) +#define __CAST11(a,...) __CAST_TO_U64(a), __CAST10(__VA_ARGS__) +#define __CAST12(a,...) __CAST_TO_U64(a), __CAST11(__VA_ARGS__) +/* tracepoints with more than 12 arguments will hit a build error */ +#define CAST_TO_U64(...) CONCATENATE(__CAST, COUNT_ARGS(__VA_ARGS__))(__VA_ARGS__) + +#undef DECLARE_EVENT_CLASS +#define DECLARE_EVENT_CLASS(call, proto, args, tstruct, assign, print) \ +static notrace void \ +__bpf_trace_##call(void *__data, proto) \ +{ \ + struct bpf_prog *prog = __data; \ + CONCATENATE(bpf_trace_run, COUNT_ARGS(args))(prog, CAST_TO_U64(args)); \ +} + +/* + * This part is compiled out; it is only here as a build-time check + * to make sure that if the tracepoint handling changes, the + * bpf probe will fail to compile unless it too is updated. + */ +#undef DEFINE_EVENT +#define DEFINE_EVENT(template, call, proto, args) \ +static inline void bpf_test_probe_##call(void) \ +{ \ + check_trace_callback_type_##call(__bpf_trace_##template); \ +} \ +static struct bpf_raw_event_map __used \ + __attribute__((section("__bpf_raw_tp_map"))) \ +__bpf_trace_tp_map_##call = { \ + .tp = &__tracepoint_##call, \ + .bpf_func = (void *)__bpf_trace_##template, \ + .num_args = COUNT_ARGS(args), \ +}; + + +#undef DEFINE_EVENT_PRINT +#define DEFINE_EVENT_PRINT(template, name, proto, args, print) \ + DEFINE_EVENT(template, name, PARAMS(proto), PARAMS(args)) + +#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) +#endif /* CONFIG_BPF_EVENTS */
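To see how the pieces above compose, here is the probe that DECLARE_EVENT_CLASS generates, hand-expanded for a hypothetical two-argument tracepoint class with TP_PROTO(struct sk_buff *skb, unsigned int len): COUNT_ARGS(skb, len) evaluates to 2, CONCATENATE() therefore selects bpf_trace_run2, and CAST_TO_U64() becomes one __CAST_TO_U64() per argument (left unexpanded for readability):

static notrace void
__bpf_trace_example(void *__data, struct sk_buff *skb, unsigned int len)
{
	struct bpf_prog *prog = __data;

	/* CONCATENATE(bpf_trace_run, COUNT_ARGS(skb, len))(prog,
	 * CAST_TO_U64(skb, len)) expands to:
	 */
	bpf_trace_run2(prog, __CAST_TO_U64(skb), __CAST_TO_U64(len));
}
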
diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h index d9e3d4aa3f6e..cb30c5532144 100644 --- a/include/trace/define_trace.h +++ b/include/trace/define_trace.h @@ -95,6 +95,7 @@ #ifdef TRACEPOINTS_ENABLED #include <trace/trace_events.h> #include <trace/perf.h> +#include <trace/bpf_probe.h> #endif #undef TRACE_EVENT diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 06c87f9f720c..795698925d20 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -491,7 +491,7 @@ DEFINE_EVENT(f2fs__truncate_node, f2fs_truncate_node, TRACE_EVENT(f2fs_truncate_partial_nodes, - TP_PROTO(struct inode *inode, nid_t nid[], int depth, int err), + TP_PROTO(struct inode *inode, nid_t *nid, int depth, int err), TP_ARGS(inode, nid, depth, err), diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 18b7c510c511..c5ec89732a8d 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -94,6 +94,7 @@ enum bpf_cmd { BPF_MAP_GET_FD_BY_ID, BPF_OBJ_GET_INFO_BY_FD, BPF_PROG_QUERY, + BPF_RAW_TRACEPOINT_OPEN, }; enum bpf_map_type { @@ -134,6 +135,8 @@ enum bpf_prog_type { BPF_PROG_TYPE_SK_SKB, BPF_PROG_TYPE_CGROUP_DEVICE, BPF_PROG_TYPE_SK_MSG, + BPF_PROG_TYPE_RAW_TRACEPOINT, + BPF_PROG_TYPE_CGROUP_SOCK_ADDR, }; enum bpf_attach_type { @@ -145,6 +148,12 @@ enum bpf_attach_type { BPF_SK_SKB_STREAM_VERDICT, BPF_CGROUP_DEVICE, BPF_SK_MSG_VERDICT, + BPF_CGROUP_INET4_BIND, + BPF_CGROUP_INET6_BIND, + BPF_CGROUP_INET4_CONNECT, + BPF_CGROUP_INET6_CONNECT, + BPF_CGROUP_INET4_POST_BIND, + BPF_CGROUP_INET6_POST_BIND, __MAX_BPF_ATTACH_TYPE }; @@ -294,6 +303,11 @@ union bpf_attr { __u32 prog_flags; char prog_name[BPF_OBJ_NAME_LEN]; __u32 prog_ifindex; /* ifindex of netdev to prep for */ + /* For some prog types expected attach type must be known at + * load time to verify attach type specific parts of prog + * (context accesses, allowed helpers, etc.). + */ + __u32 expected_attach_type; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ @@ -344,6 +358,11 @@ union bpf_attr { __aligned_u64 prog_ids; __u32 prog_cnt; } query; + + struct { + __u64 name; + __u32 prog_fd; + } raw_tracepoint; } __attribute__((aligned(8))); /* BPF helper function descriptions: @@ -729,6 +748,13 @@ union bpf_attr { * @flags: reserved for future use * Return: SK_PASS * + * int bpf_bind(ctx, addr, addr_len) + * Bind socket to address. Only binding to IP is supported, no port can be + * set in addr. + * @ctx: pointer to context of type bpf_sock_addr + * @addr: pointer to struct sockaddr to bind socket to + * @addr_len: length of sockaddr structure + * Return: 0 on success or negative error code */
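For context, a BPF_PROG_TYPE_CGROUP_SOCK_ADDR program attached at BPF_CGROUP_INET4_CONNECT would use the new helper roughly like this (a sketch in the style of the accompanying selftests; SRC_IP4, the SEC() annotation and the bpf_bind() stub come from the usual selftest scaffolding, not from this patch):

SEC("cgroup/connect4")
int connect_v4_prog(struct bpf_sock_addr *ctx)
{
	struct sockaddr_in sa = {};

	sa.sin_family = AF_INET;
	sa.sin_addr.s_addr = SRC_IP4;	/* pin the source IP; port stays 0 */

	/* Returning 0 makes the cgroup hook fail connect() with -EPERM;
	 * returning 1 lets the syscall proceed.
	 */
	if (bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa)))
		return 0;

	return 1;
}
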
#define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -794,7 +820,8 @@ union bpf_attr { FN(msg_redirect_map), \ FN(msg_apply_bytes), \ FN(msg_cork_bytes), \ - FN(msg_pull_data), + FN(msg_pull_data), \ + FN(bind), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -923,6 +950,15 @@ struct bpf_sock { __u32 protocol; __u32 mark; __u32 priority; + __u32 src_ip4; /* Allows 1,2,4-byte read. + * Stored in network byte order. + */ + __u32 src_ip6[4]; /* Allows 1,2,4-byte read. + * Stored in network byte order. + */ + __u32 src_port; /* Allows 4-byte read. + * Stored in host byte order + */ }; #define XDP_PACKET_HEADROOM 256 @@ -998,6 +1034,26 @@ struct bpf_map_info { __u64 netns_ino; } __attribute__((aligned(8))); +/* User bpf_sock_addr struct to access socket fields and sockaddr struct passed + * by user and intended to be used by socket (e.g. to bind to, depends on + * attach type). + */ +struct bpf_sock_addr { + __u32 user_family; /* Allows 4-byte read, but no write. */ + __u32 user_ip4; /* Allows 1,2,4-byte read and 4-byte write. + * Stored in network byte order. + */ + __u32 user_ip6[4]; /* Allows 1,2,4-byte read and 4-byte write. + * Stored in network byte order. + */ + __u32 user_port; /* Allows 4-byte read and write. + * Stored in network byte order + */ + __u32 family; /* Allows 4-byte read, but no write */ + __u32 type; /* Allows 4-byte read, but no write */ + __u32 protocol; /* Allows 4-byte read, but no write */ +}; + /* User bpf_sock_ops struct to access socket values and specify request ops * and their replies. * Some of these fields are in network (bigendian) byte order and may need @@ -1152,4 +1208,8 @@ struct bpf_cgroup_dev_ctx { __u32 minor; }; +struct bpf_raw_tracepoint_args { + __u64 args[0]; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index c1c0b60d3f2f..43171a0bb02b 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -495,6 +495,42 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk, EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); /** + * __cgroup_bpf_run_filter_sock_addr() - Run a program on a sock and + * a sockaddr provided by user + * @sk: sock struct that will use sockaddr + * @uaddr: sockaddr struct provided by user + * @type: The type of program to be executed + * + * Socket is expected to be of type INET or INET6. + * + * This function will return %-EPERM if an attached program is found and + * returned value != 1 during execution. In all other cases, 0 is returned. + */ +int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, + struct sockaddr *uaddr, + enum bpf_attach_type type) +{ + struct bpf_sock_addr_kern ctx = { + .sk = sk, + .uaddr = uaddr, + }; + struct cgroup *cgrp; + int ret; + + /* Check socket family since not all sockets represent a network + * endpoint (e.g. AF_UNIX). + */ + if (sk->sk_family != AF_INET && sk->sk_family != AF_INET6) + return 0; + + cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); + ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], &ctx, BPF_PROG_RUN); + + return ret == 1 ? 0 : -EPERM; +} +EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr); + +/** * __cgroup_bpf_run_filter_sock_ops() - Run a program on a sock * @sk: socket to get cgroup from * @sock_ops: bpf_sock_ops_kern struct to pass to program.
Contains @@ -545,7 +581,7 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor, EXPORT_SYMBOL(__cgroup_bpf_check_dev_permission); static const struct bpf_func_proto * -cgroup_dev_func_proto(enum bpf_func_id func_id) +cgroup_dev_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_map_lookup_elem: @@ -566,6 +602,7 @@ cgroup_dev_func_proto(enum bpf_func_id func_id) static bool cgroup_dev_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); diff --git a/kernel/bpf/disasm.c b/kernel/bpf/disasm.c index 8740406df2cd..d6b76377cb6e 100644 --- a/kernel/bpf/disasm.c +++ b/kernel/bpf/disasm.c @@ -113,16 +113,16 @@ static const char *const bpf_jmp_string[16] = { }; static void print_bpf_end_insn(bpf_insn_print_t verbose, - struct bpf_verifier_env *env, + void *private_data, const struct bpf_insn *insn) { - verbose(env, "(%02x) r%d = %s%d r%d\n", insn->code, insn->dst_reg, + verbose(private_data, "(%02x) r%d = %s%d r%d\n", + insn->code, insn->dst_reg, BPF_SRC(insn->code) == BPF_TO_BE ? "be" : "le", insn->imm, insn->dst_reg); } void print_bpf_insn(const struct bpf_insn_cbs *cbs, - struct bpf_verifier_env *env, const struct bpf_insn *insn, bool allow_ptr_leaks) { @@ -132,23 +132,23 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, if (class == BPF_ALU || class == BPF_ALU64) { if (BPF_OP(insn->code) == BPF_END) { if (class == BPF_ALU64) - verbose(env, "BUG_alu64_%02x\n", insn->code); + verbose(cbs->private_data, "BUG_alu64_%02x\n", insn->code); else - print_bpf_end_insn(verbose, env, insn); + print_bpf_end_insn(verbose, cbs->private_data, insn); } else if (BPF_OP(insn->code) == BPF_NEG) { - verbose(env, "(%02x) r%d = %s-r%d\n", + verbose(cbs->private_data, "(%02x) r%d = %s-r%d\n", insn->code, insn->dst_reg, class == BPF_ALU ? "(u32) " : "", insn->dst_reg); } else if (BPF_SRC(insn->code) == BPF_X) { - verbose(env, "(%02x) %sr%d %s %sr%d\n", + verbose(cbs->private_data, "(%02x) %sr%d %s %sr%d\n", insn->code, class == BPF_ALU ? "(u32) " : "", insn->dst_reg, bpf_alu_string[BPF_OP(insn->code) >> 4], class == BPF_ALU ? "(u32) " : "", insn->src_reg); } else { - verbose(env, "(%02x) %sr%d %s %s%d\n", + verbose(cbs->private_data, "(%02x) %sr%d %s %s%d\n", insn->code, class == BPF_ALU ? 
"(u32) " : "", insn->dst_reg, bpf_alu_string[BPF_OP(insn->code) >> 4], @@ -157,46 +157,46 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, } } else if (class == BPF_STX) { if (BPF_MODE(insn->code) == BPF_MEM) - verbose(env, "(%02x) *(%s *)(r%d %+d) = r%d\n", + verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = r%d\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); else if (BPF_MODE(insn->code) == BPF_XADD) - verbose(env, "(%02x) lock *(%s *)(r%d %+d) += r%d\n", + verbose(cbs->private_data, "(%02x) lock *(%s *)(r%d %+d) += r%d\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->src_reg); else - verbose(env, "BUG_%02x\n", insn->code); + verbose(cbs->private_data, "BUG_%02x\n", insn->code); } else if (class == BPF_ST) { if (BPF_MODE(insn->code) != BPF_MEM) { - verbose(env, "BUG_st_%02x\n", insn->code); + verbose(cbs->private_data, "BUG_st_%02x\n", insn->code); return; } - verbose(env, "(%02x) *(%s *)(r%d %+d) = %d\n", + verbose(cbs->private_data, "(%02x) *(%s *)(r%d %+d) = %d\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->dst_reg, insn->off, insn->imm); } else if (class == BPF_LDX) { if (BPF_MODE(insn->code) != BPF_MEM) { - verbose(env, "BUG_ldx_%02x\n", insn->code); + verbose(cbs->private_data, "BUG_ldx_%02x\n", insn->code); return; } - verbose(env, "(%02x) r%d = *(%s *)(r%d %+d)\n", + verbose(cbs->private_data, "(%02x) r%d = *(%s *)(r%d %+d)\n", insn->code, insn->dst_reg, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->src_reg, insn->off); } else if (class == BPF_LD) { if (BPF_MODE(insn->code) == BPF_ABS) { - verbose(env, "(%02x) r0 = *(%s *)skb[%d]\n", + verbose(cbs->private_data, "(%02x) r0 = *(%s *)skb[%d]\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->imm); } else if (BPF_MODE(insn->code) == BPF_IND) { - verbose(env, "(%02x) r0 = *(%s *)skb[r%d + %d]\n", + verbose(cbs->private_data, "(%02x) r0 = *(%s *)skb[r%d + %d]\n", insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->src_reg, insn->imm); @@ -212,12 +212,12 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, if (map_ptr && !allow_ptr_leaks) imm = 0; - verbose(env, "(%02x) r%d = %s\n", + verbose(cbs->private_data, "(%02x) r%d = %s\n", insn->code, insn->dst_reg, __func_imm_name(cbs, insn, imm, tmp, sizeof(tmp))); } else { - verbose(env, "BUG_ld_%02x\n", insn->code); + verbose(cbs->private_data, "BUG_ld_%02x\n", insn->code); return; } } else if (class == BPF_JMP) { @@ -227,35 +227,35 @@ void print_bpf_insn(const struct bpf_insn_cbs *cbs, char tmp[64]; if (insn->src_reg == BPF_PSEUDO_CALL) { - verbose(env, "(%02x) call pc%s\n", + verbose(cbs->private_data, "(%02x) call pc%s\n", insn->code, __func_get_name(cbs, insn, tmp, sizeof(tmp))); } else { strcpy(tmp, "unknown"); - verbose(env, "(%02x) call %s#%d\n", insn->code, + verbose(cbs->private_data, "(%02x) call %s#%d\n", insn->code, __func_get_name(cbs, insn, tmp, sizeof(tmp)), insn->imm); } } else if (insn->code == (BPF_JMP | BPF_JA)) { - verbose(env, "(%02x) goto pc%+d\n", + verbose(cbs->private_data, "(%02x) goto pc%+d\n", insn->code, insn->off); } else if (insn->code == (BPF_JMP | BPF_EXIT)) { - verbose(env, "(%02x) exit\n", insn->code); + verbose(cbs->private_data, "(%02x) exit\n", insn->code); } else if (BPF_SRC(insn->code) == BPF_X) { - verbose(env, "(%02x) if r%d %s r%d goto pc%+d\n", + verbose(cbs->private_data, "(%02x) if r%d %s r%d goto pc%+d\n", insn->code, insn->dst_reg, bpf_jmp_string[BPF_OP(insn->code) >> 4], insn->src_reg, 
insn->off); } else { - verbose(env, "(%02x) if r%d %s 0x%x goto pc%+d\n", + verbose(cbs->private_data, "(%02x) if r%d %s 0x%x goto pc%+d\n", insn->code, insn->dst_reg, bpf_jmp_string[BPF_OP(insn->code) >> 4], insn->imm, insn->off); } } else { - verbose(env, "(%02x) %s\n", + verbose(cbs->private_data, "(%02x) %s\n", insn->code, bpf_class_string[class]); } } diff --git a/kernel/bpf/disasm.h b/kernel/bpf/disasm.h index 266fe8ee542b..e1324a834a24 100644 --- a/kernel/bpf/disasm.h +++ b/kernel/bpf/disasm.h @@ -22,14 +22,12 @@ #include <string.h> #endif -struct bpf_verifier_env; - extern const char *const bpf_alu_string[16]; extern const char *const bpf_class_string[8]; const char *func_id_name(int id); -typedef __printf(2, 3) void (*bpf_insn_print_t)(struct bpf_verifier_env *env, +typedef __printf(2, 3) void (*bpf_insn_print_t)(void *private_data, const char *, ...); typedef const char *(*bpf_insn_revmap_call_t)(void *private_data, const struct bpf_insn *insn); @@ -45,7 +43,6 @@ struct bpf_insn_cbs { }; void print_bpf_insn(const struct bpf_insn_cbs *cbs, - struct bpf_verifier_env *env, const struct bpf_insn *insn, bool allow_ptr_leaks); #endif diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 69c5bccabd22..d2bda5aa25d7 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -41,6 +41,8 @@ #include <linux/mm.h> #include <net/strparser.h> #include <net/tcp.h> +#include <linux/ptr_ring.h> +#include <net/inet_common.h> #define SOCK_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) @@ -82,6 +84,7 @@ struct smap_psock { int sg_size; int eval; struct sk_msg_buff *cork; + struct list_head ingress; struct strparser strp; struct bpf_prog *bpf_tx_msg; @@ -103,6 +106,8 @@ struct smap_psock { }; static void smap_release_sock(struct smap_psock *psock, struct sock *sock); +static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + int nonblock, int flags, int *addr_len); static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); static int bpf_tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); @@ -112,6 +117,21 @@ static inline struct smap_psock *smap_psock_sk(const struct sock *sk) return rcu_dereference_sk_user_data(sk); } +static bool bpf_tcp_stream_read(const struct sock *sk) +{ + struct smap_psock *psock; + bool empty = true; + + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) + goto out; + empty = list_empty(&psock->ingress); +out: + rcu_read_unlock(); + return !empty; +} + static struct proto tcp_bpf_proto; static int bpf_tcp_init(struct sock *sk) { @@ -135,6 +155,8 @@ static int bpf_tcp_init(struct sock *sk) if (psock->bpf_tx_msg) { tcp_bpf_proto.sendmsg = bpf_tcp_sendmsg; tcp_bpf_proto.sendpage = bpf_tcp_sendpage; + tcp_bpf_proto.recvmsg = bpf_tcp_recvmsg; + tcp_bpf_proto.stream_memory_read = bpf_tcp_stream_read; } sk->sk_prot = &tcp_bpf_proto; @@ -170,6 +192,7 @@ static void bpf_tcp_close(struct sock *sk, long timeout) { void (*close_fun)(struct sock *sk, long timeout); struct smap_psock_map_entry *e, *tmp; + struct sk_msg_buff *md, *mtmp; struct smap_psock *psock; struct sock *osk; @@ -188,6 +211,12 @@ static void bpf_tcp_close(struct sock *sk, long timeout) close_fun = psock->save_close; write_lock_bh(&sk->sk_callback_lock); + list_for_each_entry_safe(md, mtmp, &psock->ingress, list) { + list_del(&md->list); + free_start_sg(psock->sock, md); + kfree(md); + } + list_for_each_entry_safe(e, tmp, &psock->maps, list) { osk = cmpxchg(e->entry, sk, NULL); if (osk == sk) { @@ 
-312,7 +341,7 @@ retry: md->sg_start++; if (md->sg_start == MAX_SKB_FRAGS) md->sg_start = 0; - memset(sg, 0, sizeof(*sg)); + sg_init_table(sg, 1); if (md->sg_start == md->sg_end) break; @@ -468,6 +497,72 @@ verdict: return _rc; } +static int bpf_tcp_ingress(struct sock *sk, int apply_bytes, + struct smap_psock *psock, + struct sk_msg_buff *md, int flags) +{ + bool apply = apply_bytes; + size_t size, copied = 0; + struct sk_msg_buff *r; + int err = 0, i; + + r = kzalloc(sizeof(struct sk_msg_buff), __GFP_NOWARN | GFP_KERNEL); + if (unlikely(!r)) + return -ENOMEM; + + lock_sock(sk); + r->sg_start = md->sg_start; + i = md->sg_start; + + do { + r->sg_data[i] = md->sg_data[i]; + + size = (apply && apply_bytes < md->sg_data[i].length) ? + apply_bytes : md->sg_data[i].length; + + if (!sk_wmem_schedule(sk, size)) { + if (!copied) + err = -ENOMEM; + break; + } + + sk_mem_charge(sk, size); + r->sg_data[i].length = size; + md->sg_data[i].length -= size; + md->sg_data[i].offset += size; + copied += size; + + if (md->sg_data[i].length) { + get_page(sg_page(&r->sg_data[i])); + r->sg_end = (i + 1) == MAX_SKB_FRAGS ? 0 : i + 1; + } else { + i++; + if (i == MAX_SKB_FRAGS) + i = 0; + r->sg_end = i; + } + + if (apply) { + apply_bytes -= size; + if (!apply_bytes) + break; + } + } while (i != md->sg_end); + + md->sg_start = i; + + if (!err) { + list_add_tail(&r->list, &psock->ingress); + sk->sk_data_ready(sk); + } else { + free_start_sg(sk, r); + kfree(r); + } + + release_sock(sk); + return err; +} + static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send, struct sk_msg_buff *md, int flags) @@ -475,6 +570,7 @@ static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send, struct smap_psock *psock; struct scatterlist *sg; int i, err, free = 0; + bool ingress = !!(md->flags & BPF_F_INGRESS); sg = md->sg_data; @@ -487,9 +583,14 @@ static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send, goto out_rcu; rcu_read_unlock(); - lock_sock(sk); - err = bpf_tcp_push(sk, send, md, flags, false); - release_sock(sk); + + if (ingress) { + err = bpf_tcp_ingress(sk, send, psock, md, flags); + } else { + lock_sock(sk); + err = bpf_tcp_push(sk, send, md, flags, false); + release_sock(sk); + } smap_release_sock(psock, sk); if (unlikely(err)) goto out; @@ -623,6 +724,92 @@ out_err: return err; } +static int bpf_tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + int nonblock, int flags, int *addr_len) +{ + struct iov_iter *iter = &msg->msg_iter; + struct smap_psock *psock; + int copied = 0; + + if (unlikely(flags & MSG_ERRQUEUE)) + return inet_recv_error(sk, msg, len, addr_len); + + rcu_read_lock(); + psock = smap_psock_sk(sk); + if (unlikely(!psock)) + goto out; + + if (unlikely(!refcount_inc_not_zero(&psock->refcnt))) + goto out; + rcu_read_unlock(); + + if (!skb_queue_empty(&sk->sk_receive_queue)) + return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); + + lock_sock(sk); + while (copied != len) { + struct scatterlist *sg; + struct sk_msg_buff *md; + int i; + + md = list_first_entry_or_null(&psock->ingress, + struct sk_msg_buff, list); + if (unlikely(!md)) + break; + i = md->sg_start; + do { + struct page *page; + int n, copy; + + sg = &md->sg_data[i]; + copy = sg->length; + page = sg_page(sg); + + if (copied + copy > len) + copy = len - copied; + + n = copy_page_to_iter(page, sg->offset, copy, iter); + if (n != copy) { + md->sg_start = i; + release_sock(sk); + smap_release_sock(psock, sk); + return -EFAULT; + } + + copied += copy; + sg->offset += copy; + sg->length -= copy; + 
sk_mem_uncharge(sk, copy); + + if (!sg->length) { + i++; + if (i == MAX_SKB_FRAGS) + i = 0; + if (!md->skb) + put_page(page); + } + if (copied == len) + break; + } while (i != md->sg_end); + md->sg_start = i; + + if (!sg->length && md->sg_start == md->sg_end) { + list_del(&md->list); + if (md->skb) + consume_skb(md->skb); + kfree(md); + } + } + + release_sock(sk); + smap_release_sock(psock, sk); + return copied; +out: + rcu_read_unlock(); + return tcp_recvmsg(sk, msg, len, nonblock, flags, addr_len); +} + + static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) { int flags = msg->msg_flags | MSG_NO_SHARED_FRAGS; @@ -656,7 +843,7 @@ static int bpf_tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) } sg = md.sg_data; - sg_init_table(sg, MAX_SKB_FRAGS); + sg_init_marker(sg, MAX_SKB_FRAGS); rcu_read_unlock(); lock_sock(sk); @@ -763,10 +950,14 @@ static int bpf_tcp_sendpage(struct sock *sk, struct page *page, lock_sock(sk); - if (psock->cork_bytes) + if (psock->cork_bytes) { m = psock->cork; - else + sg = &m->sg_data[m->sg_end]; + } else { m = &md; + sg = m->sg_data; + sg_init_marker(sg, MAX_SKB_FRAGS); + } /* Catch case where ring is full and sendpage is stalled. */ if (unlikely(m->sg_end == m->sg_start && @@ -774,7 +965,6 @@ static int bpf_tcp_sendpage(struct sock *sk, struct page *page, goto out_err; psock->sg_size += size; - sg = &m->sg_data[m->sg_end]; sg_set_page(sg, page, size, offset); get_page(page); m->sg_copy[m->sg_end] = true; @@ -861,27 +1051,72 @@ static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb) __SK_DROP; } +static int smap_do_ingress(struct smap_psock *psock, struct sk_buff *skb) +{ + struct sock *sk = psock->sock; + int copied = 0, num_sg; + struct sk_msg_buff *r; + + r = kzalloc(sizeof(struct sk_msg_buff), __GFP_NOWARN | GFP_ATOMIC); + if (unlikely(!r)) + return -EAGAIN; + + if (!sk_rmem_schedule(sk, skb, skb->len)) { + kfree(r); + return -EAGAIN; + } + + sg_init_table(r->sg_data, MAX_SKB_FRAGS); + num_sg = skb_to_sgvec(skb, r->sg_data, 0, skb->len); + if (unlikely(num_sg < 0)) { + kfree(r); + return num_sg; + } + sk_mem_charge(sk, skb->len); + copied = skb->len; + r->sg_start = 0; + r->sg_end = num_sg == MAX_SKB_FRAGS ? 
0 : num_sg; + r->skb = skb; + list_add_tail(&r->list, &psock->ingress); + sk->sk_data_ready(sk); + return copied; +} + static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb) { + struct smap_psock *peer; struct sock *sk; + __u32 in; int rc; rc = smap_verdict_func(psock, skb); switch (rc) { case __SK_REDIRECT: sk = do_sk_redirect_map(skb); - if (likely(sk)) { - struct smap_psock *peer = smap_psock_sk(sk); - - if (likely(peer && - test_bit(SMAP_TX_RUNNING, &peer->state) && - !sock_flag(sk, SOCK_DEAD) && - sock_writeable(sk))) { - skb_set_owner_w(skb, sk); - skb_queue_tail(&peer->rxqueue, skb); - schedule_work(&peer->tx_work); - break; - } + if (!sk) { + kfree_skb(skb); + break; + } + + peer = smap_psock_sk(sk); + in = (TCP_SKB_CB(skb)->bpf.flags) & BPF_F_INGRESS; + + if (unlikely(!peer || sock_flag(sk, SOCK_DEAD) || + !test_bit(SMAP_TX_RUNNING, &peer->state))) { + kfree_skb(skb); + break; + } + + if (!in && sock_writeable(sk)) { + skb_set_owner_w(skb, sk); + skb_queue_tail(&peer->rxqueue, skb); + schedule_work(&peer->tx_work); + break; + } else if (in && + atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) { + skb_queue_tail(&peer->rxqueue, skb); + schedule_work(&peer->tx_work); + break; } /* Fall through and free skb otherwise */ case __SK_DROP: @@ -943,15 +1178,23 @@ static void smap_tx_work(struct work_struct *w) } while ((skb = skb_dequeue(&psock->rxqueue))) { + __u32 flags; + rem = skb->len; off = 0; start: + flags = (TCP_SKB_CB(skb)->bpf.flags) & BPF_F_INGRESS; do { - if (likely(psock->sock->sk_socket)) - n = skb_send_sock_locked(psock->sock, - skb, off, rem); - else + if (likely(psock->sock->sk_socket)) { + if (flags) + n = smap_do_ingress(psock, skb); + else + n = skb_send_sock_locked(psock->sock, + skb, off, rem); + } else { n = -EINVAL; + } + if (n <= 0) { if (n == -EAGAIN) { /* Retry when space is available */ @@ -969,7 +1212,9 @@ start: rem -= n; off += n; } while (rem); - kfree_skb(skb); + + if (!flags) + kfree_skb(skb); } out: release_sock(psock->sock); @@ -1107,6 +1352,7 @@ static void sock_map_remove_complete(struct bpf_stab *stab) static void smap_gc_work(struct work_struct *w) { struct smap_psock_map_entry *e, *tmp; + struct sk_msg_buff *md, *mtmp; struct smap_psock *psock; psock = container_of(w, struct smap_psock, gc_work); @@ -1131,6 +1377,12 @@ static void smap_gc_work(struct work_struct *w) kfree(psock->cork); } + list_for_each_entry_safe(md, mtmp, &psock->ingress, list) { + list_del(&md->list); + free_start_sg(psock->sock, md); + kfree(md); + } + list_for_each_entry_safe(e, tmp, &psock->maps, list) { list_del(&e->list); kfree(e); @@ -1160,6 +1412,7 @@ static struct smap_psock *smap_init_psock(struct sock *sock, INIT_WORK(&psock->tx_work, smap_tx_work); INIT_WORK(&psock->gc_work, smap_gc_work); INIT_LIST_HEAD(&psock->maps); + INIT_LIST_HEAD(&psock->ingress); refcount_set(&psock->refcnt, 1); rcu_assign_sk_user_data(sock, psock); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index dd172ee16716..0244973ee544 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -203,11 +203,13 @@ static int bpf_map_alloc_id(struct bpf_map *map) { int id; + idr_preload(GFP_KERNEL); spin_lock_bh(&map_idr_lock); id = idr_alloc_cyclic(&map_idr, map, 1, INT_MAX, GFP_ATOMIC); if (id > 0) map->id = id; spin_unlock_bh(&map_idr_lock); + idr_preload_end(); if (WARN_ON_ONCE(!id)) return -ENOSPC; @@ -940,11 +942,13 @@ static int bpf_prog_alloc_id(struct bpf_prog *prog) { int id; + idr_preload(GFP_KERNEL); spin_lock_bh(&prog_idr_lock); id = 
idr_alloc_cyclic(&prog_idr, prog, 1, INT_MAX, GFP_ATOMIC); if (id > 0) prog->aux->id = id; spin_unlock_bh(&prog_idr_lock); + idr_preload_end(); /* id is in [1, INT_MAX) */ if (WARN_ON_ONCE(!id)) @@ -1167,8 +1171,75 @@ struct bpf_prog *bpf_prog_get_type_dev(u32 ufd, enum bpf_prog_type type, } EXPORT_SYMBOL_GPL(bpf_prog_get_type_dev); +/* Initially all BPF programs could be loaded w/o specifying + * expected_attach_type. Later for some of them specifying expected_attach_type + * at load time became required so that the program could be validated properly. + * Programs of types that are allowed to be loaded both w/ and w/o (for + * backward compatibility) expected_attach_type should have the default attach + * type assigned to expected_attach_type for the latter case, so that it can be + * validated later at attach time. + * + * bpf_prog_load_fixup_attach_type() sets expected_attach_type in @attr if + * prog type requires it but has some attach types that have to be backward + * compatible. + */ +static void bpf_prog_load_fixup_attach_type(union bpf_attr *attr) +{ + switch (attr->prog_type) { + case BPF_PROG_TYPE_CGROUP_SOCK: + /* Unfortunately BPF_ATTACH_TYPE_UNSPEC enumeration doesn't + * exist so checking for non-zero is the way to go here. + */ + if (!attr->expected_attach_type) + attr->expected_attach_type = + BPF_CGROUP_INET_SOCK_CREATE; + break; + } +} + +static int +bpf_prog_load_check_attach_type(enum bpf_prog_type prog_type, + enum bpf_attach_type expected_attach_type) +{ + switch (prog_type) { + case BPF_PROG_TYPE_CGROUP_SOCK: + switch (expected_attach_type) { + case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_INET4_POST_BIND: + case BPF_CGROUP_INET6_POST_BIND: + return 0; + default: + return -EINVAL; + } + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + switch (expected_attach_type) { + case BPF_CGROUP_INET4_BIND: + case BPF_CGROUP_INET6_BIND: + case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET6_CONNECT: + return 0; + default: + return -EINVAL; + } + default: + return 0; + } +} + +static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog, + enum bpf_attach_type attach_type) +{ + switch (prog->type) { + case BPF_PROG_TYPE_CGROUP_SOCK: + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + return attach_type == prog->expected_attach_type ? 0 : -EINVAL; + default: + return 0; + } +} +
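From user space the resulting contract looks as follows (an illustrative raw-syscall sketch; insns and insn_cnt are assumed to hold an already prepared program):

union bpf_attr attr = {};

attr.prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
attr.expected_attach_type = BPF_CGROUP_INET4_CONNECT;
attr.insns = (__u64)(unsigned long)insns;
attr.insn_cnt = insn_cnt;
attr.license = (__u64)(unsigned long)"GPL";

int prog_fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
/* Attaching prog_fd with any attach_type other than
 * BPF_CGROUP_INET4_CONNECT is now rejected with -EINVAL by
 * bpf_prog_attach_check_attach_type() above.
 */
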
/* last field in 'union bpf_attr' used by this command */ -#define BPF_PROG_LOAD_LAST_FIELD prog_ifindex +#define BPF_PROG_LOAD_LAST_FIELD expected_attach_type static int bpf_prog_load(union bpf_attr *attr) { @@ -1205,11 +1276,17 @@ static int bpf_prog_load(union bpf_attr *attr) !capable(CAP_SYS_ADMIN)) return -EPERM; + bpf_prog_load_fixup_attach_type(attr); + if (bpf_prog_load_check_attach_type(type, attr->expected_attach_type)) + return -EINVAL; + /* plain bpf_prog allocation */ prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); if (!prog) return -ENOMEM; + prog->expected_attach_type = attr->expected_attach_type; + prog->aux->offload_requested = !!attr->prog_ifindex; err = security_bpf_prog_alloc(prog->aux); @@ -1311,6 +1388,81 @@ static int bpf_obj_get(const union bpf_attr *attr) attr->file_flags); } +struct bpf_raw_tracepoint { + struct bpf_raw_event_map *btp; + struct bpf_prog *prog; +}; + +static int bpf_raw_tracepoint_release(struct inode *inode, struct file *filp) +{ + struct bpf_raw_tracepoint *raw_tp = filp->private_data; + + if (raw_tp->prog) { + bpf_probe_unregister(raw_tp->btp, raw_tp->prog); + bpf_prog_put(raw_tp->prog); + } + kfree(raw_tp); + return 0; +} + +static const struct file_operations bpf_raw_tp_fops = { + .release = bpf_raw_tracepoint_release, + .read = bpf_dummy_read, + .write = bpf_dummy_write, +}; + +#define BPF_RAW_TRACEPOINT_OPEN_LAST_FIELD raw_tracepoint.prog_fd + +static int bpf_raw_tracepoint_open(const union bpf_attr *attr) +{ + struct bpf_raw_tracepoint *raw_tp; + struct bpf_raw_event_map *btp; + struct bpf_prog *prog; + char tp_name[128]; + int tp_fd, err; + + if (strncpy_from_user(tp_name, u64_to_user_ptr(attr->raw_tracepoint.name), + sizeof(tp_name) - 1) < 0) + return -EFAULT; + tp_name[sizeof(tp_name) - 1] = 0; + + btp = bpf_find_raw_tracepoint(tp_name); + if (!btp) + return -ENOENT; + + raw_tp = kzalloc(sizeof(*raw_tp), GFP_USER); + if (!raw_tp) + return -ENOMEM; + raw_tp->btp = btp; + + prog = bpf_prog_get_type(attr->raw_tracepoint.prog_fd, + BPF_PROG_TYPE_RAW_TRACEPOINT); + if (IS_ERR(prog)) { + err = PTR_ERR(prog); + goto out_free_tp; + } + + err = bpf_probe_register(raw_tp->btp, prog); + if (err) + goto out_put_prog; + + raw_tp->prog = prog; + tp_fd = anon_inode_getfd("bpf-raw-tracepoint", &bpf_raw_tp_fops, raw_tp, + O_CLOEXEC); + if (tp_fd < 0) { + bpf_probe_unregister(raw_tp->btp, prog); + err = tp_fd; + goto out_put_prog; + } + return tp_fd; + +out_put_prog: + bpf_prog_put(prog); +out_free_tp: + kfree(raw_tp); + return err; +} + #ifdef CONFIG_CGROUP_BPF #define BPF_PROG_ATTACH_LAST_FIELD attach_flags @@ -1374,8 +1526,16 @@ static int bpf_prog_attach(const union bpf_attr *attr) ptype = BPF_PROG_TYPE_CGROUP_SKB; break; case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_INET4_POST_BIND: + case BPF_CGROUP_INET6_POST_BIND: ptype = BPF_PROG_TYPE_CGROUP_SOCK; break; + case BPF_CGROUP_INET4_BIND: + case BPF_CGROUP_INET6_BIND: + case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET6_CONNECT: + ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; + break; case BPF_CGROUP_SOCK_OPS: ptype = BPF_PROG_TYPE_SOCK_OPS; break; @@ -1395,6 +1555,11 @@ static int bpf_prog_attach(const union bpf_attr *attr) if (IS_ERR(prog)) return PTR_ERR(prog); + if (bpf_prog_attach_check_attach_type(prog, attr->attach_type)) { + bpf_prog_put(prog); + return -EINVAL; + } + cgrp = cgroup_get_from_fd(attr->target_fd); if (IS_ERR(cgrp)) { bpf_prog_put(prog); @@ -1431,8 +1596,16 @@ static int bpf_prog_detach(const union bpf_attr *attr) ptype = BPF_PROG_TYPE_CGROUP_SKB; break; case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_INET4_POST_BIND: + case BPF_CGROUP_INET6_POST_BIND: ptype = BPF_PROG_TYPE_CGROUP_SOCK; break; + case BPF_CGROUP_INET4_BIND: + case BPF_CGROUP_INET6_BIND: + case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET6_CONNECT: + ptype = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; + break; case BPF_CGROUP_SOCK_OPS: ptype = BPF_PROG_TYPE_SOCK_OPS; break; @@ -1482,6 +1655,12 @@ static int bpf_prog_query(const union bpf_attr *attr, case BPF_CGROUP_INET_INGRESS: case BPF_CGROUP_INET_EGRESS: case BPF_CGROUP_INET_SOCK_CREATE: + case BPF_CGROUP_INET4_BIND: + case BPF_CGROUP_INET6_BIND: + case BPF_CGROUP_INET4_POST_BIND: + case BPF_CGROUP_INET6_POST_BIND: + case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET6_CONNECT: case BPF_CGROUP_SOCK_OPS: case BPF_CGROUP_DEVICE: break; @@ -1921,6 +2100,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_OBJ_GET_INFO_BY_FD: err = bpf_obj_get_info_by_fd(&attr, uattr); break; + case BPF_RAW_TRACEPOINT_OPEN: + err = bpf_raw_tracepoint_open(&attr); + break; default: err = -EINVAL; break;
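Opening a raw tracepoint from user space is then a single command against an already loaded BPF_PROG_TYPE_RAW_TRACEPOINT program (sketch, error handling elided; "sched_switch" is just an example tracepoint name):

union bpf_attr attr = {};

attr.raw_tracepoint.name = (__u64)(unsigned long)"sched_switch";
attr.raw_tracepoint.prog_fd = prog_fd;

/* The returned fd pins both the program and the probe registration;
 * closing it ends up in bpf_raw_tracepoint_release() above.
 */
int tp_fd = syscall(__NR_bpf, BPF_RAW_TRACEPOINT_OPEN, &attr, sizeof(attr));
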
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e9f7c20691c1..5dd1dcb902bf 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -168,23 +168,12 @@ struct bpf_call_arg_meta { static DEFINE_MUTEX(bpf_verifier_lock); -/* log_level controls verbosity level of eBPF verifier. - * bpf_verifier_log_write() is used to dump the verification trace to the log, - * so the user can figure out what's wrong with the program - */ -__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, - const char *fmt, ...) +void bpf_verifier_vlog(struct bpf_verifier_log *log, const char *fmt, + va_list args) { - struct bpf_verifer_log *log = &env->log; unsigned int n; - va_list args; - - if (!log->level || !log->ubuf || bpf_verifier_log_full(log)) - return; - va_start(args, fmt); n = vscnprintf(log->kbuf, BPF_VERIFIER_TMP_LOG_SIZE, fmt, args); - va_end(args); WARN_ONCE(n >= BPF_VERIFIER_TMP_LOG_SIZE - 1, "verifier log line truncated - local buffer too short\n"); @@ -197,14 +186,37 @@ __printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, else log->ubuf = NULL; } -EXPORT_SYMBOL_GPL(bpf_verifier_log_write); -/* Historically bpf_verifier_log_write was called verbose, but the name was too - * generic for symbol export. The function was renamed, but not the calls in - * the verifier to avoid complicating backports. Hence the alias below. + +/* log_level controls verbosity level of eBPF verifier. + * bpf_verifier_log_write() is used to dump the verification trace to the log, + * so the user can figure out what's wrong with the program */ -static __printf(2, 3) void verbose(struct bpf_verifier_env *env, - const char *fmt, ...) - __attribute__((alias("bpf_verifier_log_write"))); +__printf(2, 3) void bpf_verifier_log_write(struct bpf_verifier_env *env, + const char *fmt, ...) +{ + va_list args; + + if (!bpf_verifier_log_needed(&env->log)) + return; + + va_start(args, fmt); + bpf_verifier_vlog(&env->log, fmt, args); + va_end(args); +} +EXPORT_SYMBOL_GPL(bpf_verifier_log_write); + +__printf(2, 3) static void verbose(void *private_data, const char *fmt, ...)
+{ + struct bpf_verifier_env *env = private_data; + va_list args; + + if (!bpf_verifier_log_needed(&env->log)) + return; + + va_start(args, fmt); + bpf_verifier_vlog(&env->log, fmt, args); + va_end(args); +} static bool type_is_pkt_pointer(enum bpf_reg_type type) { @@ -1311,7 +1323,7 @@ static int check_ctx_access(struct bpf_verifier_env *env, int insn_idx, int off, }; if (env->ops->is_valid_access && - env->ops->is_valid_access(off, size, t, &info)) { + env->ops->is_valid_access(off, size, t, env->prog, &info)) { /* A non zero info.ctx_field_size indicates that this field is a * candidate for later verifier transformation to load the whole * field and then apply a mask when accessed with a narrower @@ -2337,7 +2349,7 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn } if (env->ops->get_func_proto) - fn = env->ops->get_func_proto(func_id); + fn = env->ops->get_func_proto(func_id, env->prog); if (!fn) { verbose(env, "unknown func %s#%d\n", func_id_name(func_id), func_id); @@ -3875,6 +3887,7 @@ static int check_return_code(struct bpf_verifier_env *env) switch (env->prog->type) { case BPF_PROG_TYPE_CGROUP_SKB: case BPF_PROG_TYPE_CGROUP_SOCK: + case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: case BPF_PROG_TYPE_SOCK_OPS: case BPF_PROG_TYPE_CGROUP_DEVICE: break; @@ -4600,10 +4613,11 @@ static int do_check(struct bpf_verifier_env *env) if (env->log.level) { const struct bpf_insn_cbs cbs = { .cb_print = verbose, + .private_data = env, }; verbose(env, "%d: ", insn_idx); - print_bpf_insn(&cbs, env, insn, env->allow_ptr_leaks); + print_bpf_insn(&cbs, insn, env->allow_ptr_leaks); } if (bpf_prog_is_dev_bound(env->prog->aux)) { @@ -5559,7 +5573,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) insn = new_prog->insnsi + i + delta; } patch_call_imm: - fn = env->ops->get_func_proto(insn->imm); + fn = env->ops->get_func_proto(insn->imm, env->prog); /* all functions that have prototype and verifier allowed * programs to call them, must be real in-kernel functions */ @@ -5601,7 +5615,7 @@ static void free_states(struct bpf_verifier_env *env) int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) { struct bpf_verifier_env *env; - struct bpf_verifer_log *log; + struct bpf_verifier_log *log; int ret = -EINVAL; /* no program is valid */ diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 7f9691c86b6e..d88e96d4e12c 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -524,7 +524,8 @@ static const struct bpf_func_proto bpf_probe_read_str_proto = { .arg3_type = ARG_ANYTHING, }; -static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) +static const struct bpf_func_proto * +tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_map_lookup_elem: @@ -568,7 +569,8 @@ static const struct bpf_func_proto *tracing_func_proto(enum bpf_func_id func_id) } } -static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func_id) +static const struct bpf_func_proto * +kprobe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_perf_event_output: @@ -582,12 +584,13 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return &bpf_override_return_proto; #endif default: - return tracing_func_proto(func_id); + return tracing_func_proto(func_id, prog); } } /* bpf+kprobe programs can access fields of 'struct pt_regs' */ static bool kprobe_prog_is_valid_access(int off, int size, enum 
bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (off < 0 || off >= sizeof(struct pt_regs)) @@ -661,7 +664,8 @@ static const struct bpf_func_proto bpf_get_stackid_proto_tp = { .arg3_type = ARG_ANYTHING, }; -static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id) +static const struct bpf_func_proto * +tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_perf_event_output: @@ -669,11 +673,12 @@ static const struct bpf_func_proto *tp_prog_func_proto(enum bpf_func_id func_id) case BPF_FUNC_get_stackid: return &bpf_get_stackid_proto_tp; default: - return tracing_func_proto(func_id); + return tracing_func_proto(func_id, prog); } } static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (off < sizeof(void *) || off >= PERF_MAX_TRACE_SIZE) @@ -721,7 +726,8 @@ static const struct bpf_func_proto bpf_perf_prog_read_value_proto = { .arg3_type = ARG_CONST_SIZE, }; -static const struct bpf_func_proto *pe_prog_func_proto(enum bpf_func_id func_id) +static const struct bpf_func_proto * +pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_perf_event_output: @@ -731,11 +737,94 @@ static const struct bpf_func_proto *pe_prog_func_proto(enum bpf_func_id func_id) case BPF_FUNC_perf_prog_read_value: return &bpf_perf_prog_read_value_proto; default: - return tracing_func_proto(func_id); + return tracing_func_proto(func_id, prog); } } +/* + * bpf_raw_tp_regs are separate from bpf_pt_regs used from skb/xdp + * to avoid potential recursive reuse issue when/if tracepoints are added + * inside bpf_*_event_output and/or bpf_get_stack_id + */ +static DEFINE_PER_CPU(struct pt_regs, bpf_raw_tp_regs); +BPF_CALL_5(bpf_perf_event_output_raw_tp, struct bpf_raw_tracepoint_args *, args, + struct bpf_map *, map, u64, flags, void *, data, u64, size) +{ + struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs); + + perf_fetch_caller_regs(regs); + return ____bpf_perf_event_output(regs, map, flags, data, size); +} + +static const struct bpf_func_proto bpf_perf_event_output_proto_raw_tp = { + .func = bpf_perf_event_output_raw_tp, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, + .arg4_type = ARG_PTR_TO_MEM, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, +}; + +BPF_CALL_3(bpf_get_stackid_raw_tp, struct bpf_raw_tracepoint_args *, args, + struct bpf_map *, map, u64, flags) +{ + struct pt_regs *regs = this_cpu_ptr(&bpf_raw_tp_regs); + + perf_fetch_caller_regs(regs); + /* similar to bpf_perf_event_output_tp, but pt_regs fetched differently */ + return bpf_get_stackid((unsigned long) regs, (unsigned long) map, + flags, 0, 0); +} + +static const struct bpf_func_proto bpf_get_stackid_proto_raw_tp = { + .func = bpf_get_stackid_raw_tp, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, +}; + +static const struct bpf_func_proto * +raw_tp_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_perf_event_output: + return &bpf_perf_event_output_proto_raw_tp; + case BPF_FUNC_get_stackid: + return &bpf_get_stackid_proto_raw_tp; + default: + return tracing_func_proto(func_id, prog); + } +} + +static bool raw_tp_prog_is_valid_access(int off, int size, + 
enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + /* largest tracepoint in the kernel has 12 args */ + if (off < 0 || off >= sizeof(__u64) * 12) + return false; + if (type != BPF_READ) + return false; + if (off % size != 0) + return false; + return true; +} + +const struct bpf_verifier_ops raw_tracepoint_verifier_ops = { + .get_func_proto = raw_tp_prog_func_proto, + .is_valid_access = raw_tp_prog_is_valid_access, +}; + +const struct bpf_prog_ops raw_tracepoint_prog_ops = { +}; + static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const int size_u64 = sizeof(u64); @@ -908,3 +997,106 @@ int perf_event_query_prog_array(struct perf_event *event, void __user *info) return ret; } + +extern struct bpf_raw_event_map __start__bpf_raw_tp[]; +extern struct bpf_raw_event_map __stop__bpf_raw_tp[]; + +struct bpf_raw_event_map *bpf_find_raw_tracepoint(const char *name) +{ + struct bpf_raw_event_map *btp = __start__bpf_raw_tp; + + for (; btp < __stop__bpf_raw_tp; btp++) { + if (!strcmp(btp->tp->name, name)) + return btp; + } + return NULL; +} + +static __always_inline +void __bpf_trace_run(struct bpf_prog *prog, u64 *args) +{ + rcu_read_lock(); + preempt_disable(); + (void) BPF_PROG_RUN(prog, args); + preempt_enable(); + rcu_read_unlock(); +} + +#define UNPACK(...) __VA_ARGS__ +#define REPEAT_1(FN, DL, X, ...) FN(X) +#define REPEAT_2(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_1(FN, DL, __VA_ARGS__) +#define REPEAT_3(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_2(FN, DL, __VA_ARGS__) +#define REPEAT_4(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_3(FN, DL, __VA_ARGS__) +#define REPEAT_5(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_4(FN, DL, __VA_ARGS__) +#define REPEAT_6(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_5(FN, DL, __VA_ARGS__) +#define REPEAT_7(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_6(FN, DL, __VA_ARGS__) +#define REPEAT_8(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_7(FN, DL, __VA_ARGS__) +#define REPEAT_9(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_8(FN, DL, __VA_ARGS__) +#define REPEAT_10(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_9(FN, DL, __VA_ARGS__) +#define REPEAT_11(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_10(FN, DL, __VA_ARGS__) +#define REPEAT_12(FN, DL, X, ...) FN(X) UNPACK DL REPEAT_11(FN, DL, __VA_ARGS__) +#define REPEAT(X, FN, DL, ...) REPEAT_##X(FN, DL, __VA_ARGS__) + +#define SARG(X) u64 arg##X +#define COPY(X) args[X] = arg##X + +#define __DL_COM (,) +#define __DL_SEM (;) + +#define __SEQ_0_11 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 + +#define BPF_TRACE_DEFN_x(x) \ + void bpf_trace_run##x(struct bpf_prog *prog, \ + REPEAT(x, SARG, __DL_COM, __SEQ_0_11)) \ + { \ + u64 args[x]; \ + REPEAT(x, COPY, __DL_SEM, __SEQ_0_11); \ + __bpf_trace_run(prog, args); \ + } \ + EXPORT_SYMBOL_GPL(bpf_trace_run##x) +BPF_TRACE_DEFN_x(1); +BPF_TRACE_DEFN_x(2); +BPF_TRACE_DEFN_x(3); +BPF_TRACE_DEFN_x(4); +BPF_TRACE_DEFN_x(5); +BPF_TRACE_DEFN_x(6); +BPF_TRACE_DEFN_x(7); +BPF_TRACE_DEFN_x(8); +BPF_TRACE_DEFN_x(9); +BPF_TRACE_DEFN_x(10); +BPF_TRACE_DEFN_x(11); +BPF_TRACE_DEFN_x(12); +
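Hand-expanding BPF_TRACE_DEFN_x(2) shows what the REPEAT() machinery boils down to: each stub merely spills its u64 arguments into an on-stack array and hands that array to the program as its context:

void bpf_trace_run2(struct bpf_prog *prog, u64 arg0, u64 arg1)
{
	u64 args[2];

	args[0] = arg0;
	args[1] = arg1;
	__bpf_trace_run(prog, args);
}
EXPORT_SYMBOL_GPL(bpf_trace_run2);
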
+static int __bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog) +{ + struct tracepoint *tp = btp->tp; + + /* + * check that program doesn't access arguments beyond what's + * available in this tracepoint + */ + if (prog->aux->max_ctx_offset > btp->num_args * sizeof(u64)) + return -EINVAL; + + return tracepoint_probe_register(tp, (void *)btp->bpf_func, prog); +} + +int bpf_probe_register(struct bpf_raw_event_map *btp, struct bpf_prog *prog) +{ + int err; + + mutex_lock(&bpf_event_mutex); + err = __bpf_probe_register(btp, prog); + mutex_unlock(&bpf_event_mutex); + return err; +} + +int bpf_probe_unregister(struct bpf_raw_event_map *btp, struct bpf_prog *prog) +{ + int err; + + mutex_lock(&bpf_event_mutex); + err = tracepoint_probe_unregister(btp->tp, (void *)btp->bpf_func, prog); + mutex_unlock(&bpf_event_mutex); + return err; +} diff --git a/lib/scatterlist.c b/lib/scatterlist.c index 53728d391d3a..06dad7a072fd 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -132,14 +132,7 @@ EXPORT_SYMBOL(sg_last); void sg_init_table(struct scatterlist *sgl, unsigned int nents) { memset(sgl, 0, sizeof(*sgl) * nents); -#ifdef CONFIG_DEBUG_SG - { - unsigned int i; - for (i = 0; i < nents; i++) - sgl[i].sg_magic = SG_MAGIC; - } -#endif - sg_mark_end(&sgl[nents - 1]); + sg_init_marker(sgl, nents); } EXPORT_SYMBOL(sg_init_table); diff --git a/net/core/filter.c b/net/core/filter.c index 00c711c5f1a2..d31aff93270d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -33,6 +33,7 @@ #include <linux/if_packet.h> #include <linux/if_arp.h> #include <linux/gfp.h> +#include <net/inet_common.h> #include <net/ip.h> #include <net/protocol.h> #include <net/netlink.h> @@ -1855,7 +1856,7 @@ BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); /* If user passes invalid input drop the packet. */ - if (unlikely(flags)) + if (unlikely(flags & ~(BPF_F_INGRESS))) return SK_DROP; tcb->bpf.key = key; @@ -1894,7 +1895,7 @@ BPF_CALL_4(bpf_msg_redirect_map, struct sk_msg_buff *, msg, struct bpf_map *, map, u32, key, u64, flags) { /* If user passes invalid input drop the packet.
*/ - if (unlikely(flags)) + if (unlikely(flags & ~(BPF_F_INGRESS))) return SK_DROP; msg->key = key; @@ -3462,6 +3463,27 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, ret = -EINVAL; } #ifdef CONFIG_INET + } else if (level == SOL_IP) { + if (optlen != sizeof(int) || sk->sk_family != AF_INET) + return -EINVAL; + + val = *((int *)optval); + /* Only some options are supported */ + switch (optname) { + case IP_TOS: + if (val < -1 || val > 0xff) { + ret = -EINVAL; + } else { + struct inet_sock *inet = inet_sk(sk); + + if (val == -1) + val = 0; + inet->tos = val; + } + break; + default: + ret = -EINVAL; + } #if IS_ENABLED(CONFIG_IPV6) } else if (level == SOL_IPV6) { if (optlen != sizeof(int) || sk->sk_family != AF_INET6) @@ -3561,6 +3583,20 @@ BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, } else { goto err_clear; } + } else if (level == SOL_IP) { + struct inet_sock *inet = inet_sk(sk); + + if (optlen != sizeof(int) || sk->sk_family != AF_INET) + goto err_clear; + + /* Only some options are supported */ + switch (optname) { + case IP_TOS: + *((int *)optval) = (int)inet->tos; + break; + default: + goto err_clear; + } #if IS_ENABLED(CONFIG_IPV6) } else if (level == SOL_IPV6) { struct ipv6_pinfo *np = inet6_sk(sk); @@ -3621,6 +3657,52 @@ static const struct bpf_func_proto bpf_sock_ops_cb_flags_set_proto = { .arg2_type = ARG_ANYTHING, }; +const struct ipv6_bpf_stub *ipv6_bpf_stub __read_mostly; +EXPORT_SYMBOL_GPL(ipv6_bpf_stub); + +BPF_CALL_3(bpf_bind, struct bpf_sock_addr_kern *, ctx, struct sockaddr *, addr, + int, addr_len) +{ +#ifdef CONFIG_INET + struct sock *sk = ctx->sk; + int err; + + /* Binding to port can be expensive so it's prohibited in the helper. + * Only binding to IP is supported. + */ + err = -EINVAL; + if (addr->sa_family == AF_INET) { + if (addr_len < sizeof(struct sockaddr_in)) + return err; + if (((struct sockaddr_in *)addr)->sin_port != htons(0)) + return err; + return __inet_bind(sk, addr, addr_len, true, false); +#if IS_ENABLED(CONFIG_IPV6) + } else if (addr->sa_family == AF_INET6) { + if (addr_len < SIN6_LEN_RFC2133) + return err; + if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0)) + return err; + /* ipv6_bpf_stub cannot be NULL, since it's called from + * bpf_cgroup_inet6_connect hook and ipv6 is already loaded + */ + return ipv6_bpf_stub->inet6_bind(sk, addr, addr_len, true, false); +#endif /* CONFIG_IPV6 */ + } +#endif /* CONFIG_INET */ + + return -EAFNOSUPPORT; +} + +static const struct bpf_func_proto bpf_bind_proto = { + .func = bpf_bind, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_MEM, + .arg3_type = ARG_CONST_SIZE, +}; + static const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) { @@ -3650,7 +3732,7 @@ bpf_base_func_proto(enum bpf_func_id func_id) } static const struct bpf_func_proto * -sock_filter_func_proto(enum bpf_func_id func_id) +sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { /* inet and inet6 sockets are created in a process @@ -3664,7 +3746,29 @@ sock_filter_func_proto(enum bpf_func_id func_id) } static const struct bpf_func_proto * -sk_filter_func_proto(enum bpf_func_id func_id) +sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) +{ + switch (func_id) { + /* inet and inet6 sockets are created in a process + * context so there is always a valid uid/gid + */ + case BPF_FUNC_get_current_uid_gid: + return &bpf_get_current_uid_gid_proto; + case BPF_FUNC_bind: 
+ switch (prog->expected_attach_type) { + case BPF_CGROUP_INET4_CONNECT: + case BPF_CGROUP_INET6_CONNECT: + return &bpf_bind_proto; + default: + return NULL; + } + default: + return bpf_base_func_proto(func_id); + } +} + +static const struct bpf_func_proto * +sk_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_load_bytes: @@ -3679,7 +3783,7 @@ sk_filter_func_proto(enum bpf_func_id func_id) } static const struct bpf_func_proto * -tc_cls_act_func_proto(enum bpf_func_id func_id) +tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_store_bytes: @@ -3746,7 +3850,7 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) } static const struct bpf_func_proto * -xdp_func_proto(enum bpf_func_id func_id) +xdp_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_perf_event_output: @@ -3769,7 +3873,7 @@ xdp_func_proto(enum bpf_func_id func_id) } static const struct bpf_func_proto * -lwt_inout_func_proto(enum bpf_func_id func_id) +lwt_inout_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_load_bytes: @@ -3796,7 +3900,7 @@ lwt_inout_func_proto(enum bpf_func_id func_id) } static const struct bpf_func_proto * - sock_ops_func_proto(enum bpf_func_id func_id) +sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_setsockopt: @@ -3812,7 +3916,8 @@ static const struct bpf_func_proto * } } -static const struct bpf_func_proto *sk_msg_func_proto(enum bpf_func_id func_id) +static const struct bpf_func_proto * +sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_msg_redirect_map: @@ -3828,7 +3933,8 @@ static const struct bpf_func_proto *sk_msg_func_proto(enum bpf_func_id func_id) } } -static const struct bpf_func_proto *sk_skb_func_proto(enum bpf_func_id func_id) +static const struct bpf_func_proto * +sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_store_bytes: @@ -3853,7 +3959,7 @@ static const struct bpf_func_proto *sk_skb_func_proto(enum bpf_func_id func_id) } static const struct bpf_func_proto * -lwt_xmit_func_proto(enum bpf_func_id func_id) +lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { switch (func_id) { case BPF_FUNC_skb_get_tunnel_key: @@ -3883,11 +3989,12 @@ lwt_xmit_func_proto(enum bpf_func_id func_id) case BPF_FUNC_set_hash_invalid: return &bpf_set_hash_invalid_proto; default: - return lwt_inout_func_proto(func_id); + return lwt_inout_func_proto(func_id, prog); } } static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); @@ -3931,6 +4038,7 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type static bool sk_filter_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { switch (off) { @@ -3951,11 +4059,12 @@ static bool sk_filter_is_valid_access(int off, int size, } } - return bpf_skb_is_valid_access(off, size, type, info); + return bpf_skb_is_valid_access(off, size, type, prog, info); } static bool lwt_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { switch (off) { @@ 
-3985,32 +4094,83 @@ static bool lwt_is_valid_access(int off, int size, break; } - return bpf_skb_is_valid_access(off, size, type, info); + return bpf_skb_is_valid_access(off, size, type, prog, info); } -static bool sock_filter_is_valid_access(int off, int size, - enum bpf_access_type type, - struct bpf_insn_access_aux *info) + +/* Attach type specific accesses */ +static bool __sock_filter_check_attach_type(int off, + enum bpf_access_type access_type, + enum bpf_attach_type attach_type) { - if (type == BPF_WRITE) { - switch (off) { - case offsetof(struct bpf_sock, bound_dev_if): - case offsetof(struct bpf_sock, mark): - case offsetof(struct bpf_sock, priority): - break; + switch (off) { + case offsetof(struct bpf_sock, bound_dev_if): + case offsetof(struct bpf_sock, mark): + case offsetof(struct bpf_sock, priority): + switch (attach_type) { + case BPF_CGROUP_INET_SOCK_CREATE: + goto full_access; default: return false; } + case bpf_ctx_range(struct bpf_sock, src_ip4): + switch (attach_type) { + case BPF_CGROUP_INET4_POST_BIND: + goto read_only; + default: + return false; + } + case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): + switch (attach_type) { + case BPF_CGROUP_INET6_POST_BIND: + goto read_only; + default: + return false; + } + case bpf_ctx_range(struct bpf_sock, src_port): + switch (attach_type) { + case BPF_CGROUP_INET4_POST_BIND: + case BPF_CGROUP_INET6_POST_BIND: + goto read_only; + default: + return false; + } + } +read_only: + return access_type == BPF_READ; +full_access: + return true; +} + +static bool __sock_filter_check_size(int off, int size, + struct bpf_insn_access_aux *info) +{ + const int size_default = sizeof(__u32); + + switch (off) { + case bpf_ctx_range(struct bpf_sock, src_ip4): + case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): + bpf_ctx_record_field_size(info, size_default); + return bpf_ctx_narrow_access_ok(off, size, size_default); } - if (off < 0 || off + size > sizeof(struct bpf_sock)) + return size == size_default; +} + +static bool sock_filter_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + if (off < 0 || off >= sizeof(struct bpf_sock)) return false; - /* The verifier guarantees that size > 0. 
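As an illustration of the attach-type gating in __sock_filter_check_attach_type() above: a program loaded with expected_attach_type BPF_CGROUP_INET4_POST_BIND can read the new src_ip4/src_port fields but cannot store to them. A minimal hypothetical sketch (not part of the patch; SEC()/helper declarations assumed from the selftests headers):

/* Fail binds that landed on a privileged port; src_port is host
 * byte order per the uapi comments. Verdict 0 makes the bind fail.
 */
int post_bind4_prog(struct bpf_sock *sk)
{
	if (sk->src_port < 1024)
		return 0;
	return 1;
}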
*/ if (off % size != 0) return false; - if (size != sizeof(__u32)) + if (!__sock_filter_check_attach_type(off, type, + prog->expected_attach_type)) + return false; + if (!__sock_filter_check_size(off, size, info)) return false; - return true; } @@ -4061,6 +4221,7 @@ static int tc_cls_act_prologue(struct bpf_insn *insn_buf, bool direct_write, static bool tc_cls_act_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (type == BPF_WRITE) { @@ -4090,7 +4251,7 @@ static bool tc_cls_act_is_valid_access(int off, int size, return false; } - return bpf_skb_is_valid_access(off, size, type, info); + return bpf_skb_is_valid_access(off, size, type, prog, info); } static bool __is_valid_xdp_access(int off, int size) @@ -4107,6 +4268,7 @@ static bool __is_valid_xdp_access(int off, int size) static bool xdp_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (type == BPF_WRITE) @@ -4137,8 +4299,74 @@ void bpf_warn_invalid_xdp_action(u32 act) } EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); +static bool sock_addr_is_valid_access(int off, int size, + enum bpf_access_type type, + const struct bpf_prog *prog, + struct bpf_insn_access_aux *info) +{ + const int size_default = sizeof(__u32); + + if (off < 0 || off >= sizeof(struct bpf_sock_addr)) + return false; + if (off % size != 0) + return false; + + /* Disallow access to IPv6 fields from IPv4 context and vice + * versa. + */ + switch (off) { + case bpf_ctx_range(struct bpf_sock_addr, user_ip4): + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET4_BIND: + case BPF_CGROUP_INET4_CONNECT: + break; + default: + return false; + } + break; + case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): + switch (prog->expected_attach_type) { + case BPF_CGROUP_INET6_BIND: + case BPF_CGROUP_INET6_CONNECT: + break; + default: + return false; + } + break; + } + + switch (off) { + case bpf_ctx_range(struct bpf_sock_addr, user_ip4): + case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): + /* Only narrow read access allowed for now.
*/ + if (type == BPF_READ) { + bpf_ctx_record_field_size(info, size_default); + if (!bpf_ctx_narrow_access_ok(off, size, size_default)) + return false; + } else { + if (size != size_default) + return false; + } + break; + case bpf_ctx_range(struct bpf_sock_addr, user_port): + if (size != size_default) + return false; + break; + default: + if (type == BPF_READ) { + if (size != size_default) + return false; + } else { + return false; + } + } + + return true; +} + static bool sock_ops_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { const int size_default = sizeof(__u32); @@ -4185,6 +4413,7 @@ static int sk_skb_prologue(struct bpf_insn *insn_buf, bool direct_write, static bool sk_skb_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { switch (off) { @@ -4214,11 +4443,12 @@ static bool sk_skb_is_valid_access(int off, int size, break; } - return bpf_skb_is_valid_access(off, size, type, info); + return bpf_skb_is_valid_access(off, size, type, prog, info); } static bool sk_msg_is_valid_access(int off, int size, enum bpf_access_type type, + const struct bpf_prog *prog, struct bpf_insn_access_aux *info) { if (type == BPF_WRITE) @@ -4548,6 +4778,7 @@ static u32 sock_filter_convert_ctx_access(enum bpf_access_type type, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; + int off; switch (si->off) { case offsetof(struct bpf_sock, bound_dev_if): @@ -4603,6 +4834,43 @@ static u32 sock_filter_convert_ctx_access(enum bpf_access_type type, *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK); *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_PROTO_SHIFT); break; + + case offsetof(struct bpf_sock, src_ip4): + *insn++ = BPF_LDX_MEM( + BPF_SIZE(si->code), si->dst_reg, si->src_reg, + bpf_target_off(struct sock_common, skc_rcv_saddr, + FIELD_SIZEOF(struct sock_common, + skc_rcv_saddr), + target_size)); + break; + + case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]): +#if IS_ENABLED(CONFIG_IPV6) + off = si->off; + off -= offsetof(struct bpf_sock, src_ip6[0]); + *insn++ = BPF_LDX_MEM( + BPF_SIZE(si->code), si->dst_reg, si->src_reg, + bpf_target_off( + struct sock_common, + skc_v6_rcv_saddr.s6_addr32[0], + FIELD_SIZEOF(struct sock_common, + skc_v6_rcv_saddr.s6_addr32[0]), + target_size) + off); +#else + (void)off; + *insn++ = BPF_MOV32_IMM(si->dst_reg, 0); +#endif + break; + + case offsetof(struct bpf_sock, src_port): + *insn++ = BPF_LDX_MEM( + BPF_FIELD_SIZEOF(struct sock_common, skc_num), + si->dst_reg, si->src_reg, + bpf_target_off(struct sock_common, skc_num, + FIELD_SIZEOF(struct sock_common, + skc_num), + target_size)); + break; } return insn - insn_buf; @@ -4678,6 +4946,152 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } +/* SOCK_ADDR_LOAD_NESTED_FIELD() loads Nested Field S.F.NF where S is type of + * context Structure, F is Field in context structure that contains a pointer + * to Nested Structure of type NS that has the field NF. + * + * SIZE encodes the load size (BPF_B, BPF_H, etc). It's up to caller to make + * sure that SIZE is not greater than actual size of S.F.NF. + * + * If offset OFF is provided, the load happens from that offset relative to + * offset of NF. 
+ */ +#define SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF) \ + do { \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), si->dst_reg, \ + si->src_reg, offsetof(S, F)); \ + *insn++ = BPF_LDX_MEM( \ + SIZE, si->dst_reg, si->dst_reg, \ + bpf_target_off(NS, NF, FIELD_SIZEOF(NS, NF), \ + target_size) \ + + OFF); \ + } while (0) + +#define SOCK_ADDR_LOAD_NESTED_FIELD(S, NS, F, NF) \ + SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, \ + BPF_FIELD_SIZEOF(NS, NF), 0) + +/* SOCK_ADDR_STORE_NESTED_FIELD_OFF() has semantics similar to + * SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF() but for store operation. + * + * It doesn't support SIZE argument though since narrow stores are not + * supported for now. + * + * In addition it uses Temporary Field TF (member of struct S) as the 3rd + * "register" since two registers available in convert_ctx_access are not + * enough: we can't override either SRC, since it contains the value to store, + * or DST, since it contains the pointer to context that may be used by later + * instructions. But we need a temporary place to save the pointer to the + * nested structure whose field we want to store to. + */ +#define SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, OFF, TF) \ + do { \ + int tmp_reg = BPF_REG_9; \ + if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg) \ + --tmp_reg; \ + if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg) \ + --tmp_reg; \ + *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, tmp_reg, \ + offsetof(S, TF)); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), tmp_reg, \ + si->dst_reg, offsetof(S, F)); \ + *insn++ = BPF_STX_MEM( \ + BPF_FIELD_SIZEOF(NS, NF), tmp_reg, si->src_reg, \ + bpf_target_off(NS, NF, FIELD_SIZEOF(NS, NF), \ + target_size) \ + + OFF); \ + *insn++ = BPF_LDX_MEM(BPF_DW, tmp_reg, si->dst_reg, \ + offsetof(S, TF)); \ + } while (0) + +#define SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF(S, NS, F, NF, SIZE, OFF, \ + TF) \ + do { \ + if (type == BPF_WRITE) { \ + SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, OFF, \ + TF); \ + } else { \ + SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF( \ + S, NS, F, NF, SIZE, OFF); \ + } \ + } while (0) + +#define SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD(S, NS, F, NF, TF) \ + SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( \ + S, NS, F, NF, BPF_FIELD_SIZEOF(NS, NF), 0, TF) + +static u32 sock_addr_convert_ctx_access(enum bpf_access_type type, + const struct bpf_insn *si, + struct bpf_insn *insn_buf, + struct bpf_prog *prog, u32 *target_size) +{ + struct bpf_insn *insn = insn_buf; + int off; + + switch (si->off) { + case offsetof(struct bpf_sock_addr, user_family): + SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern, + struct sockaddr, uaddr, sa_family); + break; + + case offsetof(struct bpf_sock_addr, user_ip4): + SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( + struct bpf_sock_addr_kern, struct sockaddr_in, uaddr, + sin_addr, BPF_SIZE(si->code), 0, tmp_reg); + break; + + case bpf_ctx_range_till(struct bpf_sock_addr, user_ip6[0], user_ip6[3]): + off = si->off; + off -= offsetof(struct bpf_sock_addr, user_ip6[0]); + SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD_SIZE_OFF( + struct bpf_sock_addr_kern, struct sockaddr_in6, uaddr, + sin6_addr.s6_addr32[0], BPF_SIZE(si->code), off, + tmp_reg); + break; + + case offsetof(struct bpf_sock_addr, user_port): + /* To get the port we need to know sa_family first and then + * treat sockaddr as either sockaddr_in or sockaddr_in6. + * Though we can simplify since the port field has the same + * offset and size in both structures.
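To make the nested-field macros above concrete: for the user_family case, SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern, struct sockaddr, uaddr, sa_family) emits two dependent loads, roughly (illustrative expansion, not part of the patch):

/* dst_reg = ctx->uaddr; BPF_DW because uaddr is a pointer field */
*insn++ = BPF_LDX_MEM(BPF_DW, si->dst_reg, si->src_reg,
		      offsetof(struct bpf_sock_addr_kern, uaddr));
/* dst_reg = ((struct sockaddr *)dst_reg)->sa_family; BPF_H, 2 bytes */
*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg,
		      offsetof(struct sockaddr, sa_family));

That is, each bpf_sock_addr field read costs one extra pointer dereference compared to the flat bpf_sock conversions earlier in this file.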
+ * Here we check this invariant and use just one of the + * structures if it's true. + */ + BUILD_BUG_ON(offsetof(struct sockaddr_in, sin_port) != + offsetof(struct sockaddr_in6, sin6_port)); + BUILD_BUG_ON(FIELD_SIZEOF(struct sockaddr_in, sin_port) != + FIELD_SIZEOF(struct sockaddr_in6, sin6_port)); + SOCK_ADDR_LOAD_OR_STORE_NESTED_FIELD(struct bpf_sock_addr_kern, + struct sockaddr_in6, uaddr, + sin6_port, tmp_reg); + break; + + case offsetof(struct bpf_sock_addr, family): + SOCK_ADDR_LOAD_NESTED_FIELD(struct bpf_sock_addr_kern, + struct sock, sk, sk_family); + break; + + case offsetof(struct bpf_sock_addr, type): + SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF( + struct bpf_sock_addr_kern, struct sock, sk, + __sk_flags_offset, BPF_W, 0); + *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_TYPE_MASK); + *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_TYPE_SHIFT); + break; + + case offsetof(struct bpf_sock_addr, protocol): + SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF( + struct bpf_sock_addr_kern, struct sock, sk, + __sk_flags_offset, BPF_W, 0); + *insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK); + *insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, + SK_FL_PROTO_SHIFT); + break; + } + + return insn - insn_buf; +} + static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, @@ -5135,6 +5549,15 @@ const struct bpf_verifier_ops cg_sock_verifier_ops = { const struct bpf_prog_ops cg_sock_prog_ops = { }; +const struct bpf_verifier_ops cg_sock_addr_verifier_ops = { + .get_func_proto = sock_addr_func_proto, + .is_valid_access = sock_addr_is_valid_access, + .convert_ctx_access = sock_addr_convert_ctx_access, +}; + +const struct bpf_prog_ops cg_sock_addr_prog_ops = { +}; + const struct bpf_verifier_ops sock_ops_verifier_ops = { .get_func_proto = sock_ops_func_proto, .is_valid_access = sock_ops_is_valid_access, diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index f98e2f0db841..eaed0367e669 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -432,23 +432,37 @@ EXPORT_SYMBOL(inet_release); int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { - struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; struct sock *sk = sock->sk; - struct inet_sock *inet = inet_sk(sk); - struct net *net = sock_net(sk); - unsigned short snum; - int chk_addr_ret; - u32 tb_id = RT_TABLE_LOCAL; int err; /* If the socket has its own bind function then use it. (RAW) */ if (sk->sk_prot->bind) { - err = sk->sk_prot->bind(sk, uaddr, addr_len); - goto out; + return sk->sk_prot->bind(sk, uaddr, addr_len); } - err = -EINVAL; if (addr_len < sizeof(struct sockaddr_in)) - goto out; + return -EINVAL; + + /* BPF prog is run before any checks are done so that if the prog + * changes context in a wrong way it will be caught. 
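Programs attached at BPF_CGROUP_INET4_BIND observe, and may rewrite, the address being bound through the bpf_sock_addr context converted above, before __inet_bind() looks at it. A hypothetical sketch (SEC() and byte-order helpers assumed from the selftests' bpf_helpers.h/bpf_endian.h):

SEC("cgroup/bind4")
int bind_v4_prog(struct bpf_sock_addr *ctx)
{
	/* Illustrative policy: permit only loopback binds. */
	if (ctx->user_ip4 != bpf_htonl(0x7f000001))	/* 127.0.0.1 */
		return 0;	/* inet_bind() then fails with an error */
	return 1;
}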
+ */ + err = BPF_CGROUP_RUN_PROG_INET4_BIND(sk, uaddr); + if (err) + return err; + + return __inet_bind(sk, uaddr, addr_len, false, true); +} +EXPORT_SYMBOL(inet_bind); + +int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, + bool force_bind_address_no_port, bool with_lock) +{ + struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; + struct inet_sock *inet = inet_sk(sk); + struct net *net = sock_net(sk); + unsigned short snum; + int chk_addr_ret; + u32 tb_id = RT_TABLE_LOCAL; + int err; if (addr->sin_family != AF_INET) { /* Compatibility games : accept AF_UNSPEC (mapped to AF_INET) @@ -492,7 +506,8 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) * would be illegal to use them (multicast/broadcast) in * which case the sending device address is used. */ - lock_sock(sk); + if (with_lock) + lock_sock(sk); /* Check these errors (active socket, double bind). */ err = -EINVAL; @@ -504,11 +519,18 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) inet->inet_saddr = 0; /* Use device */ /* Make sure we are allowed to bind here. */ - if ((snum || !inet->bind_address_no_port) && - sk->sk_prot->get_port(sk, snum)) { - inet->inet_saddr = inet->inet_rcv_saddr = 0; - err = -EADDRINUSE; - goto out_release_sock; + if (snum || !(inet->bind_address_no_port || + force_bind_address_no_port)) { + if (sk->sk_prot->get_port(sk, snum)) { + inet->inet_saddr = inet->inet_rcv_saddr = 0; + err = -EADDRINUSE; + goto out_release_sock; + } + err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk); + if (err) { + inet->inet_saddr = inet->inet_rcv_saddr = 0; + goto out_release_sock; + } } if (inet->inet_rcv_saddr) @@ -521,22 +543,29 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) sk_dst_reset(sk); err = 0; out_release_sock: - release_sock(sk); + if (with_lock) + release_sock(sk); out: return err; } -EXPORT_SYMBOL(inet_bind); int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; + int err; if (addr_len < sizeof(uaddr->sa_family)) return -EINVAL; if (uaddr->sa_family == AF_UNSPEC) return sk->sk_prot->disconnect(sk, flags); + if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk)) { + err = sk->sk_prot->pre_connect(sk, uaddr, addr_len); + if (err) + return err; + } + if (!inet_sk(sk)->inet_num && inet_autobind(sk)) return -EAGAIN; return sk->sk_prot->connect(sk, uaddr, addr_len); @@ -617,6 +646,12 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, if (sk->sk_state != TCP_CLOSE) goto out; + if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk)) { + err = sk->sk_prot->pre_connect(sk, uaddr, addr_len); + if (err) + goto out; + } + err = sk->sk_prot->connect(sk, uaddr, addr_len); if (err < 0) goto out; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 0c31be306572..bccc4c270087 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -485,6 +485,14 @@ static void tcp_tx_timestamp(struct sock *sk, u16 tsflags) } } +static inline bool tcp_stream_is_readable(const struct tcp_sock *tp, + int target, struct sock *sk) +{ + return (tp->rcv_nxt - tp->copied_seq >= target) || + (sk->sk_prot->stream_memory_read ? + sk->sk_prot->stream_memory_read(sk) : false); +} + /* * Wait for a TCP event. 
* @@ -554,7 +562,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) tp->urg_data) target++; - if (tp->rcv_nxt - tp->copied_seq >= target) + if (tcp_stream_is_readable(tp, target, sk)) mask |= EPOLLIN | EPOLLRDNORM; if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 9639334ebb7c..f70586b50838 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -140,6 +140,21 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) } EXPORT_SYMBOL_GPL(tcp_twsk_unique); +static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr, + int addr_len) +{ + /* This check is replicated from tcp_v4_connect() and intended to + * prevent BPF program called below from accessing bytes that are out + * of the bound specified by user in addr_len. + */ + if (addr_len < sizeof(struct sockaddr_in)) + return -EINVAL; + + sock_owned_by_me(sk); + + return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr); +} + /* This will initiate an outgoing connection. */ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { @@ -2408,6 +2423,7 @@ struct proto tcp_prot = { .name = "TCP", .owner = THIS_MODULE, .close = tcp_close, + .pre_connect = tcp_v4_pre_connect, .connect = tcp_v4_connect, .disconnect = tcp_disconnect, .accept = inet_csk_accept, diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index f49e14cd3891..24b5c59b1c53 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1658,6 +1658,19 @@ csum_copy_err: goto try_again; } +int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +{ + /* This check is replicated from __ip4_datagram_connect() and + * intended to prevent BPF program called below from accessing bytes + * that are out of the bound specified by user in addr_len. + */ + if (addr_len < sizeof(struct sockaddr_in)) + return -EINVAL; + + return BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr); +} +EXPORT_SYMBOL(udp_pre_connect); + int __udp_disconnect(struct sock *sk, int flags) { struct inet_sock *inet = inet_sk(sk); @@ -2530,6 +2543,7 @@ struct proto udp_prot = { .name = "UDP", .owner = THIS_MODULE, .close = udp_lib_close, + .pre_connect = udp_pre_connect, .connect = ip4_datagram_connect, .disconnect = udp_disconnect, .ioctl = udp_ioctl, diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index c1e292db04db..8da0b513f188 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -277,15 +277,7 @@ out_rcu_unlock: /* bind for INET6 API */ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { - struct sockaddr_in6 *addr = (struct sockaddr_in6 *)uaddr; struct sock *sk = sock->sk; - struct inet_sock *inet = inet_sk(sk); - struct ipv6_pinfo *np = inet6_sk(sk); - struct net *net = sock_net(sk); - __be32 v4addr = 0; - unsigned short snum; - bool saved_ipv6only; - int addr_type = 0; int err = 0; /* If the socket has its own bind function then use it. */ @@ -295,11 +287,35 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (addr_len < SIN6_LEN_RFC2133) return -EINVAL; + /* BPF prog is run before any checks are done so that if the prog + * changes context in a wrong way it will be caught. 
+ */ + err = BPF_CGROUP_RUN_PROG_INET6_BIND(sk, uaddr); + if (err) + return err; + + return __inet6_bind(sk, uaddr, addr_len, false, true); +} +EXPORT_SYMBOL(inet6_bind); + +int __inet6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, + bool force_bind_address_no_port, bool with_lock) +{ + struct sockaddr_in6 *addr = (struct sockaddr_in6 *)uaddr; + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct net *net = sock_net(sk); + __be32 v4addr = 0; + unsigned short snum; + bool saved_ipv6only; + int addr_type = 0; + int err = 0; + if (addr->sin6_family != AF_INET6) return -EAFNOSUPPORT; addr_type = ipv6_addr_type(&addr->sin6_addr); - if ((addr_type & IPV6_ADDR_MULTICAST) && sock->type == SOCK_STREAM) + if ((addr_type & IPV6_ADDR_MULTICAST) && sk->sk_type == SOCK_STREAM) return -EINVAL; snum = ntohs(addr->sin6_port); @@ -307,7 +323,8 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE)) return -EACCES; - lock_sock(sk); + if (with_lock) + lock_sock(sk); /* Check these errors (active socket, double bind). */ if (sk->sk_state != TCP_CLOSE || inet->inet_num) { @@ -395,12 +412,20 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) sk->sk_ipv6only = 1; /* Make sure we are allowed to bind here. */ - if ((snum || !inet->bind_address_no_port) && - sk->sk_prot->get_port(sk, snum)) { - sk->sk_ipv6only = saved_ipv6only; - inet_reset_saddr(sk); - err = -EADDRINUSE; - goto out; + if (snum || !(inet->bind_address_no_port || + force_bind_address_no_port)) { + if (sk->sk_prot->get_port(sk, snum)) { + sk->sk_ipv6only = saved_ipv6only; + inet_reset_saddr(sk); + err = -EADDRINUSE; + goto out; + } + err = BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk); + if (err) { + sk->sk_ipv6only = saved_ipv6only; + inet_reset_saddr(sk); + goto out; + } } if (addr_type != IPV6_ADDR_ANY) @@ -411,13 +436,13 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) inet->inet_dport = 0; inet->inet_daddr = 0; out: - release_sock(sk); + if (with_lock) + release_sock(sk); return err; out_unlock: rcu_read_unlock(); goto out; } -EXPORT_SYMBOL(inet6_bind); int inet6_release(struct socket *sock) { @@ -868,6 +893,10 @@ static const struct ipv6_stub ipv6_stub_impl = { .nd_tbl = &nd_tbl, }; +static const struct ipv6_bpf_stub ipv6_bpf_stub_impl = { + .inet6_bind = __inet6_bind, +}; + static int __init inet6_init(void) { struct list_head *r; @@ -1024,6 +1053,7 @@ static int __init inet6_init(void) /* ensure that ipv6 stubs are visible only after ipv6 is ready */ wmb(); ipv6_stub = &ipv6_stub_impl; + ipv6_bpf_stub = &ipv6_bpf_stub_impl; out: return err; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 883df0ad5bfe..6d664d83cd16 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -117,6 +117,21 @@ static u32 tcp_v6_init_ts_off(const struct net *net, const struct sk_buff *skb) ipv6_hdr(skb)->saddr.s6_addr32); } +static int tcp_v6_pre_connect(struct sock *sk, struct sockaddr *uaddr, + int addr_len) +{ + /* This check is replicated from tcp_v6_connect() and intended to + * prevent BPF program called below from accessing bytes that are out + * of the bound specified by user in addr_len. 
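On the connect hooks a program may both rewrite the destination and pin a source address via the bpf_bind() helper that sock_addr_func_proto() exposes to BPF_CGROUP_INET4/6_CONNECT programs. A hypothetical cgroup/connect4 sketch (constants and the helper declaration assumed from the selftests' helper headers):

SEC("cgroup/connect4")
int connect_v4_prog(struct bpf_sock_addr *ctx)
{
	struct sockaddr_in sa = {};

	/* Pin the source address to 127.0.0.1. sin_port stays 0:
	 * bpf_bind() rejects binding to a port.
	 */
	sa.sin_family = AF_INET;
	sa.sin_addr.s_addr = bpf_htonl(0x7f000001);
	if (bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa)))
		return 0;	/* deny the connect */

	/* Redirect the connect to a local proxy (illustrative values). */
	ctx->user_ip4 = bpf_htonl(0x7f000001);
	ctx->user_port = bpf_htons(4040);
	return 1;
}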
+ */ + if (addr_len < SIN6_LEN_RFC2133) + return -EINVAL; + + sock_owned_by_me(sk); + + return BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr); +} + static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { @@ -1925,6 +1940,7 @@ struct proto tcpv6_prot = { .name = "TCPv6", .owner = THIS_MODULE, .close = tcp_close, + .pre_connect = tcp_v6_pre_connect, .connect = tcp_v6_connect, .disconnect = tcp_disconnect, .accept = inet_csk_accept, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index ad30f5e31969..6861ed479469 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -957,6 +957,25 @@ static void udp_v6_flush_pending_frames(struct sock *sk) } } +static int udpv6_pre_connect(struct sock *sk, struct sockaddr *uaddr, + int addr_len) +{ + /* The following checks are replicated from __ip6_datagram_connect() + * and intended to prevent BPF program called below from accessing + * bytes that are out of the bound specified by user in addr_len. + */ + if (uaddr->sa_family == AF_INET) { + if (__ipv6_only_sock(sk)) + return -EAFNOSUPPORT; + return udp_pre_connect(sk, uaddr, addr_len); + } + + if (addr_len < SIN6_LEN_RFC2133) + return -EINVAL; + + return BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr); +} + /** * udp6_hwcsum_outgoing - handle outgoing HW checksumming * @sk: socket we are sending on @@ -1512,6 +1531,7 @@ struct proto udpv6_prot = { .name = "UDPv6", .owner = THIS_MODULE, .close = udp_lib_close, + .pre_connect = udpv6_pre_connect, .connect = ip6_datagram_connect, .disconnect = udp_disconnect, .ioctl = udp_ioctl, diff --git a/net/mac802154/trace.h b/net/mac802154/trace.h index 2c8a43d3607f..df855c33daf2 100644 --- a/net/mac802154/trace.h +++ b/net/mac802154/trace.h @@ -33,7 +33,7 @@ /* Tracing for driver callbacks */ -DECLARE_EVENT_CLASS(local_only_evt, +DECLARE_EVENT_CLASS(local_only_evt4, TP_PROTO(struct ieee802154_local *local), TP_ARGS(local), TP_STRUCT__entry( @@ -45,7 +45,7 @@ DECLARE_EVENT_CLASS(local_only_evt, TP_printk(LOCAL_PR_FMT, LOCAL_PR_ARG) ); -DEFINE_EVENT(local_only_evt, 802154_drv_return_void, +DEFINE_EVENT(local_only_evt4, 802154_drv_return_void, TP_PROTO(struct ieee802154_local *local), TP_ARGS(local) ); @@ -65,12 +65,12 @@ TRACE_EVENT(802154_drv_return_int, __entry->ret) ); -DEFINE_EVENT(local_only_evt, 802154_drv_start, +DEFINE_EVENT(local_only_evt4, 802154_drv_start, TP_PROTO(struct ieee802154_local *local), TP_ARGS(local) ); -DEFINE_EVENT(local_only_evt, 802154_drv_stop, +DEFINE_EVENT(local_only_evt4, 802154_drv_stop, TP_PROTO(struct ieee802154_local *local), TP_ARGS(local) ); diff --git a/net/wireless/trace.h b/net/wireless/trace.h index a64291ae52a6..55fb279a5196 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -3184,7 +3184,7 @@ TRACE_EVENT(rdev_start_radar_detection, TRACE_EVENT(rdev_set_mcast_rate, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, - int mcast_rate[NUM_NL80211_BANDS]), + int *mcast_rate), TP_ARGS(wiphy, netdev, mcast_rate), TP_STRUCT__entry( WIPHY_ENTRY diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 2c2a587e0942..4d6a6edd4bf6 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -119,6 +119,7 @@ always += offwaketime_kern.o always += spintest_kern.o always += map_perf_test_kern.o always += test_overhead_tp_kern.o +always += test_overhead_raw_tp_kern.o always += test_overhead_kprobe_kern.o always += parse_varlen.o parse_simple.o parse_ldabs.o always += test_cgrp2_tc_kern.o diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c index b1a310c3ae89..bebe4188b4b3 100644 --- 
a/samples/bpf/bpf_load.c +++ b/samples/bpf/bpf_load.c @@ -61,6 +61,7 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) bool is_kprobe = strncmp(event, "kprobe/", 7) == 0; bool is_kretprobe = strncmp(event, "kretprobe/", 10) == 0; bool is_tracepoint = strncmp(event, "tracepoint/", 11) == 0; + bool is_raw_tracepoint = strncmp(event, "raw_tracepoint/", 15) == 0; bool is_xdp = strncmp(event, "xdp", 3) == 0; bool is_perf_event = strncmp(event, "perf_event", 10) == 0; bool is_cgroup_skb = strncmp(event, "cgroup/skb", 10) == 0; @@ -85,6 +86,8 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) prog_type = BPF_PROG_TYPE_KPROBE; } else if (is_tracepoint) { prog_type = BPF_PROG_TYPE_TRACEPOINT; + } else if (is_raw_tracepoint) { + prog_type = BPF_PROG_TYPE_RAW_TRACEPOINT; } else if (is_xdp) { prog_type = BPF_PROG_TYPE_XDP; } else if (is_perf_event) { @@ -131,6 +134,16 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) return populate_prog_array(event, fd); } + if (is_raw_tracepoint) { + efd = bpf_raw_tracepoint_open(event + 15, fd); + if (efd < 0) { + printf("tracepoint %s %s\n", event + 15, strerror(errno)); + return -1; + } + event_fd[prog_cnt - 1] = efd; + return 0; + } + if (is_kprobe || is_kretprobe) { if (is_kprobe) event += 7; @@ -587,6 +600,7 @@ static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map) if (memcmp(shname, "kprobe/", 7) == 0 || memcmp(shname, "kretprobe/", 10) == 0 || memcmp(shname, "tracepoint/", 11) == 0 || + memcmp(shname, "raw_tracepoint/", 15) == 0 || memcmp(shname, "xdp", 3) == 0 || memcmp(shname, "perf_event", 10) == 0 || memcmp(shname, "socket", 6) == 0 || diff --git a/samples/bpf/cookie_uid_helper_example.c b/samples/bpf/cookie_uid_helper_example.c index 9d751e209f31..8eca27e595ae 100644 --- a/samples/bpf/cookie_uid_helper_example.c +++ b/samples/bpf/cookie_uid_helper_example.c @@ -246,7 +246,7 @@ static void udp_client(void) recv_len = recvfrom(s_rcv, &buf, sizeof(buf), 0, (struct sockaddr *)&si_me, &slen); if (recv_len < 0) - error(1, errno, "revieve\n"); + error(1, errno, "receive\n"); res = memcmp(&(si_other.sin_addr), &(si_me.sin_addr), sizeof(si_me.sin_addr)); if (res != 0) diff --git a/samples/bpf/test_overhead_raw_tp_kern.c b/samples/bpf/test_overhead_raw_tp_kern.c new file mode 100644 index 000000000000..d2af8bc1c805 --- /dev/null +++ b/samples/bpf/test_overhead_raw_tp_kern.c @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2018 Facebook */ +#include <uapi/linux/bpf.h> +#include "bpf_helpers.h" + +SEC("raw_tracepoint/task_rename") +int prog(struct bpf_raw_tracepoint_args *ctx) +{ + return 0; +} + +SEC("raw_tracepoint/urandom_read") +int prog2(struct bpf_raw_tracepoint_args *ctx) +{ + return 0; +} +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/test_overhead_user.c b/samples/bpf/test_overhead_user.c index d291167fd3c7..e1d35e07a10e 100644 --- a/samples/bpf/test_overhead_user.c +++ b/samples/bpf/test_overhead_user.c @@ -158,5 +158,17 @@ int main(int argc, char **argv) unload_progs(); } + if (test_flags & 0xC0) { + snprintf(filename, sizeof(filename), + "%s_raw_tp_kern.o", argv[0]); + if (load_bpf_file(filename)) { + printf("%s", bpf_log_buf); + return 1; + } + printf("w/RAW_TRACEPOINT\n"); + run_perf_test(num_cpu, test_flags >> 6); + unload_progs(); + } + return 0; } diff --git a/samples/sockmap/sockmap_kern.c b/samples/sockmap/sockmap_kern.c index 9ad5ba79c85a..9ff8bc5dc206 100644 --- a/samples/sockmap/sockmap_kern.c 
+++ b/samples/sockmap/sockmap_kern.c @@ -54,7 +54,7 @@ struct bpf_map_def SEC("maps") sock_map_redir = { .type = BPF_MAP_TYPE_SOCKMAP, .key_size = sizeof(int), .value_size = sizeof(int), - .max_entries = 1, + .max_entries = 20, }; struct bpf_map_def SEC("maps") sock_apply_bytes = { @@ -78,6 +78,19 @@ struct bpf_map_def SEC("maps") sock_pull_bytes = { .max_entries = 2 }; +struct bpf_map_def SEC("maps") sock_redir_flags = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(int), + .value_size = sizeof(int), + .max_entries = 1 +}; + +struct bpf_map_def SEC("maps") sock_skb_opts = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(int), + .value_size = sizeof(int), + .max_entries = 1 +}; SEC("sk_skb1") int bpf_prog1(struct __sk_buff *skb) @@ -90,15 +103,24 @@ int bpf_prog2(struct __sk_buff *skb) { __u32 lport = skb->local_port; __u32 rport = skb->remote_port; - int ret = 0; + int len, *f, ret, zero = 0; + __u64 flags = 0; if (lport == 10000) ret = 10; else ret = 1; - bpf_printk("sockmap: %d -> %d @ %d\n", lport, bpf_ntohl(rport), ret); - return bpf_sk_redirect_map(skb, &sock_map, ret, 0); + len = (__u32)skb->data_end - (__u32)skb->data; + f = bpf_map_lookup_elem(&sock_skb_opts, &zero); + if (f && *f) { + ret = 3; + flags = *f; + } + + bpf_printk("sk_skb2: redirect(%iB) flags=%i\n", + len, flags); + return bpf_sk_redirect_map(skb, &sock_map, ret, flags); } SEC("sockops") @@ -197,8 +219,9 @@ int bpf_prog5(struct sk_msg_md *msg) SEC("sk_msg3") int bpf_prog6(struct sk_msg_md *msg) { - int *bytes, zero = 0, one = 1; - int *start, *end; + int *bytes, zero = 0, one = 1, key = 0; + int *start, *end, *f; + __u64 flags = 0; bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); if (bytes) @@ -210,15 +233,22 @@ int bpf_prog6(struct sk_msg_md *msg) end = bpf_map_lookup_elem(&sock_pull_bytes, &one); if (start && end) bpf_msg_pull_data(msg, *start, *end, 0); - return bpf_msg_redirect_map(msg, &sock_map_redir, zero, 0); + f = bpf_map_lookup_elem(&sock_redir_flags, &zero); + if (f && *f) { + key = 2; + flags = *f; + } + return bpf_msg_redirect_map(msg, &sock_map_redir, key, flags); } SEC("sk_msg4") int bpf_prog7(struct sk_msg_md *msg) { - int err1 = 0, err2 = 0, zero = 0, one = 1; - int *bytes, *start, *end, len1, len2; + int err1 = 0, err2 = 0, zero = 0, one = 1, key = 0; + int *f, *bytes, *start, *end, len1, len2; + __u64 flags = 0; + int err; bytes = bpf_map_lookup_elem(&sock_apply_bytes, &zero); if (bytes) err1 = bpf_msg_apply_bytes(msg, *bytes); @@ -229,7 +259,6 @@ int bpf_prog7(struct sk_msg_md *msg) start = bpf_map_lookup_elem(&sock_pull_bytes, &zero); end = bpf_map_lookup_elem(&sock_pull_bytes, &one); if (start && end) { - int err; bpf_printk("sk_msg2: pull(%i:%i)\n", start ? *start : 0, end ? *end : 0); @@ -241,9 +270,16 @@ int bpf_prog7(struct sk_msg_md *msg) bpf_printk("sk_msg2: length update %i->%i\n", len1, len2); } - bpf_printk("sk_msg3: redirect(%iB) err1=%i err2=%i\n", - len1, err1, err2); - return bpf_msg_redirect_map(msg, &sock_map_redir, zero, 0); + f = bpf_map_lookup_elem(&sock_redir_flags, &zero); + if (f && *f) { + key = 2; + flags = *f; + } + bpf_printk("sk_msg3: redirect(%iB) flags=%i err=%i\n", + len1, flags, err1 ? 
err1 : err2); + err = bpf_msg_redirect_map(msg, &sock_map_redir, key, flags); + bpf_printk("sk_msg3: err %i\n", err); + return err; } SEC("sk_msg5") diff --git a/samples/sockmap/sockmap_test.sh b/samples/sockmap/sockmap_test.sh index 6d8cc40cca22..ace75f070eb8 100755 --- a/samples/sockmap/sockmap_test.sh +++ b/samples/sockmap/sockmap_test.sh @@ -1,5 +1,5 @@ #Test a bunch of positive cases to verify basic functionality -for prog in "--txmsg" "--txmsg_redir" "--txmsg_drop"; do +for prog in "--txmsg_redir --txmsg_skb" "--txmsg_redir --txmsg_ingress" "--txmsg" "--txmsg_redir" "--txmsg_redir --txmsg_ingress" "--txmsg_drop"; do for t in "sendmsg" "sendpage"; do for r in 1 10 100; do for i in 1 10 100; do @@ -100,6 +100,25 @@ for t in "sendmsg" "sendpage"; do sleep 2 done +prog="--txmsg_redir --txmsg_apply 1 --txmsg_ingress" + +for t in "sendmsg" "sendpage"; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" + echo $TEST + $TEST + sleep 2 +done + +prog="--txmsg_redir --txmsg_apply 1 --txmsg_skb" + +for t in "sendmsg" "sendpage"; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" + echo $TEST + $TEST + sleep 2 +done + + # Test apply and redirect with larger value than send r=1 i=8 @@ -113,6 +132,25 @@ for t in "sendmsg" "sendpage"; do sleep 2 done +prog="--txmsg_redir --txmsg_apply 2048 --txmsg_ingress" + +for t in "sendmsg" "sendpage"; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" + echo $TEST + $TEST + sleep 2 +done + +prog="--txmsg_redir --txmsg_apply 2048 --txmsg_skb" + +for t in "sendmsg" "sendpage"; do + TEST="./sockmap --cgroup /mnt/cgroup2/ -t $t -r $r -i $i -l $l $prog" + echo $TEST + $TEST + sleep 2 +done + + # Test apply and redirect with apply that never reaches limit r=1024 i=1 diff --git a/samples/sockmap/sockmap_user.c b/samples/sockmap/sockmap_user.c index 07aa237221d1..6f2334912283 100644 --- a/samples/sockmap/sockmap_user.c +++ b/samples/sockmap/sockmap_user.c @@ -64,6 +64,8 @@ int txmsg_apply; int txmsg_cork; int txmsg_start; int txmsg_end; +int txmsg_ingress; +int txmsg_skb; static const struct option long_options[] = { {"help", no_argument, NULL, 'h' }, @@ -83,6 +85,8 @@ static const struct option long_options[] = { {"txmsg_cork", required_argument, NULL, 'k'}, {"txmsg_start", required_argument, NULL, 's'}, {"txmsg_end", required_argument, NULL, 'e'}, + {"txmsg_ingress", no_argument, &txmsg_ingress, 1 }, + {"txmsg_skb", no_argument, &txmsg_skb, 1 }, {0, 0, NULL, 0 } }; @@ -793,6 +797,60 @@ run: return err; } } + + if (txmsg_ingress) { + int in = BPF_F_INGRESS; + + i = 0; + err = bpf_map_update_elem(map_fd[6], &i, &in, BPF_ANY); + if (err) { + fprintf(stderr, + "ERROR: bpf_map_update_elem (txmsg_ingress): %d (%s)\n", + err, strerror(errno)); + } + i = 1; + err = bpf_map_update_elem(map_fd[1], &i, &p1, BPF_ANY); + if (err) { + fprintf(stderr, + "ERROR: bpf_map_update_elem (p1 txmsg): %d (%s)\n", + err, strerror(errno)); + } + err = bpf_map_update_elem(map_fd[2], &i, &p1, BPF_ANY); + if (err) { + fprintf(stderr, + "ERROR: bpf_map_update_elem (p1 redir): %d (%s)\n", + err, strerror(errno)); + } + + i = 2; + err = bpf_map_update_elem(map_fd[2], &i, &p2, BPF_ANY); + if (err) { + fprintf(stderr, + "ERROR: bpf_map_update_elem (p2 txmsg): %d (%s)\n", + err, strerror(errno)); + } + } + + if (txmsg_skb) { + int skb_fd = (test == SENDMSG || test == SENDPAGE) ? 
p2 : p1; + int ingress = BPF_F_INGRESS; + + i = 0; + err = bpf_map_update_elem(map_fd[7], &i, &ingress, BPF_ANY); + if (err) { + fprintf(stderr, + "ERROR: bpf_map_update_elem (txmsg_ingress): %d (%s)\n", + err, strerror(errno)); + } + + i = 3; + err = bpf_map_update_elem(map_fd[0], &i, &skb_fd, BPF_ANY); + if (err) { + fprintf(stderr, + "ERROR: bpf_map_update_elem (c1 sockmap): %d (%s)\n", + err, strerror(errno)); + } + } } if (txmsg_drop) diff --git a/security/apparmor/include/path.h b/security/apparmor/include/path.h index 05fb3305671e..e042b994f2b8 100644 --- a/security/apparmor/include/path.h +++ b/security/apparmor/include/path.h @@ -43,15 +43,10 @@ struct aa_buffers { DECLARE_PER_CPU(struct aa_buffers, aa_buffers); -#define COUNT_ARGS(X...) COUNT_ARGS_HELPER(, ##X, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0) -#define COUNT_ARGS_HELPER(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, n, X...) n -#define CONCAT(X, Y) X ## Y -#define CONCAT_AFTER(X, Y) CONCAT(X, Y) - #define ASSIGN(FN, X, N) ((X) = FN(N)) #define EVAL1(FN, X) ASSIGN(FN, X, 0) /*X = FN(0)*/ #define EVAL2(FN, X, Y...) do { ASSIGN(FN, X, 1); EVAL1(FN, Y); } while (0) -#define EVAL(FN, X...) CONCAT_AFTER(EVAL, COUNT_ARGS(X))(FN, X) +#define EVAL(FN, X...) CONCATENATE(EVAL, COUNT_ARGS(X))(FN, X) #define for_each_cpu_buffer(I) for ((I) = 0; (I) < MAX_PATH_BUFFERS; (I)++) diff --git a/sound/firewire/amdtp-stream-trace.h b/sound/firewire/amdtp-stream-trace.h index ea0d486652c8..54cdd4ffa9ce 100644 --- a/sound/firewire/amdtp-stream-trace.h +++ b/sound/firewire/amdtp-stream-trace.h @@ -14,7 +14,7 @@ #include <linux/tracepoint.h> TRACE_EVENT(in_packet, - TP_PROTO(const struct amdtp_stream *s, u32 cycles, u32 cip_header[2], unsigned int payload_length, unsigned int index), + TP_PROTO(const struct amdtp_stream *s, u32 cycles, u32 *cip_header, unsigned int payload_length, unsigned int index), TP_ARGS(s, cycles, cip_header, payload_length, index), TP_STRUCT__entry( __field(unsigned int, second) diff --git a/tools/bpf/bpftool/xlated_dumper.c b/tools/bpf/bpftool/xlated_dumper.c index 20da835e9e38..7a3173b76c16 100644 --- a/tools/bpf/bpftool/xlated_dumper.c +++ b/tools/bpf/bpftool/xlated_dumper.c @@ -114,7 +114,7 @@ static struct kernel_sym *kernel_syms_search(struct dump_data *dd, sizeof(*dd->sym_mapping), kernel_syms_cmp) : NULL; } -static void print_insn(struct bpf_verifier_env *env, const char *fmt, ...) +static void print_insn(void *private_data, const char *fmt, ...) { va_list args; @@ -124,7 +124,7 @@ static void print_insn(struct bpf_verifier_env *env, const char *fmt, ...) } static void -print_insn_for_graph(struct bpf_verifier_env *env, const char *fmt, ...) +print_insn_for_graph(void *private_data, const char *fmt, ...) { char buf[64], *p; va_list args; @@ -154,7 +154,7 @@ print_insn_for_graph(struct bpf_verifier_env *env, const char *fmt, ...) printf("%s", buf); } -static void print_insn_json(struct bpf_verifier_env *env, const char *fmt, ...) +static void print_insn_json(void *private_data, const char *fmt, ...) 
{ unsigned int l = strlen(fmt); char chomped_fmt[l]; @@ -248,7 +248,7 @@ void dump_xlated_json(struct dump_data *dd, void *buf, unsigned int len, jsonw_start_object(json_wtr); jsonw_name(json_wtr, "disasm"); - print_bpf_insn(&cbs, NULL, insn + i, true); + print_bpf_insn(&cbs, insn + i, true); if (opcodes) { jsonw_name(json_wtr, "opcodes"); @@ -302,7 +302,7 @@ void dump_xlated_plain(struct dump_data *dd, void *buf, unsigned int len, double_insn = insn[i].code == (BPF_LD | BPF_IMM | BPF_DW); printf("% 4d: ", i); - print_bpf_insn(&cbs, NULL, insn + i, true); + print_bpf_insn(&cbs, insn + i, true); if (opcodes) { printf(" "); @@ -331,7 +331,7 @@ void dump_xlated_for_graph(struct dump_data *dd, void *buf_start, void *buf_end, for (; cur <= insn_end; cur++) { printf("% 4d: ", (int)(cur - insn_start + start_idx)); - print_bpf_insn(&cbs, NULL, cur, true); + print_bpf_insn(&cbs, cur, true); if (cur != insn_end) printf(" | "); } diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index d245c41213ac..9d07465023a2 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -94,6 +94,7 @@ enum bpf_cmd { BPF_MAP_GET_FD_BY_ID, BPF_OBJ_GET_INFO_BY_FD, BPF_PROG_QUERY, + BPF_RAW_TRACEPOINT_OPEN, }; enum bpf_map_type { @@ -134,6 +135,8 @@ enum bpf_prog_type { BPF_PROG_TYPE_SK_SKB, BPF_PROG_TYPE_CGROUP_DEVICE, BPF_PROG_TYPE_SK_MSG, + BPF_PROG_TYPE_RAW_TRACEPOINT, + BPF_PROG_TYPE_CGROUP_SOCK_ADDR, }; enum bpf_attach_type { @@ -145,6 +148,12 @@ enum bpf_attach_type { BPF_SK_SKB_STREAM_VERDICT, BPF_CGROUP_DEVICE, BPF_SK_MSG_VERDICT, + BPF_CGROUP_INET4_BIND, + BPF_CGROUP_INET6_BIND, + BPF_CGROUP_INET4_CONNECT, + BPF_CGROUP_INET6_CONNECT, + BPF_CGROUP_INET4_POST_BIND, + BPF_CGROUP_INET6_POST_BIND, __MAX_BPF_ATTACH_TYPE }; @@ -294,6 +303,11 @@ union bpf_attr { __u32 prog_flags; char prog_name[BPF_OBJ_NAME_LEN]; __u32 prog_ifindex; /* ifindex of netdev to prep for */ + /* For some prog types expected attach type must be known at + * load time to verify attach type specific parts of prog + * (context accesses, allowed helpers, etc). + */ + __u32 expected_attach_type; }; struct { /* anonymous struct used by BPF_OBJ_* commands */ @@ -344,6 +358,11 @@ union bpf_attr { __aligned_u64 prog_ids; __u32 prog_cnt; } query; + + struct { + __u64 name; + __u32 prog_fd; + } raw_tracepoint; } __attribute__((aligned(8))); /* BPF helper function descriptions: @@ -729,6 +748,13 @@ union bpf_attr { * @flags: reserved for future use * Return: SK_PASS * + * int bpf_bind(ctx, addr, addr_len) + * Bind socket to address. Only binding to IP is supported, no port can be + * set in addr. + * @ctx: pointer to context of type bpf_sock_addr + * @addr: pointer to struct sockaddr to bind socket to + * @addr_len: length of sockaddr structure + * Return: 0 on success or negative error code */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -794,7 +820,8 @@ union bpf_attr { FN(msg_redirect_map), \ FN(msg_apply_bytes), \ FN(msg_cork_bytes), \ - FN(msg_pull_data), + FN(msg_pull_data), \ + FN(bind), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -922,6 +949,15 @@ struct bpf_sock { __u32 protocol; __u32 mark; __u32 priority; + __u32 src_ip4; /* Allows 1,2,4-byte read. + * Stored in network byte order. + */ + __u32 src_ip6[4]; /* Allows 1,2,4-byte read. + * Stored in network byte order. + */ + __u32 src_port; /* Allows 4-byte read. 
+ * Stored in host byte order + */ }; #define XDP_PACKET_HEADROOM 256 @@ -997,6 +1033,26 @@ struct bpf_map_info { __u64 netns_ino; } __attribute__((aligned(8))); +/* User bpf_sock_addr struct to access socket fields and sockaddr struct passed + * by user and intended to be used by socket (e.g. to bind to, depends on + * attach type). + */ +struct bpf_sock_addr { + __u32 user_family; /* Allows 4-byte read, but no write. */ + __u32 user_ip4; /* Allows 1,2,4-byte read and 4-byte write. + * Stored in network byte order. + */ + __u32 user_ip6[4]; /* Allows 1,2,4-byte read and 4-byte write. + * Stored in network byte order. + */ + __u32 user_port; /* Allows 4-byte read and write. + * Stored in network byte order + */ + __u32 family; /* Allows 4-byte read, but no write */ + __u32 type; /* Allows 4-byte read, but no write */ + __u32 protocol; /* Allows 4-byte read, but no write */ +}; + /* User bpf_sock_ops struct to access socket values and specify request ops * and their replies. * Some of these fields are in network (bigendian) byte order and may need @@ -1151,4 +1207,8 @@ struct bpf_cgroup_dev_ctx { __u32 minor; }; +struct bpf_raw_tracepoint_args { + __u64 args[0]; +}; + #endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index 592a58a2b681..acbb3f8b3bec 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -146,26 +146,30 @@ int bpf_create_map_in_map(enum bpf_map_type map_type, const char *name, -1); } -int bpf_load_program_name(enum bpf_prog_type type, const char *name, - const struct bpf_insn *insns, - size_t insns_cnt, const char *license, - __u32 kern_version, char *log_buf, - size_t log_buf_sz) +int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr, + char *log_buf, size_t log_buf_sz) { - int fd; union bpf_attr attr; - __u32 name_len = name ? strlen(name) : 0; + __u32 name_len; + int fd; + + if (!load_attr) + return -EINVAL; + + name_len = load_attr->name ?
strlen(load_attr->name) : 0; bzero(&attr, sizeof(attr)); - attr.prog_type = type; - attr.insn_cnt = (__u32)insns_cnt; - attr.insns = ptr_to_u64(insns); - attr.license = ptr_to_u64(license); + attr.prog_type = load_attr->prog_type; + attr.expected_attach_type = load_attr->expected_attach_type; + attr.insn_cnt = (__u32)load_attr->insns_cnt; + attr.insns = ptr_to_u64(load_attr->insns); + attr.license = ptr_to_u64(load_attr->license); attr.log_buf = ptr_to_u64(NULL); attr.log_size = 0; attr.log_level = 0; - attr.kern_version = kern_version; - memcpy(attr.prog_name, name, min(name_len, BPF_OBJ_NAME_LEN - 1)); + attr.kern_version = load_attr->kern_version; + memcpy(attr.prog_name, load_attr->name, + min(name_len, BPF_OBJ_NAME_LEN - 1)); fd = sys_bpf(BPF_PROG_LOAD, &attr, sizeof(attr)); if (fd >= 0 || !log_buf || !log_buf_sz) @@ -184,8 +188,18 @@ int bpf_load_program(enum bpf_prog_type type, const struct bpf_insn *insns, __u32 kern_version, char *log_buf, size_t log_buf_sz) { - return bpf_load_program_name(type, NULL, insns, insns_cnt, license, - kern_version, log_buf, log_buf_sz); + struct bpf_load_program_attr load_attr; + + memset(&load_attr, 0, sizeof(struct bpf_load_program_attr)); + load_attr.prog_type = type; + load_attr.expected_attach_type = 0; + load_attr.name = NULL; + load_attr.insns = insns; + load_attr.insns_cnt = insns_cnt; + load_attr.license = license; + load_attr.kern_version = kern_version; + + return bpf_load_program_xattr(&load_attr, log_buf, log_buf_sz); } int bpf_verify_program(enum bpf_prog_type type, const struct bpf_insn *insns, @@ -428,6 +442,17 @@ int bpf_obj_get_info_by_fd(int prog_fd, void *info, __u32 *info_len) return err; } +int bpf_raw_tracepoint_open(const char *name, int prog_fd) +{ + union bpf_attr attr; + + bzero(&attr, sizeof(attr)); + attr.raw_tracepoint.name = ptr_to_u64(name); + attr.raw_tracepoint.prog_fd = prog_fd; + + return sys_bpf(BPF_RAW_TRACEPOINT_OPEN, &attr, sizeof(attr)); +} + int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags) { struct sockaddr_nl sa; diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index 8d18fb73d7fb..39f6a0d64a3b 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -41,13 +41,20 @@ int bpf_create_map_in_map(enum bpf_map_type map_type, const char *name, int key_size, int inner_map_fd, int max_entries, __u32 map_flags); +struct bpf_load_program_attr { + enum bpf_prog_type prog_type; + enum bpf_attach_type expected_attach_type; + const char *name; + const struct bpf_insn *insns; + size_t insns_cnt; + const char *license; + __u32 kern_version; +}; + /* Recommend log buffer size */ #define BPF_LOG_BUF_SIZE (256 * 1024) -int bpf_load_program_name(enum bpf_prog_type type, const char *name, - const struct bpf_insn *insns, - size_t insns_cnt, const char *license, - __u32 kern_version, char *log_buf, - size_t log_buf_sz); +int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr, + char *log_buf, size_t log_buf_sz); int bpf_load_program(enum bpf_prog_type type, const struct bpf_insn *insns, size_t insns_cnt, const char *license, __u32 kern_version, char *log_buf, @@ -79,4 +86,5 @@ int bpf_map_get_fd_by_id(__u32 id); int bpf_obj_get_info_by_fd(int prog_fd, void *info, __u32 *info_len); int bpf_prog_query(int target_fd, enum bpf_attach_type type, __u32 query_flags, __u32 *attach_flags, __u32 *prog_ids, __u32 *prog_cnt); +int bpf_raw_tracepoint_open(const char *name, int prog_fd); #endif diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 64a8fc384186..5922443063f0 100644 --- 
a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -203,6 +203,8 @@ struct bpf_program { struct bpf_object *obj; void *priv; bpf_program_clear_priv_t clear_priv; + + enum bpf_attach_type expected_attach_type; }; struct bpf_map { @@ -1162,21 +1164,31 @@ static int bpf_object__collect_reloc(struct bpf_object *obj) } static int -load_program(enum bpf_prog_type type, const char *name, struct bpf_insn *insns, - int insns_cnt, char *license, u32 kern_version, int *pfd) +load_program(enum bpf_prog_type type, enum bpf_attach_type expected_attach_type, + const char *name, struct bpf_insn *insns, int insns_cnt, + char *license, u32 kern_version, int *pfd) { - int ret; + struct bpf_load_program_attr load_attr; char *log_buf; + int ret; - if (!insns || !insns_cnt) + memset(&load_attr, 0, sizeof(struct bpf_load_program_attr)); + load_attr.prog_type = type; + load_attr.expected_attach_type = expected_attach_type; + load_attr.name = name; + load_attr.insns = insns; + load_attr.insns_cnt = insns_cnt; + load_attr.license = license; + load_attr.kern_version = kern_version; + + if (!load_attr.insns || !load_attr.insns_cnt) return -EINVAL; log_buf = malloc(BPF_LOG_BUF_SIZE); if (!log_buf) pr_warning("Alloc log buffer for bpf loader error, continue without log\n"); - ret = bpf_load_program_name(type, name, insns, insns_cnt, license, - kern_version, log_buf, BPF_LOG_BUF_SIZE); + ret = bpf_load_program_xattr(&load_attr, log_buf, BPF_LOG_BUF_SIZE); if (ret >= 0) { *pfd = ret; @@ -1192,18 +1204,18 @@ load_program(enum bpf_prog_type type, const char *name, struct bpf_insn *insns, pr_warning("-- BEGIN DUMP LOG ---\n"); pr_warning("\n%s\n", log_buf); pr_warning("-- END LOG --\n"); - } else if (insns_cnt >= BPF_MAXINSNS) { - pr_warning("Program too large (%d insns), at most %d insns\n", - insns_cnt, BPF_MAXINSNS); + } else if (load_attr.insns_cnt >= BPF_MAXINSNS) { + pr_warning("Program too large (%zu insns), at most %d insns\n", + load_attr.insns_cnt, BPF_MAXINSNS); ret = -LIBBPF_ERRNO__PROG2BIG; } else { /* Wrong program type? 
*/ - if (type != BPF_PROG_TYPE_KPROBE) { + if (load_attr.prog_type != BPF_PROG_TYPE_KPROBE) { int fd; - fd = bpf_load_program_name(BPF_PROG_TYPE_KPROBE, name, - insns, insns_cnt, license, - kern_version, NULL, 0); + load_attr.prog_type = BPF_PROG_TYPE_KPROBE; + load_attr.expected_attach_type = 0; + fd = bpf_load_program_xattr(&load_attr, NULL, 0); if (fd >= 0) { close(fd); ret = -LIBBPF_ERRNO__PROGTYPE; @@ -1247,8 +1259,9 @@ bpf_program__load(struct bpf_program *prog, pr_warning("Program '%s' is inconsistent: nr(%d) != 1\n", prog->section_name, prog->instances.nr); } - err = load_program(prog->type, prog->name, prog->insns, - prog->insns_cnt, license, kern_version, &fd); + err = load_program(prog->type, prog->expected_attach_type, + prog->name, prog->insns, prog->insns_cnt, + license, kern_version, &fd); if (!err) prog->instances.fds[0] = fd; goto out; @@ -1276,8 +1289,8 @@ bpf_program__load(struct bpf_program *prog, continue; } - err = load_program(prog->type, prog->name, - result.new_insn_ptr, + err = load_program(prog->type, prog->expected_attach_type, + prog->name, result.new_insn_ptr, result.new_insn_cnt, license, kern_version, &fd); @@ -1835,11 +1848,25 @@ BPF_PROG_TYPE_FNS(tracepoint, BPF_PROG_TYPE_TRACEPOINT); BPF_PROG_TYPE_FNS(xdp, BPF_PROG_TYPE_XDP); BPF_PROG_TYPE_FNS(perf_event, BPF_PROG_TYPE_PERF_EVENT); -#define BPF_PROG_SEC(string, type) { string, sizeof(string) - 1, type } +static void bpf_program__set_expected_attach_type(struct bpf_program *prog, + enum bpf_attach_type type) +{ + prog->expected_attach_type = type; +} + +#define BPF_PROG_SEC_FULL(string, ptype, atype) \ + { string, sizeof(string) - 1, ptype, atype } + +#define BPF_PROG_SEC(string, ptype) BPF_PROG_SEC_FULL(string, ptype, 0) + +#define BPF_SA_PROG_SEC(string, ptype) \ + BPF_PROG_SEC_FULL(string, BPF_PROG_TYPE_CGROUP_SOCK_ADDR, ptype) + static const struct { const char *sec; size_t len; enum bpf_prog_type prog_type; + enum bpf_attach_type expected_attach_type; } section_names[] = { BPF_PROG_SEC("socket", BPF_PROG_TYPE_SOCKET_FILTER), BPF_PROG_SEC("kprobe/", BPF_PROG_TYPE_KPROBE), @@ -1858,10 +1885,17 @@ static const struct { BPF_PROG_SEC("sockops", BPF_PROG_TYPE_SOCK_OPS), BPF_PROG_SEC("sk_skb", BPF_PROG_TYPE_SK_SKB), BPF_PROG_SEC("sk_msg", BPF_PROG_TYPE_SK_MSG), + BPF_SA_PROG_SEC("cgroup/bind4", BPF_CGROUP_INET4_BIND), + BPF_SA_PROG_SEC("cgroup/bind6", BPF_CGROUP_INET6_BIND), + BPF_SA_PROG_SEC("cgroup/connect4", BPF_CGROUP_INET4_CONNECT), + BPF_SA_PROG_SEC("cgroup/connect6", BPF_CGROUP_INET6_CONNECT), }; + #undef BPF_PROG_SEC +#undef BPF_PROG_SEC_FULL +#undef BPF_SA_PROG_SEC -static enum bpf_prog_type bpf_program__guess_type(struct bpf_program *prog) +static int bpf_program__identify_section(struct bpf_program *prog) { int i; @@ -1871,13 +1905,13 @@ static enum bpf_prog_type bpf_program__guess_type(struct bpf_program *prog) for (i = 0; i < ARRAY_SIZE(section_names); i++) if (strncmp(prog->section_name, section_names[i].sec, section_names[i].len) == 0) - return section_names[i].prog_type; + return i; err: pr_warning("failed to guess program type based on section name %s\n", prog->section_name); - return BPF_PROG_TYPE_UNSPEC; + return -1; } int bpf_map__fd(struct bpf_map *map) @@ -1977,11 +2011,30 @@ long libbpf_get_error(const void *ptr) int bpf_prog_load(const char *file, enum bpf_prog_type type, struct bpf_object **pobj, int *prog_fd) { + struct bpf_prog_load_attr attr; + + memset(&attr, 0, sizeof(struct bpf_prog_load_attr)); + attr.file = file; + attr.prog_type = type; + attr.expected_attach_type = 0; + + 
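+ /* bpf_prog_load() is kept as a thin compatibility wrapper: all other + * attributes stay zeroed, so existing callers get the old behavior with + * expected_attach_type == 0. + */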
return bpf_prog_load_xattr(&attr, pobj, prog_fd); +} + +int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr, + struct bpf_object **pobj, int *prog_fd) +{ struct bpf_program *prog, *first_prog = NULL; + enum bpf_attach_type expected_attach_type; + enum bpf_prog_type prog_type; struct bpf_object *obj; + int section_idx; int err; - obj = bpf_object__open(file); + if (!attr) + return -EINVAL; + + obj = bpf_object__open(attr->file); if (IS_ERR(obj)) return -ENOENT; @@ -1990,15 +2043,23 @@ int bpf_prog_load(const char *file, enum bpf_prog_type type, * If type is not specified, try to guess it based on * section name. */ - if (type == BPF_PROG_TYPE_UNSPEC) { - type = bpf_program__guess_type(prog); - if (type == BPF_PROG_TYPE_UNSPEC) { + prog_type = attr->prog_type; + expected_attach_type = attr->expected_attach_type; + if (prog_type == BPF_PROG_TYPE_UNSPEC) { + section_idx = bpf_program__identify_section(prog); + if (section_idx < 0) { bpf_object__close(obj); return -EINVAL; } + prog_type = section_names[section_idx].prog_type; + expected_attach_type = + section_names[section_idx].expected_attach_type; } - bpf_program__set_type(prog, type); + bpf_program__set_type(prog, prog_type); + bpf_program__set_expected_attach_type(prog, + expected_attach_type); + if (prog->idx != obj->efile.text_shndx && !first_prog) first_prog = prog; } diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index f85906533cdd..a3a62a583f27 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -248,6 +248,14 @@ int bpf_map__pin(struct bpf_map *map, const char *path); long libbpf_get_error(const void *ptr); +struct bpf_prog_load_attr { + const char *file; + enum bpf_prog_type prog_type; + enum bpf_attach_type expected_attach_type; +}; + +int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr, + struct bpf_object **pobj, int *prog_fd); int bpf_prog_load(const char *file, enum bpf_prog_type type, struct bpf_object **pobj, int *prog_fd); diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index f35fb02bdf56..0a315ddabbf4 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -23,21 +23,23 @@ urandom_read: urandom_read.c # Order correspond to 'make run_tests' order TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs \ - test_align test_verifier_log test_dev_cgroup test_tcpbpf_user + test_align test_verifier_log test_dev_cgroup test_tcpbpf_user \ + test_sock test_sock_addr TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test_obj_id.o \ test_pkt_md_access.o test_xdp_redirect.o test_xdp_meta.o sockmap_parse_prog.o \ sockmap_verdict_prog.o dev_cgroup.o sample_ret0.o test_tracepoint.o \ test_l4lb_noinline.o test_xdp_noinline.o test_stacktrace_map.o \ sample_map_ret0.o test_tcpbpf_kern.o test_stacktrace_build_id.o \ - sockmap_tcp_msg_prog.o + sockmap_tcp_msg_prog.o connect4_prog.o connect6_prog.o # Order correspond to 'make run_tests' order TEST_PROGS := test_kmod.sh \ test_libbpf.sh \ test_xdp_redirect.sh \ test_xdp_meta.sh \ - test_offload.py + test_offload.py \ + test_sock_addr.sh # Compile but not part of 'make run_tests' TEST_GEN_PROGS_EXTENDED = test_libbpf_open @@ -51,6 +53,8 @@ $(TEST_GEN_PROGS): $(BPFOBJ) $(TEST_GEN_PROGS_EXTENDED): $(OUTPUT)/libbpf.a $(OUTPUT)/test_dev_cgroup: cgroup_helpers.c +$(OUTPUT)/test_sock: cgroup_helpers.c +$(OUTPUT)/test_sock_addr: cgroup_helpers.c .PHONY: force diff --git a/tools/testing/selftests/bpf/bpf_helpers.h 
b/tools/testing/selftests/bpf/bpf_helpers.h index 7cae376d8d0c..d8223d99f96d 100644 --- a/tools/testing/selftests/bpf/bpf_helpers.h +++ b/tools/testing/selftests/bpf/bpf_helpers.h @@ -94,6 +94,8 @@ static int (*bpf_msg_cork_bytes)(void *ctx, int len) = (void *) BPF_FUNC_msg_cork_bytes; static int (*bpf_msg_pull_data)(void *ctx, int start, int end, int flags) = (void *) BPF_FUNC_msg_pull_data; +static int (*bpf_bind)(void *ctx, void *addr, int addr_len) = + (void *) BPF_FUNC_bind; /* llvm builtin functions that eBPF C program may use to * emit BPF_LD_ABS and BPF_LD_IND instructions diff --git a/tools/testing/selftests/bpf/connect4_prog.c b/tools/testing/selftests/bpf/connect4_prog.c new file mode 100644 index 000000000000..5a88a681d2ab --- /dev/null +++ b/tools/testing/selftests/bpf/connect4_prog.c @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2018 Facebook + +#include <string.h> + +#include <linux/stddef.h> +#include <linux/bpf.h> +#include <linux/in.h> +#include <linux/in6.h> +#include <sys/socket.h> + +#include "bpf_helpers.h" +#include "bpf_endian.h" + +#define SRC_REWRITE_IP4 0x7f000004U +#define DST_REWRITE_IP4 0x7f000001U +#define DST_REWRITE_PORT4 4444 + +int _version SEC("version") = 1; + +SEC("cgroup/connect4") +int connect_v4_prog(struct bpf_sock_addr *ctx) +{ + struct sockaddr_in sa; + + /* Rewrite destination. */ + ctx->user_ip4 = bpf_htonl(DST_REWRITE_IP4); + ctx->user_port = bpf_htons(DST_REWRITE_PORT4); + + if (ctx->type == SOCK_DGRAM || ctx->type == SOCK_STREAM) { + /* Rewrite source. */ + memset(&sa, 0, sizeof(sa)); + + sa.sin_family = AF_INET; + sa.sin_port = bpf_htons(0); + sa.sin_addr.s_addr = bpf_htonl(SRC_REWRITE_IP4); + + if (bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa)) != 0) + return 0; + } + + return 1; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/connect6_prog.c b/tools/testing/selftests/bpf/connect6_prog.c new file mode 100644 index 000000000000..8ea3f7d12dee --- /dev/null +++ b/tools/testing/selftests/bpf/connect6_prog.c @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2018 Facebook + +#include <string.h> + +#include <linux/stddef.h> +#include <linux/bpf.h> +#include <linux/in.h> +#include <linux/in6.h> +#include <sys/socket.h> + +#include "bpf_helpers.h" +#include "bpf_endian.h" + +#define SRC_REWRITE_IP6_0 0 +#define SRC_REWRITE_IP6_1 0 +#define SRC_REWRITE_IP6_2 0 +#define SRC_REWRITE_IP6_3 6 + +#define DST_REWRITE_IP6_0 0 +#define DST_REWRITE_IP6_1 0 +#define DST_REWRITE_IP6_2 0 +#define DST_REWRITE_IP6_3 1 + +#define DST_REWRITE_PORT6 6666 + +int _version SEC("version") = 1; + +SEC("cgroup/connect6") +int connect_v6_prog(struct bpf_sock_addr *ctx) +{ + struct sockaddr_in6 sa; + + /* Rewrite destination. */ + ctx->user_ip6[0] = bpf_htonl(DST_REWRITE_IP6_0); + ctx->user_ip6[1] = bpf_htonl(DST_REWRITE_IP6_1); + ctx->user_ip6[2] = bpf_htonl(DST_REWRITE_IP6_2); + ctx->user_ip6[3] = bpf_htonl(DST_REWRITE_IP6_3); + + ctx->user_port = bpf_htons(DST_REWRITE_PORT6); + + if (ctx->type == SOCK_DGRAM || ctx->type == SOCK_STREAM) { + /* Rewrite source. 
*/ + memset(&sa, 0, sizeof(sa)); + + sa.sin6_family = AF_INET6; + sa.sin6_port = bpf_htons(0); + + sa.sin6_addr.s6_addr32[0] = bpf_htonl(SRC_REWRITE_IP6_0); + sa.sin6_addr.s6_addr32[1] = bpf_htonl(SRC_REWRITE_IP6_1); + sa.sin6_addr.s6_addr32[2] = bpf_htonl(SRC_REWRITE_IP6_2); + sa.sin6_addr.s6_addr32[3] = bpf_htonl(SRC_REWRITE_IP6_3); + + if (bpf_bind(ctx, (struct sockaddr *)&sa, sizeof(sa)) != 0) + return 0; + } + + return 1; +} + +char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index e9df48b306df..faadbe233966 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -877,7 +877,7 @@ static void test_stacktrace_map() err = bpf_prog_load(file, BPF_PROG_TYPE_TRACEPOINT, &obj, &prog_fd); if (CHECK(err, "prog_load", "err %d errno %d\n", err, errno)) - goto out; + return; /* Get the ID for the sched/sched_switch tracepoint */ snprintf(buf, sizeof(buf), @@ -888,8 +888,7 @@ static void test_stacktrace_map() bytes = read(efd, buf, sizeof(buf)); close(efd); - if (CHECK(bytes <= 0 || bytes >= sizeof(buf), - "read", "bytes %d errno %d\n", bytes, errno)) + if (bytes <= 0 || bytes >= sizeof(buf)) goto close_prog; /* Open the perf event and attach bpf progrram */ @@ -906,29 +905,24 @@ static void test_stacktrace_map() goto close_prog; err = ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0); - if (CHECK(err, "perf_event_ioc_enable", "err %d errno %d\n", - err, errno)) - goto close_pmu; + if (err) + goto disable_pmu; err = ioctl(pmu_fd, PERF_EVENT_IOC_SET_BPF, prog_fd); - if (CHECK(err, "perf_event_ioc_set_bpf", "err %d errno %d\n", - err, errno)) + if (err) goto disable_pmu; /* find map fds */ control_map_fd = bpf_find_map(__func__, obj, "control_map"); - if (CHECK(control_map_fd < 0, "bpf_find_map control_map", - "err %d errno %d\n", err, errno)) + if (control_map_fd < 0) goto disable_pmu; stackid_hmap_fd = bpf_find_map(__func__, obj, "stackid_hmap"); - if (CHECK(stackid_hmap_fd < 0, "bpf_find_map stackid_hmap", - "err %d errno %d\n", err, errno)) + if (stackid_hmap_fd < 0) goto disable_pmu; stackmap_fd = bpf_find_map(__func__, obj, "stackmap"); - if (CHECK(stackmap_fd < 0, "bpf_find_map stackmap", "err %d errno %d\n", - err, errno)) + if (stackmap_fd < 0) goto disable_pmu; /* give some time for bpf program run */ @@ -945,24 +939,78 @@ static void test_stacktrace_map() err = compare_map_keys(stackid_hmap_fd, stackmap_fd); if (CHECK(err, "compare_map_keys stackid_hmap vs. stackmap", "err %d errno %d\n", err, errno)) - goto disable_pmu; + goto disable_pmu_noerr; err = compare_map_keys(stackmap_fd, stackid_hmap_fd); if (CHECK(err, "compare_map_keys stackmap vs. 
stackid_hmap", "err %d errno %d\n", err, errno)) - ; /* fall through */ + goto disable_pmu_noerr; + goto disable_pmu_noerr; disable_pmu: + error_cnt++; +disable_pmu_noerr: ioctl(pmu_fd, PERF_EVENT_IOC_DISABLE); - -close_pmu: close(pmu_fd); - close_prog: bpf_object__close(obj); +} -out: - return; +static void test_stacktrace_map_raw_tp() +{ + int control_map_fd, stackid_hmap_fd, stackmap_fd; + const char *file = "./test_stacktrace_map.o"; + int efd, err, prog_fd; + __u32 key, val, duration = 0; + struct bpf_object *obj; + + err = bpf_prog_load(file, BPF_PROG_TYPE_RAW_TRACEPOINT, &obj, &prog_fd); + if (CHECK(err, "prog_load raw tp", "err %d errno %d\n", err, errno)) + return; + + efd = bpf_raw_tracepoint_open("sched_switch", prog_fd); + if (CHECK(efd < 0, "raw_tp_open", "err %d errno %d\n", efd, errno)) + goto close_prog; + + /* find map fds */ + control_map_fd = bpf_find_map(__func__, obj, "control_map"); + if (control_map_fd < 0) + goto close_prog; + + stackid_hmap_fd = bpf_find_map(__func__, obj, "stackid_hmap"); + if (stackid_hmap_fd < 0) + goto close_prog; + + stackmap_fd = bpf_find_map(__func__, obj, "stackmap"); + if (stackmap_fd < 0) + goto close_prog; + + /* give some time for bpf program run */ + sleep(1); + + /* disable stack trace collection */ + key = 0; + val = 1; + bpf_map_update_elem(control_map_fd, &key, &val, 0); + + /* for every element in stackid_hmap, we can find a corresponding one + * in stackmap, and vise versa. + */ + err = compare_map_keys(stackid_hmap_fd, stackmap_fd); + if (CHECK(err, "compare_map_keys stackid_hmap vs. stackmap", + "err %d errno %d\n", err, errno)) + goto close_prog; + + err = compare_map_keys(stackmap_fd, stackid_hmap_fd); + if (CHECK(err, "compare_map_keys stackmap vs. stackid_hmap", + "err %d errno %d\n", err, errno)) + goto close_prog; + + goto close_prog_noerr; +close_prog: + error_cnt++; +close_prog_noerr: + bpf_object__close(obj); } static int extract_build_id(char *build_id, size_t size) @@ -1138,6 +1186,7 @@ int main(void) test_tp_attach_query(); test_stacktrace_map(); test_stacktrace_build_id(); + test_stacktrace_map_raw_tp(); printf("Summary: %d PASSED, %d FAILED\n", pass_cnt, error_cnt); return error_cnt ? 
EXIT_FAILURE : EXIT_SUCCESS; diff --git a/tools/testing/selftests/bpf/test_sock.c b/tools/testing/selftests/bpf/test_sock.c new file mode 100644 index 000000000000..73bb20cfb9b7 --- /dev/null +++ b/tools/testing/selftests/bpf/test_sock.c @@ -0,0 +1,479 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2018 Facebook + +#include <stdio.h> +#include <unistd.h> + +#include <arpa/inet.h> +#include <sys/types.h> +#include <sys/socket.h> + +#include <linux/filter.h> + +#include <bpf/bpf.h> + +#include "cgroup_helpers.h" + +#ifndef ARRAY_SIZE +# define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) +#endif + +#define CG_PATH "/foo" +#define MAX_INSNS 512 + +char bpf_log_buf[BPF_LOG_BUF_SIZE]; + +struct sock_test { + const char *descr; + /* BPF prog properties */ + struct bpf_insn insns[MAX_INSNS]; + enum bpf_attach_type expected_attach_type; + enum bpf_attach_type attach_type; + /* Socket properties */ + int domain; + int type; + /* Endpoint to bind() to */ + const char *ip; + unsigned short port; + /* Expected test result */ + enum { + LOAD_REJECT, + ATTACH_REJECT, + BIND_REJECT, + SUCCESS, + } result; +}; + +static struct sock_test tests[] = { + { + "bind4 load with invalid access: src_ip6", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, + offsetof(struct bpf_sock, src_ip6[0])), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + BPF_CGROUP_INET4_POST_BIND, + BPF_CGROUP_INET4_POST_BIND, + 0, + 0, + NULL, + 0, + LOAD_REJECT, + }, + { + "bind4 load with invalid access: mark", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, + offsetof(struct bpf_sock, mark)), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + BPF_CGROUP_INET4_POST_BIND, + BPF_CGROUP_INET4_POST_BIND, + 0, + 0, + NULL, + 0, + LOAD_REJECT, + }, + { + "bind6 load with invalid access: src_ip4", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, + offsetof(struct bpf_sock, src_ip4)), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + BPF_CGROUP_INET6_POST_BIND, + BPF_CGROUP_INET6_POST_BIND, + 0, + 0, + NULL, + 0, + LOAD_REJECT, + }, + { + "sock_create load with invalid access: src_port", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, + offsetof(struct bpf_sock, src_port)), + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + BPF_CGROUP_INET_SOCK_CREATE, + BPF_CGROUP_INET_SOCK_CREATE, + 0, + 0, + NULL, + 0, + LOAD_REJECT, + }, + { + "sock_create load w/o expected_attach_type (compat mode)", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + 0, + BPF_CGROUP_INET_SOCK_CREATE, + AF_INET, + SOCK_STREAM, + "127.0.0.1", + 8097, + SUCCESS, + }, + { + "sock_create load w/ expected_attach_type", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + BPF_CGROUP_INET_SOCK_CREATE, + BPF_CGROUP_INET_SOCK_CREATE, + AF_INET, + SOCK_STREAM, + "127.0.0.1", + 8097, + SUCCESS, + }, + { + "attach type mismatch bind4 vs bind6", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + BPF_CGROUP_INET4_POST_BIND, + BPF_CGROUP_INET6_POST_BIND, + 0, + 0, + NULL, + 0, + ATTACH_REJECT, + }, + { + "attach type mismatch bind6 vs bind4", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + BPF_CGROUP_INET6_POST_BIND, + BPF_CGROUP_INET4_POST_BIND, + 0, + 0, + NULL, + 0, + ATTACH_REJECT, + }, + { + "attach type mismatch default vs bind4", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 1), + 
BPF_EXIT_INSN(), + }, + 0, + BPF_CGROUP_INET4_POST_BIND, + 0, + 0, + NULL, + 0, + ATTACH_REJECT, + }, + { + "attach type mismatch bind6 vs sock_create", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + BPF_CGROUP_INET6_POST_BIND, + BPF_CGROUP_INET_SOCK_CREATE, + 0, + 0, + NULL, + 0, + ATTACH_REJECT, + }, + { + "bind4 reject all", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + BPF_CGROUP_INET4_POST_BIND, + BPF_CGROUP_INET4_POST_BIND, + AF_INET, + SOCK_STREAM, + "0.0.0.0", + 0, + BIND_REJECT, + }, + { + "bind6 reject all", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + BPF_CGROUP_INET6_POST_BIND, + BPF_CGROUP_INET6_POST_BIND, + AF_INET6, + SOCK_STREAM, + "::", + 0, + BIND_REJECT, + }, + { + "bind6 deny specific IP & port", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + + /* if (ip == expected && port == expected) */ + BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, + offsetof(struct bpf_sock, src_ip6[3])), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 0x01000000, 4), + BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, + offsetof(struct bpf_sock, src_port)), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 0x2001, 2), + + /* return DENY; */ + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_JMP_A(1), + + /* else return ALLOW; */ + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + BPF_CGROUP_INET6_POST_BIND, + BPF_CGROUP_INET6_POST_BIND, + AF_INET6, + SOCK_STREAM, + "::1", + 8193, + BIND_REJECT, + }, + { + "bind4 allow specific IP & port", + .insns = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + + /* if (ip == expected && port == expected) */ + BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, + offsetof(struct bpf_sock, src_ip4)), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 0x0100007F, 4), + BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, + offsetof(struct bpf_sock, src_port)), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, 0x1002, 2), + + /* return ALLOW; */ + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_JMP_A(1), + + /* else return DENY; */ + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + BPF_CGROUP_INET4_POST_BIND, + BPF_CGROUP_INET4_POST_BIND, + AF_INET, + SOCK_STREAM, + "127.0.0.1", + 4098, + SUCCESS, + }, + { + "bind4 allow all", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + BPF_CGROUP_INET4_POST_BIND, + BPF_CGROUP_INET4_POST_BIND, + AF_INET, + SOCK_STREAM, + "0.0.0.0", + 0, + SUCCESS, + }, + { + "bind6 allow all", + .insns = { + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }, + BPF_CGROUP_INET6_POST_BIND, + BPF_CGROUP_INET6_POST_BIND, + AF_INET6, + SOCK_STREAM, + "::", + 0, + SUCCESS, + }, +}; + +static size_t probe_prog_length(const struct bpf_insn *fp) +{ + size_t len; + + for (len = MAX_INSNS - 1; len > 0; --len) + if (fp[len].code != 0 || fp[len].imm != 0) + break; + return len + 1; +} + +static int load_sock_prog(const struct bpf_insn *prog, + enum bpf_attach_type attach_type) +{ + struct bpf_load_program_attr attr; + + memset(&attr, 0, sizeof(struct bpf_load_program_attr)); + attr.prog_type = BPF_PROG_TYPE_CGROUP_SOCK; + attr.expected_attach_type = attach_type; + attr.insns = prog; + attr.insns_cnt = probe_prog_length(attr.insns); + attr.license = "GPL"; + + return bpf_load_program_xattr(&attr, bpf_log_buf, BPF_LOG_BUF_SIZE); +} + +static int attach_sock_prog(int cgfd, int progfd, + enum bpf_attach_type attach_type) +{ + return bpf_prog_attach(progfd, cgfd, attach_type, BPF_F_ALLOW_OVERRIDE); +} + +static int bind_sock(int domain, int type, const char *ip, unsigned short port) +{ + struct sockaddr_storage addr; + struct sockaddr_in6 *addr6; + struct sockaddr_in *addr4; + 
int sockfd = -1; + socklen_t len; + int err = 0; + + sockfd = socket(domain, type, 0); + if (sockfd < 0) + goto err; + + memset(&addr, 0, sizeof(addr)); + + if (domain == AF_INET) { + len = sizeof(struct sockaddr_in); + addr4 = (struct sockaddr_in *)&addr; + addr4->sin_family = domain; + addr4->sin_port = htons(port); + if (inet_pton(domain, ip, (void *)&addr4->sin_addr) != 1) + goto err; + } else if (domain == AF_INET6) { + len = sizeof(struct sockaddr_in6); + addr6 = (struct sockaddr_in6 *)&addr; + addr6->sin6_family = domain; + addr6->sin6_port = htons(port); + if (inet_pton(domain, ip, (void *)&addr6->sin6_addr) != 1) + goto err; + } else { + goto err; + } + + if (bind(sockfd, (const struct sockaddr *)&addr, len) == -1) + goto err; + + goto out; +err: + err = -1; +out: + close(sockfd); + return err; +} + +static int run_test_case(int cgfd, const struct sock_test *test) +{ + int progfd = -1; + int err = 0; + + printf("Test case: %s .. ", test->descr); + progfd = load_sock_prog(test->insns, test->expected_attach_type); + if (progfd < 0) { + if (test->result == LOAD_REJECT) + goto out; + else + goto err; + } + + if (attach_sock_prog(cgfd, progfd, test->attach_type) == -1) { + if (test->result == ATTACH_REJECT) + goto out; + else + goto err; + } + + if (bind_sock(test->domain, test->type, test->ip, test->port) == -1) { + /* sys_bind() may fail for different reasons; errno has to be + * checked to confirm that the BPF program rejected it. + */ + if (test->result == BIND_REJECT && errno == EPERM) + goto out; + else + goto err; + } + + + if (test->result != SUCCESS) + goto err; + + goto out; +err: + err = -1; +out: + /* Detaching w/o checking return code: best effort attempt. */ + if (progfd != -1) + bpf_prog_detach(cgfd, test->attach_type); + close(progfd); + printf("[%s]\n", err ? "FAIL" : "PASS"); + return err; +} + +static int run_tests(int cgfd) +{ + int passes = 0; + int fails = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(tests); ++i) { + if (run_test_case(cgfd, &tests[i])) + ++fails; + else + ++passes; + } + printf("Summary: %d PASSED, %d FAILED\n", passes, fails); + return fails ? 
-1 : 0; +} + +int main(int argc, char **argv) +{ + int cgfd = -1; + int err = 0; + + if (setup_cgroup_environment()) + goto err; + + cgfd = create_and_get_cgroup(CG_PATH); + if (!cgfd) + goto err; + + if (join_cgroup(CG_PATH)) + goto err; + + if (run_tests(cgfd)) + goto err; + + goto out; +err: + err = -1; +out: + close(cgfd); + cleanup_cgroup_environment(); + return err; +} diff --git a/tools/testing/selftests/bpf/test_sock_addr.c b/tools/testing/selftests/bpf/test_sock_addr.c new file mode 100644 index 000000000000..d488f20926e8 --- /dev/null +++ b/tools/testing/selftests/bpf/test_sock_addr.c @@ -0,0 +1,588 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2018 Facebook + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> + +#include <arpa/inet.h> +#include <sys/types.h> +#include <sys/socket.h> + +#include <linux/filter.h> + +#include <bpf/bpf.h> +#include <bpf/libbpf.h> + +#include "cgroup_helpers.h" + +#define CG_PATH "/foo" +#define CONNECT4_PROG_PATH "./connect4_prog.o" +#define CONNECT6_PROG_PATH "./connect6_prog.o" + +#define SERV4_IP "192.168.1.254" +#define SERV4_REWRITE_IP "127.0.0.1" +#define SERV4_PORT 4040 +#define SERV4_REWRITE_PORT 4444 + +#define SERV6_IP "face:b00c:1234:5678::abcd" +#define SERV6_REWRITE_IP "::1" +#define SERV6_PORT 6060 +#define SERV6_REWRITE_PORT 6666 + +#define INET_NTOP_BUF 40 + +typedef int (*load_fn)(enum bpf_attach_type, const char *comment); +typedef int (*info_fn)(int, struct sockaddr *, socklen_t *); + +struct program { + enum bpf_attach_type type; + load_fn loadfn; + int fd; + const char *name; + enum bpf_attach_type invalid_type; +}; + +char bpf_log_buf[BPF_LOG_BUF_SIZE]; + +static int mk_sockaddr(int domain, const char *ip, unsigned short port, + struct sockaddr *addr, socklen_t addr_len) +{ + struct sockaddr_in6 *addr6; + struct sockaddr_in *addr4; + + if (domain != AF_INET && domain != AF_INET6) { + log_err("Unsupported address family"); + return -1; + } + + memset(addr, 0, addr_len); + + if (domain == AF_INET) { + if (addr_len < sizeof(struct sockaddr_in)) + return -1; + addr4 = (struct sockaddr_in *)addr; + addr4->sin_family = domain; + addr4->sin_port = htons(port); + if (inet_pton(domain, ip, (void *)&addr4->sin_addr) != 1) { + log_err("Invalid IPv4: %s", ip); + return -1; + } + } else if (domain == AF_INET6) { + if (addr_len < sizeof(struct sockaddr_in6)) + return -1; + addr6 = (struct sockaddr_in6 *)addr; + addr6->sin6_family = domain; + addr6->sin6_port = htons(port); + if (inet_pton(domain, ip, (void *)&addr6->sin6_addr) != 1) { + log_err("Invalid IPv6: %s", ip); + return -1; + } + } + + return 0; +} + +static int load_insns(enum bpf_attach_type attach_type, + const struct bpf_insn *insns, size_t insns_cnt, + const char *comment) +{ + struct bpf_load_program_attr load_attr; + int ret; + + memset(&load_attr, 0, sizeof(struct bpf_load_program_attr)); + load_attr.prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; + load_attr.expected_attach_type = attach_type; + load_attr.insns = insns; + load_attr.insns_cnt = insns_cnt; + load_attr.license = "GPL"; + + ret = bpf_load_program_xattr(&load_attr, bpf_log_buf, BPF_LOG_BUF_SIZE); + if (ret < 0 && comment) { + log_err(">>> Loading %s program error.\n" + ">>> Output from verifier:\n%s\n-------\n", + comment, bpf_log_buf); + } + + return ret; +} + +/* [1] These testing programs try to read different context fields, including + * narrow loads of different sizes from user_ip4 and user_ip6, and write to + * those allowed to be overridden. 
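+ * (A narrow load here means a BPF_B or BPF_H access to part of a field: the + * programs below read the same user_ip4/user_ip6 words with 1-, 2- and + * 4-byte loads and compare each piece against the expected address.)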
+ * + * [2] BPF_LD_IMM64 & BPF_JMP_REG are used below whenever there is a need to + * compare a register with an unsigned 32-bit integer. BPF_JMP_IMM can't be + * used in such cases since it accepts only a _signed_ 32-bit integer as its + * IMM argument. Also note that BPF_LD_IMM64 expands to 2 instructions, which + * matters when counting jump offsets. + */ + +static int bind4_prog_load(enum bpf_attach_type attach_type, + const char *comment) +{ + union { + uint8_t u4_addr8[4]; + uint16_t u4_addr16[2]; + uint32_t u4_addr32; + } ip4; + struct sockaddr_in addr4_rw; + + if (inet_pton(AF_INET, SERV4_IP, (void *)&ip4) != 1) { + log_err("Invalid IPv4: %s", SERV4_IP); + return -1; + } + + if (mk_sockaddr(AF_INET, SERV4_REWRITE_IP, SERV4_REWRITE_PORT, + (struct sockaddr *)&addr4_rw, sizeof(addr4_rw)) == -1) + return -1; + + /* See [1]. */ + struct bpf_insn insns[] = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + + /* if (sk.family == AF_INET && */ + BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, + offsetof(struct bpf_sock_addr, family)), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, AF_INET, 16), + + /* (sk.type == SOCK_DGRAM || sk.type == SOCK_STREAM) && */ + BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, + offsetof(struct bpf_sock_addr, type)), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, SOCK_DGRAM, 1), + BPF_JMP_A(1), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, SOCK_STREAM, 12), + + /* 1st_byte_of_user_ip4 == expected && */ + BPF_LDX_MEM(BPF_B, BPF_REG_7, BPF_REG_6, + offsetof(struct bpf_sock_addr, user_ip4)), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr8[0], 10), + + /* 1st_half_of_user_ip4 == expected && */ + BPF_LDX_MEM(BPF_H, BPF_REG_7, BPF_REG_6, + offsetof(struct bpf_sock_addr, user_ip4)), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip4.u4_addr16[0], 8), + + /* whole_user_ip4 == expected) { */ + BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, + offsetof(struct bpf_sock_addr, user_ip4)), + BPF_LD_IMM64(BPF_REG_8, ip4.u4_addr32), /* See [2]. */ + BPF_JMP_REG(BPF_JNE, BPF_REG_7, BPF_REG_8, 4), + + /* user_ip4 = addr4_rw.sin_addr */ + BPF_MOV32_IMM(BPF_REG_7, addr4_rw.sin_addr.s_addr), + BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7, + offsetof(struct bpf_sock_addr, user_ip4)), + + /* user_port = addr4_rw.sin_port */ + BPF_MOV32_IMM(BPF_REG_7, addr4_rw.sin_port), + BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7, + offsetof(struct bpf_sock_addr, user_port)), + /* } */ + + /* return 1 */ + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }; + + return load_insns(attach_type, insns, + sizeof(insns) / sizeof(struct bpf_insn), comment); +} + +static int bind6_prog_load(enum bpf_attach_type attach_type, + const char *comment) +{ + struct sockaddr_in6 addr6_rw; + struct in6_addr ip6; + + if (inet_pton(AF_INET6, SERV6_IP, (void *)&ip6) != 1) { + log_err("Invalid IPv6: %s", SERV6_IP); + return -1; + } + + if (mk_sockaddr(AF_INET6, SERV6_REWRITE_IP, SERV6_REWRITE_PORT, + (struct sockaddr *)&addr6_rw, sizeof(addr6_rw)) == -1) + return -1; + + /* See [1]. 
*/ + struct bpf_insn insns[] = { + BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), + + /* if (sk.family == AF_INET6 && */ + BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, + offsetof(struct bpf_sock_addr, family)), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, AF_INET6, 18), + + /* 5th_byte_of_user_ip6 == expected && */ + BPF_LDX_MEM(BPF_B, BPF_REG_7, BPF_REG_6, + offsetof(struct bpf_sock_addr, user_ip6[1])), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip6.s6_addr[4], 16), + + /* 3rd_half_of_user_ip6 == expected && */ + BPF_LDX_MEM(BPF_H, BPF_REG_7, BPF_REG_6, + offsetof(struct bpf_sock_addr, user_ip6[1])), + BPF_JMP_IMM(BPF_JNE, BPF_REG_7, ip6.s6_addr16[2], 14), + + /* last_word_of_user_ip6 == expected) { */ + BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, + offsetof(struct bpf_sock_addr, user_ip6[3])), + BPF_LD_IMM64(BPF_REG_8, ip6.s6_addr32[3]), /* See [2]. */ + BPF_JMP_REG(BPF_JNE, BPF_REG_7, BPF_REG_8, 10), + + +#define STORE_IPV6_WORD(N) \ + BPF_MOV32_IMM(BPF_REG_7, addr6_rw.sin6_addr.s6_addr32[N]), \ + BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7, \ + offsetof(struct bpf_sock_addr, user_ip6[N])) + + /* user_ip6 = addr6_rw.sin6_addr */ + STORE_IPV6_WORD(0), + STORE_IPV6_WORD(1), + STORE_IPV6_WORD(2), + STORE_IPV6_WORD(3), + + /* user_port = addr6_rw.sin6_port */ + BPF_MOV32_IMM(BPF_REG_7, addr6_rw.sin6_port), + BPF_STX_MEM(BPF_W, BPF_REG_6, BPF_REG_7, + offsetof(struct bpf_sock_addr, user_port)), + + /* } */ + + /* return 1 */ + BPF_MOV64_IMM(BPF_REG_0, 1), + BPF_EXIT_INSN(), + }; + + return load_insns(attach_type, insns, + sizeof(insns) / sizeof(struct bpf_insn), comment); +} + +static int connect_prog_load_path(const char *path, + enum bpf_attach_type attach_type, + const char *comment) +{ + struct bpf_prog_load_attr attr; + struct bpf_object *obj; + int prog_fd; + + memset(&attr, 0, sizeof(struct bpf_prog_load_attr)); + attr.file = path; + attr.prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR; + attr.expected_attach_type = attach_type; + + if (bpf_prog_load_xattr(&attr, &obj, &prog_fd)) { + if (comment) + log_err(">>> Loading %s program at %s error.\n", + comment, path); + return -1; + } + + return prog_fd; +} + +static int connect4_prog_load(enum bpf_attach_type attach_type, + const char *comment) +{ + return connect_prog_load_path(CONNECT4_PROG_PATH, attach_type, comment); +} + +static int connect6_prog_load(enum bpf_attach_type attach_type, + const char *comment) +{ + return connect_prog_load_path(CONNECT6_PROG_PATH, attach_type, comment); +} + +static void print_ip_port(int sockfd, info_fn fn, const char *fmt) +{ + char addr_buf[INET_NTOP_BUF]; + struct sockaddr_storage addr; + struct sockaddr_in6 *addr6; + struct sockaddr_in *addr4; + socklen_t addr_len; + unsigned short port; + void *nip; + + addr_len = sizeof(struct sockaddr_storage); + memset(&addr, 0, addr_len); + + if (fn(sockfd, (struct sockaddr *)&addr, (socklen_t *)&addr_len) == 0) { + if (addr.ss_family == AF_INET) { + addr4 = (struct sockaddr_in *)&addr; + nip = (void *)&addr4->sin_addr; + port = ntohs(addr4->sin_port); + } else if (addr.ss_family == AF_INET6) { + addr6 = (struct sockaddr_in6 *)&addr; + nip = (void *)&addr6->sin6_addr; + port = ntohs(addr6->sin6_port); + } else { + return; + } + const char *addr_str = + inet_ntop(addr.ss_family, nip, addr_buf, INET_NTOP_BUF); + printf(fmt, addr_str ? 
addr_str : "??", port); + } +} + +static void print_local_ip_port(int sockfd, const char *fmt) +{ + print_ip_port(sockfd, getsockname, fmt); +} + +static void print_remote_ip_port(int sockfd, const char *fmt) +{ + print_ip_port(sockfd, getpeername, fmt); +} + +static int start_server(int type, const struct sockaddr_storage *addr, + socklen_t addr_len) +{ + + int fd; + + fd = socket(addr->ss_family, type, 0); + if (fd == -1) { + log_err("Failed to create server socket"); + goto out; + } + + if (bind(fd, (const struct sockaddr *)addr, addr_len) == -1) { + log_err("Failed to bind server socket"); + goto close_out; + } + + if (type == SOCK_STREAM) { + if (listen(fd, 128) == -1) { + log_err("Failed to listen on server socket"); + goto close_out; + } + } + + print_local_ip_port(fd, "\t Actual: bind(%s, %d)\n"); + + goto out; +close_out: + close(fd); + fd = -1; +out: + return fd; +} + +static int connect_to_server(int type, const struct sockaddr_storage *addr, + socklen_t addr_len) +{ + int domain; + int fd; + + domain = addr->ss_family; + + if (domain != AF_INET && domain != AF_INET6) { + log_err("Unsupported address family"); + return -1; + } + + fd = socket(domain, type, 0); + if (fd == -1) { + log_err("Failed to creating client socket"); + return -1; + } + + if (connect(fd, (const struct sockaddr *)addr, addr_len) == -1) { + log_err("Fail to connect to server"); + goto err; + } + + print_remote_ip_port(fd, "\t Actual: connect(%s, %d)"); + print_local_ip_port(fd, " from (%s, %d)\n"); + + return 0; +err: + close(fd); + return -1; +} + +static void print_test_case_num(int domain, int type) +{ + static int test_num; + + printf("Test case #%d (%s/%s):\n", ++test_num, + (domain == AF_INET ? "IPv4" : + domain == AF_INET6 ? "IPv6" : + "unknown_domain"), + (type == SOCK_STREAM ? "TCP" : + type == SOCK_DGRAM ? "UDP" : + "unknown_type")); +} + +static int run_test_case(int domain, int type, const char *ip, + unsigned short port) +{ + struct sockaddr_storage addr; + socklen_t addr_len = sizeof(addr); + int servfd = -1; + int err = 0; + + print_test_case_num(domain, type); + + if (mk_sockaddr(domain, ip, port, (struct sockaddr *)&addr, + addr_len) == -1) + return -1; + + printf("\tRequested: bind(%s, %d) ..\n", ip, port); + servfd = start_server(type, &addr, addr_len); + if (servfd == -1) + goto err; + + printf("\tRequested: connect(%s, %d) from (*, *) ..\n", ip, port); + if (connect_to_server(type, &addr, addr_len)) + goto err; + + goto out; +err: + err = -1; +out: + close(servfd); + return err; +} + +static void close_progs_fds(struct program *progs, size_t prog_cnt) +{ + size_t i; + + for (i = 0; i < prog_cnt; ++i) { + close(progs[i].fd); + progs[i].fd = -1; + } +} + +static int load_and_attach_progs(int cgfd, struct program *progs, + size_t prog_cnt) +{ + size_t i; + + for (i = 0; i < prog_cnt; ++i) { + printf("Load %s with invalid type (can pollute stderr) ", + progs[i].name); + fflush(stdout); + progs[i].fd = progs[i].loadfn(progs[i].invalid_type, NULL); + if (progs[i].fd != -1) { + log_err("Load with invalid type accepted for %s", + progs[i].name); + goto err; + } + printf("... REJECTED\n"); + + printf("Load %s with valid type", progs[i].name); + progs[i].fd = progs[i].loadfn(progs[i].type, progs[i].name); + if (progs[i].fd == -1) { + log_err("Failed to load program %s", progs[i].name); + goto err; + } + printf(" ... 
OK\n"); + + printf("Attach %s with invalid type", progs[i].name); + if (bpf_prog_attach(progs[i].fd, cgfd, progs[i].invalid_type, + BPF_F_ALLOW_OVERRIDE) != -1) { + log_err("Attach with invalid type accepted for %s", + progs[i].name); + goto err; + } + printf(" ... REJECTED\n"); + + printf("Attach %s with valid type", progs[i].name); + if (bpf_prog_attach(progs[i].fd, cgfd, progs[i].type, + BPF_F_ALLOW_OVERRIDE) == -1) { + log_err("Failed to attach program %s", progs[i].name); + goto err; + } + printf(" ... OK\n"); + } + + return 0; +err: + close_progs_fds(progs, prog_cnt); + return -1; +} + +static int run_domain_test(int domain, int cgfd, struct program *progs, + size_t prog_cnt, const char *ip, unsigned short port) +{ + int err = 0; + + if (load_and_attach_progs(cgfd, progs, prog_cnt) == -1) + goto err; + + if (run_test_case(domain, SOCK_STREAM, ip, port) == -1) + goto err; + + if (run_test_case(domain, SOCK_DGRAM, ip, port) == -1) + goto err; + + goto out; +err: + err = -1; +out: + close_progs_fds(progs, prog_cnt); + return err; +} + +static int run_test(void) +{ + size_t inet6_prog_cnt; + size_t inet_prog_cnt; + int cgfd = -1; + int err = 0; + + struct program inet6_progs[] = { + {BPF_CGROUP_INET6_BIND, bind6_prog_load, -1, "bind6", + BPF_CGROUP_INET4_BIND}, + {BPF_CGROUP_INET6_CONNECT, connect6_prog_load, -1, "connect6", + BPF_CGROUP_INET4_CONNECT}, + }; + inet6_prog_cnt = sizeof(inet6_progs) / sizeof(struct program); + + struct program inet_progs[] = { + {BPF_CGROUP_INET4_BIND, bind4_prog_load, -1, "bind4", + BPF_CGROUP_INET6_BIND}, + {BPF_CGROUP_INET4_CONNECT, connect4_prog_load, -1, "connect4", + BPF_CGROUP_INET6_CONNECT}, + }; + inet_prog_cnt = sizeof(inet_progs) / sizeof(struct program); + + if (setup_cgroup_environment()) + goto err; + + cgfd = create_and_get_cgroup(CG_PATH); + if (!cgfd) + goto err; + + if (join_cgroup(CG_PATH)) + goto err; + + if (run_domain_test(AF_INET, cgfd, inet_progs, inet_prog_cnt, SERV4_IP, + SERV4_PORT) == -1) + goto err; + + if (run_domain_test(AF_INET6, cgfd, inet6_progs, inet6_prog_cnt, + SERV6_IP, SERV6_PORT) == -1) + goto err; + + goto out; +err: + err = -1; +out: + close(cgfd); + cleanup_cgroup_environment(); + printf(err ? "### FAIL\n" : "### SUCCESS\n"); + return err; +} + +int main(int argc, char **argv) +{ + if (argc < 2) { + fprintf(stderr, + "%s has to be run via %s.sh. Skip direct run.\n", + argv[0], argv[0]); + exit(0); + } + return run_test(); +} diff --git a/tools/testing/selftests/bpf/test_sock_addr.sh b/tools/testing/selftests/bpf/test_sock_addr.sh new file mode 100755 index 000000000000..c6e1dcf992c4 --- /dev/null +++ b/tools/testing/selftests/bpf/test_sock_addr.sh @@ -0,0 +1,57 @@ +#!/bin/sh + +set -eu + +ping_once() +{ + ping -q -c 1 -W 1 ${1%%/*} >/dev/null 2>&1 +} + +wait_for_ip() +{ + local _i + echo -n "Wait for testing IPv4/IPv6 to become available " + for _i in $(seq ${MAX_PING_TRIES}); do + echo -n "." + if ping_once ${TEST_IPv4} && ping_once ${TEST_IPv6}; then + echo " OK" + return + fi + done + echo 1>&2 "ERROR: Timeout waiting for test IP to become available." + exit 1 +} + +setup() +{ + # Create testing interfaces not to interfere with current environment. 
+ ip link add dev ${TEST_IF} type veth peer name ${TEST_IF_PEER} + ip link set ${TEST_IF} up + ip link set ${TEST_IF_PEER} up + + ip -4 addr add ${TEST_IPv4} dev ${TEST_IF} + ip -6 addr add ${TEST_IPv6} dev ${TEST_IF} + wait_for_ip +} + +cleanup() +{ + ip link del ${TEST_IF} 2>/dev/null || : + ip link del ${TEST_IF_PEER} 2>/dev/null || : +} + +main() +{ + trap cleanup EXIT 2 3 6 15 + setup + ./test_sock_addr setup_done +} + +BASENAME=$(basename $0 .sh) +TEST_IF="${BASENAME}1" +TEST_IF_PEER="${BASENAME}2" +TEST_IPv4="127.0.0.4/8" +TEST_IPv6="::6/128" +MAX_PING_TRIES=5 + +main
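A minimal usage sketch of the bpf_load_program_xattr() interface introduced above (not part of the patch: the one-instruction "allow all" program is illustrative, and the BPF_MOV64_IMM()/BPF_EXIT_INSN() macros are assumed to be available via the selftests' linux/filter.h):

	#include <string.h>
	#include <linux/filter.h>
	#include <bpf/bpf.h>

	static char log_buf[BPF_LOG_BUF_SIZE];

	static int load_connect4_allow_all(void)
	{
		/* "return 1" permits every connect(2) in the attached cgroup. */
		struct bpf_insn insns[] = {
			BPF_MOV64_IMM(BPF_REG_0, 1),
			BPF_EXIT_INSN(),
		};
		struct bpf_load_program_attr attr;

		memset(&attr, 0, sizeof(attr));
		attr.prog_type = BPF_PROG_TYPE_CGROUP_SOCK_ADDR;
		/* Must match the type later passed to bpf_prog_attach();
		 * the attach-type mismatch tests above rely on the kernel
		 * rejecting any other combination.
		 */
		attr.expected_attach_type = BPF_CGROUP_INET4_CONNECT;
		attr.insns = insns;
		attr.insns_cnt = sizeof(insns) / sizeof(insns[0]);
		attr.license = "GPL";

		return bpf_load_program_xattr(&attr, log_buf, BPF_LOG_BUF_SIZE);
	}

On success, the returned fd would be attached with bpf_prog_attach(fd, cgfd, BPF_CGROUP_INET4_CONNECT, 0), mirroring the program table in test_sock_addr.c.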