summaryrefslogtreecommitdiff
path: root/net/packet/af_packet.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/packet/af_packet.c')
-rw-r--r--net/packet/af_packet.c156
1 files changed, 100 insertions, 56 deletions
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 8298e676f5a0..91cb1d71f018 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -61,6 +61,7 @@
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/slab.h>
+#include <linux/vmalloc.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
@@ -163,8 +164,13 @@ struct packet_mreq_max {
static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
int closing, int tx_ring);
+#define PGV_FROM_VMALLOC 1
+struct pgv {
+ char *buffer;
+};
+
struct packet_ring_buffer {
- char **pg_vec;
+ struct pgv *pg_vec;
unsigned int head;
unsigned int frames_per_block;
unsigned int frame_size;
@@ -217,6 +223,13 @@ struct packet_skb_cb {
#define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb))
+static inline __pure struct page *pgv_to_page(void *addr)
+{
+ if (is_vmalloc_addr(addr))
+ return vmalloc_to_page(addr);
+ return virt_to_page(addr);
+}
+
static void __packet_set_status(struct packet_sock *po, void *frame, int status)
{
union {
@@ -229,11 +242,11 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status)
switch (po->tp_version) {
case TPACKET_V1:
h.h1->tp_status = status;
- flush_dcache_page(virt_to_page(&h.h1->tp_status));
+ flush_dcache_page(pgv_to_page(&h.h1->tp_status));
break;
case TPACKET_V2:
h.h2->tp_status = status;
- flush_dcache_page(virt_to_page(&h.h2->tp_status));
+ flush_dcache_page(pgv_to_page(&h.h2->tp_status));
break;
default:
pr_err("TPACKET version not supported\n");
@@ -256,10 +269,10 @@ static int __packet_get_status(struct packet_sock *po, void *frame)
h.raw = frame;
switch (po->tp_version) {
case TPACKET_V1:
- flush_dcache_page(virt_to_page(&h.h1->tp_status));
+ flush_dcache_page(pgv_to_page(&h.h1->tp_status));
return h.h1->tp_status;
case TPACKET_V2:
- flush_dcache_page(virt_to_page(&h.h2->tp_status));
+ flush_dcache_page(pgv_to_page(&h.h2->tp_status));
return h.h2->tp_status;
default:
pr_err("TPACKET version not supported\n");
@@ -283,7 +296,8 @@ static void *packet_lookup_frame(struct packet_sock *po,
pg_vec_pos = position / rb->frames_per_block;
frame_offset = position % rb->frames_per_block;
- h.raw = rb->pg_vec[pg_vec_pos] + (frame_offset * rb->frame_size);
+ h.raw = rb->pg_vec[pg_vec_pos].buffer +
+ (frame_offset * rb->frame_size);
if (status != __packet_get_status(po, h.raw))
return NULL;
@@ -503,7 +517,8 @@ out_free:
return err;
}
-static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
+static inline unsigned int run_filter(const struct sk_buff *skb,
+ const struct sock *sk,
unsigned int res)
{
struct sk_filter *filter;
@@ -511,22 +526,22 @@ static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
rcu_read_lock_bh();
filter = rcu_dereference_bh(sk->sk_filter);
if (filter != NULL)
- res = sk_run_filter(skb, filter->insns, filter->len);
+ res = sk_run_filter(skb, filter->insns);
rcu_read_unlock_bh();
return res;
}
/*
- This function makes lazy skb cloning in hope that most of packets
- are discarded by BPF.
-
- Note tricky part: we DO mangle shared skb! skb->data, skb->len
- and skb->cb are mangled. It works because (and until) packets
- falling here are owned by current CPU. Output packets are cloned
- by dev_queue_xmit_nit(), input packets are processed by net_bh
- sequencially, so that if we return skb to original state on exit,
- we will not harm anyone.
+ * This function makes lazy skb cloning in hope that most of packets
+ * are discarded by BPF.
+ *
+ * Note tricky part: we DO mangle shared skb! skb->data, skb->len
+ * and skb->cb are mangled. It works because (and until) packets
+ * falling here are owned by current CPU. Output packets are cloned
+ * by dev_queue_xmit_nit(), input packets are processed by net_bh
+ * sequencially, so that if we return skb to original state on exit,
+ * we will not harm anyone.
*/
static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
@@ -552,11 +567,11 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
if (dev->header_ops) {
/* The device has an explicit notion of ll header,
- exported to higher levels.
-
- Otherwise, the device hides datails of it frame
- structure, so that corresponding packet head
- never delivered to user.
+ * exported to higher levels.
+ *
+ * Otherwise, the device hides details of its frame
+ * structure, so that corresponding packet head is
+ * never delivered to user.
*/
if (sk->sk_type != SOCK_DGRAM)
skb_push(skb, skb->data - skb_mac_header(skb));
@@ -791,17 +806,15 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
__packet_set_status(po, h.raw, status);
smp_mb();
+#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
{
- struct page *p_start, *p_end;
- u8 *h_end = h.raw + macoff + snaplen - 1;
-
- p_start = virt_to_page(h.raw);
- p_end = virt_to_page(h_end);
- while (p_start <= p_end) {
- flush_dcache_page(p_start);
- p_start++;
- }
+ u8 *start, *end;
+
+ end = (u8 *)PAGE_ALIGN((unsigned long)h.raw + macoff + snaplen);
+ for (start = h.raw; start < end; start += PAGE_SIZE)
+ flush_dcache_page(pgv_to_page(start));
}
+#endif
sk->sk_data_ready(sk, 0);
@@ -907,7 +920,6 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
}
err = -EFAULT;
- page = virt_to_page(data);
offset = offset_in_page(data);
len_max = PAGE_SIZE - offset;
len = ((to_write > len_max) ? len_max : to_write);
@@ -926,11 +938,11 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
return -EFAULT;
}
+ page = pgv_to_page(data);
+ data += len;
flush_dcache_page(page);
get_page(page);
- skb_fill_page_desc(skb,
- nr_frags,
- page++, offset, len);
+ skb_fill_page_desc(skb, nr_frags, page, offset, len);
to_write -= len;
offset = 0;
len_max = PAGE_SIZE;
@@ -1638,8 +1650,7 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
if (skb->ip_summed == CHECKSUM_PARTIAL) {
vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
- vnet_hdr.csum_start = skb->csum_start -
- skb_headroom(skb);
+ vnet_hdr.csum_start = skb_checksum_start_offset(skb);
vnet_hdr.csum_offset = skb->csum_offset;
} /* else everything is zero */
@@ -2325,37 +2336,70 @@ static const struct vm_operations_struct packet_mmap_ops = {
.close = packet_mm_close,
};
-static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
+static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
+ unsigned int len)
{
int i;
for (i = 0; i < len; i++) {
- if (likely(pg_vec[i]))
- free_pages((unsigned long) pg_vec[i], order);
+ if (likely(pg_vec[i].buffer)) {
+ if (is_vmalloc_addr(pg_vec[i].buffer))
+ vfree(pg_vec[i].buffer);
+ else
+ free_pages((unsigned long)pg_vec[i].buffer,
+ order);
+ pg_vec[i].buffer = NULL;
+ }
}
kfree(pg_vec);
}
static inline char *alloc_one_pg_vec_page(unsigned long order)
{
- gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN;
+ char *buffer = NULL;
+ gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
+ __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
- return (char *) __get_free_pages(gfp_flags, order);
+ buffer = (char *) __get_free_pages(gfp_flags, order);
+
+ if (buffer)
+ return buffer;
+
+ /*
+ * __get_free_pages failed, fall back to vmalloc
+ */
+ buffer = vzalloc((1 << order) * PAGE_SIZE);
+
+ if (buffer)
+ return buffer;
+
+ /*
+ * vmalloc failed, lets dig into swap here
+ */
+ gfp_flags &= ~__GFP_NORETRY;
+ buffer = (char *)__get_free_pages(gfp_flags, order);
+ if (buffer)
+ return buffer;
+
+ /*
+ * complete and utter failure
+ */
+ return NULL;
}
-static char **alloc_pg_vec(struct tpacket_req *req, int order)
+static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
{
unsigned int block_nr = req->tp_block_nr;
- char **pg_vec;
+ struct pgv *pg_vec;
int i;
- pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
+ pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
if (unlikely(!pg_vec))
goto out;
for (i = 0; i < block_nr; i++) {
- pg_vec[i] = alloc_one_pg_vec_page(order);
- if (unlikely(!pg_vec[i]))
+ pg_vec[i].buffer = alloc_one_pg_vec_page(order);
+ if (unlikely(!pg_vec[i].buffer))
goto out_free_pgvec;
}
@@ -2371,7 +2415,7 @@ out_free_pgvec:
static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
int closing, int tx_ring)
{
- char **pg_vec = NULL;
+ struct pgv *pg_vec = NULL;
struct packet_sock *po = pkt_sk(sk);
int was_running, order = 0;
struct packet_ring_buffer *rb;
@@ -2456,22 +2500,20 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
mutex_lock(&po->pg_vec_lock);
if (closing || atomic_read(&po->mapped) == 0) {
err = 0;
-#define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
spin_lock_bh(&rb_queue->lock);
- pg_vec = XC(rb->pg_vec, pg_vec);
+ swap(rb->pg_vec, pg_vec);
rb->frame_max = (req->tp_frame_nr - 1);
rb->head = 0;
rb->frame_size = req->tp_frame_size;
spin_unlock_bh(&rb_queue->lock);
- order = XC(rb->pg_vec_order, order);
- req->tp_block_nr = XC(rb->pg_vec_len, req->tp_block_nr);
+ swap(rb->pg_vec_order, order);
+ swap(rb->pg_vec_len, req->tp_block_nr);
rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
po->prot_hook.func = (po->rx_ring.pg_vec) ?
tpacket_rcv : packet_rcv;
skb_queue_purge(rb_queue);
-#undef XC
if (atomic_read(&po->mapped))
pr_err("packet_mmap: vma is busy: %d\n",
atomic_read(&po->mapped));
@@ -2533,15 +2575,17 @@ static int packet_mmap(struct file *file, struct socket *sock,
continue;
for (i = 0; i < rb->pg_vec_len; i++) {
- struct page *page = virt_to_page(rb->pg_vec[i]);
+ struct page *page;
+ void *kaddr = rb->pg_vec[i].buffer;
int pg_num;
- for (pg_num = 0; pg_num < rb->pg_vec_pages;
- pg_num++, page++) {
+ for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
+ page = pgv_to_page(kaddr);
err = vm_insert_page(vma, start, page);
if (unlikely(err))
goto out;
start += PAGE_SIZE;
+ kaddr += PAGE_SIZE;
}
}
}