diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 1da831f352ef..88912a29bd86 100755
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -412,6 +412,7 @@ struct bpf_prog *bpf_prog_get_type_path(const char *name, enum bpf_prog_type typ
 
 /* Map specifics */
 struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
+struct net_device *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key);
 void __dev_map_insert_ctx(struct bpf_map *map, u32 index);
 void __dev_map_flush(struct bpf_map *map);
 
@@ -484,6 +485,12 @@ static inline struct net_device *__dev_map_lookup_elem(struct bpf_map *map,
 	return NULL;
 }
 
+static inline struct net_device *__dev_map_hash_lookup_elem(struct bpf_map *map,
+							     u32 key)
+{
+	return NULL;
+}
+
 static inline void __dev_map_insert_ctx(struct bpf_map *map, u32 index)
 {
 }
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 9d91c9575bb4..43861ad3299a 100755
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -40,6 +40,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, htab_of_maps_map_ops)
 #ifdef CONFIG_NET
 BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP, dev_map_ops)
+BPF_MAP_TYPE(BPF_MAP_TYPE_DEVMAP_HASH, dev_map_hash_ops)
 #ifdef CONFIG_STREAM_PARSER
 BPF_MAP_TYPE(BPF_MAP_TYPE_SOCKMAP, sock_map_ops)
 #endif
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 438fa4ffbe38..91b4e9d8d059 100755
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -114,6 +114,7 @@ enum bpf_map_type {
 	BPF_MAP_TYPE_HASH_OF_MAPS,
 	BPF_MAP_TYPE_DEVMAP,
 	BPF_MAP_TYPE_SOCKMAP,
+	BPF_MAP_TYPE_DEVMAP_HASH = 25,
 };
 
 enum bpf_prog_type {
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 2f36791368e4..54a77df54fe3 100755
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -46,6 +46,12 @@
  * notifier hook walks the map we know that new dev references can not be
  * added by the user because core infrastructure ensures dev_get_by_index()
  * calls will fail at this point.
+ *
+ * The devmap_hash type is a map type which interprets keys as ifindexes and
+ * indexes these using a hashmap. This allows maps that use ifindex as key to be
+ * densely packed instead of having holes in the lookup array for unused
+ * ifindexes. The setup and packet enqueue/send code is shared between the two
+ * types of devmap; only the lookup and insertion is different.
  */
 #include <linux/bpf.h>
 #include <linux/filter.h>
@@ -55,6 +61,7 @@
 
 struct bpf_dtab_netdev {
 	struct net_device *dev;
+	struct hlist_node index_hlist;
 	struct bpf_dtab *dtab;
 	unsigned int bit;
 	struct rcu_head rcu;
@@ -65,11 +72,30 @@ struct bpf_dtab {
 	struct bpf_dtab_netdev **netdev_map;
 	unsigned long __percpu *flush_needed;
 	struct list_head list;
+
+	/* these are only used for DEVMAP_HASH type maps */
+	struct hlist_head *dev_index_head;
+	spinlock_t index_lock;
+	unsigned int items;
+	u32 n_buckets;
 };
 
 static DEFINE_SPINLOCK(dev_map_lock);
 static LIST_HEAD(dev_map_list);
 
+static struct hlist_head *dev_map_create_hash(unsigned int entries)
+{
+	int i;
+	struct hlist_head *hash;
+
+	hash = kmalloc_array(entries, sizeof(*hash), GFP_KERNEL);
+	if (hash != NULL)
+		for (i = 0; i < entries; i++)
+			INIT_HLIST_HEAD(&hash[i]);
+
+	return hash;
+}
+
 static u64 dev_map_bitmap_size(const union bpf_attr *attr)
 {
 	return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long);
@@ -109,6 +135,16 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 
 	dtab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
 
+	if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
+		dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries);
+
+		if (!dtab->n_buckets) { /* Overflow check */
+			err = -EINVAL;
+			goto free_dtab;
+		}
+		cost += sizeof(struct hlist_head) * dtab->n_buckets;
+	}
+
 	/* if map size is larger than memlock limit, reject it early */
 	err = bpf_map_precharge_memlock(dtab->map.pages);
 	if (err)
@@ -129,13 +165,24 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 	if (!dtab->netdev_map)
 		goto free_dtab;
 
+	if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
+		dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets);
+		if (!dtab->dev_index_head)
+			goto free_map_area;
+
+		spin_lock_init(&dtab->index_lock);
+	}
+
 	spin_lock(&dev_map_lock);
 	list_add_tail_rcu(&dtab->list, &dev_map_list);
 	spin_unlock(&dev_map_lock);
 
 	return &dtab->map;
+free_map_area:
+	bpf_map_area_free(dtab->netdev_map);
 free_dtab:
 	free_percpu(dtab->flush_needed);
+	kfree(dtab->dev_index_head);
 	kfree(dtab);
 	return ERR_PTR(err);
 }
@@ -187,6 +234,7 @@ static void dev_map_free(struct bpf_map *map)
 
 	free_percpu(dtab->flush_needed);
 	bpf_map_area_free(dtab->netdev_map);
+	kfree(dtab->dev_index_head);
 	kfree(dtab);
 }
 
@@ -207,6 +255,77 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 	return 0;
 }
 
+static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab,
+						    int idx)
+{
+	return &dtab->dev_index_head[idx & (dtab->n_buckets - 1)];
+}
+
+static struct bpf_dtab_netdev *__dev_map_hash_lookup_elem_dtab(struct bpf_map *map, u32 key)
+{
+	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+	struct hlist_head *head = dev_map_index_hash(dtab, key);
+	struct bpf_dtab_netdev *dev;
+
+	hlist_for_each_entry_rcu(dev, head, index_hlist)
+		if (dev->bit == key)
+			return dev;
+
+	return NULL;
+}
+
+struct net_device *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key)
+{
+	struct bpf_dtab_netdev *dev = __dev_map_hash_lookup_elem_dtab(map, key);
+
+	return dev ? dev->dev : NULL;
+}
+
+static int dev_map_hash_get_next_key(struct bpf_map *map, void *key,
+				     void *next_key)
+{
+	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+	u32 idx, *next = next_key;
+	struct bpf_dtab_netdev *dev, *next_dev;
+	struct hlist_head *head;
+	int i = 0;
+
+	if (!key)
+		goto find_first;
+
+	idx = *(u32 *)key;
+
+	dev = __dev_map_hash_lookup_elem_dtab(map, idx);
+	if (!dev)
+		goto find_first;
+
+	next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&dev->index_hlist)),
+				    struct bpf_dtab_netdev, index_hlist);
+
+	if (next_dev) {
+		*next = next_dev->bit;
+		return 0;
+	}
+
+	i = idx & (dtab->n_buckets - 1);
+	i++;
+
+ find_first:
+	for (; i < dtab->n_buckets; i++) {
+		head = dev_map_index_hash(dtab, i);
+
+		next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
+					    struct bpf_dtab_netdev,
+					    index_hlist);
+		if (next_dev) {
+			*next = next_dev->bit;
+			return 0;
+		}
+	}
+
+	return -ENOENT;
+}
+
 void __dev_map_insert_ctx(struct bpf_map *map, u32 bit)
 {
 	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
@@ -268,6 +387,13 @@ static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
 	return dev ? &dev->ifindex : NULL;
 }
 
+static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key)
+{
+	struct net_device *dev = __dev_map_hash_lookup_elem(map, *(u32 *)key);
+
+	return dev ? &dev->ifindex : NULL;
+}
+
 static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
 {
 	if (dev->dev->netdev_ops->ndo_xdp_flush) {
@@ -317,6 +443,28 @@ static int dev_map_delete_elem(struct bpf_map *map, void *key)
 	return 0;
 }
 
+static int dev_map_hash_delete_elem(struct bpf_map *map, void *key)
+{
+	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+	struct bpf_dtab_netdev *old_dev;
+	int k = *(u32 *)key;
+	unsigned long flags;
+	int ret = -ENOENT;
+
+	spin_lock_irqsave(&dtab->index_lock, flags);
+
+	old_dev = __dev_map_hash_lookup_elem_dtab(map, k);
+	if (old_dev) {
+		dtab->items--;
+		hlist_del_init_rcu(&old_dev->index_hlist);
+		call_rcu(&old_dev->rcu, __dev_map_entry_free);
+		ret = 0;
+	}
+	spin_unlock_irqrestore(&dtab->index_lock, flags);
+
+	return ret;
+}
+
 static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
 						    struct bpf_dtab *dtab,
 						    u32 ifindex,
@@ -376,6 +524,57 @@ static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
 
 	return 0;
 }
+
+static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map,
+				      void *key, void *value, u64 map_flags)
+{
+	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+	struct bpf_dtab_netdev *dev, *old_dev;
+	u32 ifindex = *(u32 *)value;
+	u32 idx = *(u32 *)key;
+	unsigned long flags;
+
+	if (unlikely(map_flags > BPF_EXIST || !ifindex))
+		return -EINVAL;
+
+	old_dev = __dev_map_hash_lookup_elem_dtab(map, idx);
+	if (old_dev && (map_flags & BPF_NOEXIST))
+		return -EEXIST;
+
+	dev = __dev_map_alloc_node(net, dtab, ifindex, idx);
+	if (IS_ERR(dev))
+		return PTR_ERR(dev);
+
+	spin_lock_irqsave(&dtab->index_lock, flags);
+
+	if (old_dev) {
+		hlist_del_rcu(&old_dev->index_hlist);
+	} else {
+		if (dtab->items >= dtab->map.max_entries) {
+			spin_unlock_irqrestore(&dtab->index_lock, flags);
+			call_rcu(&dev->rcu, __dev_map_entry_free);
+			return -E2BIG;
+		}
+		dtab->items++;
+	}
+
+	hlist_add_head_rcu(&dev->index_hlist,
+			   dev_map_index_hash(dtab, idx));
+	spin_unlock_irqrestore(&dtab->index_lock, flags);
+
+	if (old_dev)
+		call_rcu(&old_dev->rcu, __dev_map_entry_free);
+
+	return 0;
+}
+
+static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
+				    u64 map_flags)
+{
+	return __dev_map_hash_update_elem(current->nsproxy->net_ns,
+					  map, key, value, map_flags);
+}
+
 const struct bpf_map_ops dev_map_ops = {
 	.map_alloc = dev_map_alloc,
 	.map_free = dev_map_free,
@@ -385,6 +584,15 @@ const struct bpf_map_ops dev_map_ops = {
 	.map_delete_elem = dev_map_delete_elem,
 };
 
+const struct bpf_map_ops dev_map_hash_ops = {
+	.map_alloc = dev_map_alloc,
+	.map_free = dev_map_free,
+	.map_get_next_key = dev_map_hash_get_next_key,
+	.map_lookup_elem = dev_map_hash_lookup_elem,
+	.map_update_elem = dev_map_hash_update_elem,
+	.map_delete_elem = dev_map_hash_delete_elem,
+};
+
 static int dev_map_notification(struct notifier_block *notifier,
 				ulong event, void *ptr)
 {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 74d1a379ca40..6b3706e58b66 100755
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1768,6 +1768,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 	 * for now.
 	 */
 	case BPF_MAP_TYPE_DEVMAP:
+	case BPF_MAP_TYPE_DEVMAP_HASH:
 		if (func_id != BPF_FUNC_redirect_map)
 			goto error;
 		break;
@@ -1807,7 +1808,8 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 			goto error;
 		break;
 	case BPF_FUNC_redirect_map:
-		if (map->map_type != BPF_MAP_TYPE_DEVMAP)
+		if (map->map_type != BPF_MAP_TYPE_DEVMAP &&
+		    map->map_type != BPF_MAP_TYPE_DEVMAP_HASH)
 			goto error;
 		break;
 	case BPF_FUNC_sk_redirect_map:
diff --git a/net/core/filter.c b/net/core/filter.c
index 76b9e4f11d08..bd3bcdeeda49 100755
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2594,7 +2594,10 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
 		goto err;
 	}
-	fwd = __dev_map_lookup_elem(map, index);
+	if (map->map_type == BPF_MAP_TYPE_DEVMAP)
+		fwd = __dev_map_lookup_elem(map, index);
+	else if (map->map_type == BPF_MAP_TYPE_DEVMAP_HASH)
+		fwd = __dev_map_hash_lookup_elem(map, index);
 	if (!fwd) {
 		err = -EINVAL;
 		goto err;
 	}
@@ -2665,7 +2668,10 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
 			map = NULL;
 			goto err;
 		}
-		fwd = __dev_map_lookup_elem(map, index);
+		if (map->map_type == BPF_MAP_TYPE_DEVMAP)
+			fwd = __dev_map_lookup_elem(map, index);
+		else if (map->map_type == BPF_MAP_TYPE_DEVMAP_HASH)
+			fwd = __dev_map_hash_lookup_elem(map, index);
 	} else {
 		fwd = dev_get_by_index_rcu(dev_net(dev), index);
 	}
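
Usage sketch (not part of the patch): a minimal XDP program that uses the new BPF_MAP_TYPE_DEVMAP_HASH as the target of bpf_redirect_map(), keying the map directly by ifindex as the devmap.c comment above describes. The map name, section name, and the key value 42 are illustrative only, and the legacy struct bpf_map_def / bpf_helpers.h style is assumed since this backport targets a pre-BTF tree; the program also assumes it is built against UAPI headers that already contain the BPF_MAP_TYPE_DEVMAP_HASH value added above.

/* xdp_redirect_hash_kern.c - illustrative sketch, not part of the patch */
#include <linux/bpf.h>
#include "bpf_helpers.h"	/* legacy samples/selftests helper header (assumption) */

struct bpf_map_def SEC("maps") tx_port = {
	.type        = BPF_MAP_TYPE_DEVMAP_HASH,
	.key_size    = sizeof(__u32),	/* keyed directly by ifindex */
	.value_size  = sizeof(__u32),	/* value is the target device's ifindex */
	.max_entries = 8,		/* bounds entry count, not the highest ifindex */
};

SEC("xdp_redirect_hash")
int xdp_redirect_hash_prog(struct xdp_md *ctx)
{
	/* Redirect to whatever device userspace stored under key 42 (hypothetical
	 * key). With the array-based devmap, a sparse key like 42 would force a
	 * 43-slot array; the hash-based devmap only stores the inserted entry. */
	return bpf_redirect_map(&tx_port, 42, 0);
}

char _license[] SEC("license") = "GPL";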
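A companion userspace sketch (also illustrative, not from the patch): create the map and populate it with bpf_map_update_elem(), where, mirroring __dev_map_hash_update_elem() above, the key is the ifindex to look up and the value is the ifindex of the device to redirect to. bpf_create_map() is the classic tools/lib/bpf wrapper around the bpf(2) syscall; "eth0" is just an example device name, and the patched UAPI header is assumed so that BPF_MAP_TYPE_DEVMAP_HASH resolves to 25 on this tree.

/* devmap_hash_user.c - illustrative sketch, not part of the patch */
#include <net/if.h>
#include <linux/bpf.h>
#include <bpf/bpf.h>

int main(void)
{
	__u32 ifindex = if_nametoindex("eth0");	/* example interface */
	int map_fd;

	if (!ifindex)
		return 1;

	/* Key and value are both 4-byte ifindexes, as dev_map_hash_update_elem()
	 * expects; max_entries only limits how many devices can be inserted. */
	map_fd = bpf_create_map(BPF_MAP_TYPE_DEVMAP_HASH, sizeof(__u32),
				sizeof(__u32), 8, 0);
	if (map_fd < 0)
		return 1;

	/* Store the device under its own ifindex so XDP programs can redirect
	 * to it by ifindex without maintaining a dense index space. */
	if (bpf_map_update_elem(map_fd, &ifindex, &ifindex, BPF_ANY))
		return 1;

	return 0;
}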