mirror of https://github.com/torvalds/linux.git
tcp: Update bind bucket state on port release
Today, once an inet_bind_bucket enters a state where fastreuse >= 0 or
fastreuseport >= 0, after a socket has been explicitly bound to a port, it
remains in that state until all sockets are removed and the bucket is
destroyed. While in this state, the bucket is skipped during ephemeral
port selection in connect().

For applications using a reduced ephemeral port range (the
IP_LOCAL_PORT_RANGE socket option), this can lead to faster port
exhaustion, since the blocked buckets are excluded from reuse.

Why the bucket state is not updated on port release is unclear. It may
have been a performance trade-off to avoid scanning the bucket owners, or
simply an oversight.

Fix it by recalculating the bucket state when a socket releases a port. To
limit the overhead, each inet_bind2_bucket now stores its own
(fastreuse, fastreuseport) state. On port release, only the relevant
port-addr bucket is scanned, and the overall bind bucket state is derived
from the per-bucket states.

Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20250917-update-bind-bucket-state-on-unhash-v5-1-57168b661b47@cloudflare.com
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
commit d57f4b8749
parent 3afb106f3f
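For context, the reduced ephemeral port range mentioned in the message is
configured per socket with IP_LOCAL_PORT_RANGE. Below is a minimal
userspace sketch, not part of this patch, that sets up such a
configuration. It assumes the option's documented u32 packing (low bound
in the lower 16 bits, high bound in the upper 16) and guards the constant
for older libc headers.

/* Minimal sketch, not part of this patch: connect() restricted to a
 * narrow ephemeral port range via IP_LOCAL_PORT_RANGE (Linux 6.3+).
 */
#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

#ifndef IP_LOCAL_PORT_RANGE
#define IP_LOCAL_PORT_RANGE 51	/* value from include/uapi/linux/in.h */
#endif

int main(void)
{
	/* Assumed packing: high bound in upper 16 bits, low bound in lower 16. */
	unsigned int range = (60100u << 16) | 60000u;	/* ports 60000..60100 */
	struct sockaddr_in dst = {
		.sin_family = AF_INET,
		.sin_port = htons(80),
	};
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0 || inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr) != 1)
		return 1;
	if (setsockopt(fd, IPPROTO_IP, IP_LOCAL_PORT_RANGE, &range, sizeof(range)))
		perror("setsockopt(IP_LOCAL_PORT_RANGE)");

	/* Every connect() must pick a source port from the 101-port window.
	 * Bind buckets stuck with fastreuse/fastreuseport >= 0 shrink that
	 * window; with this patch they can return to the -1 state once the
	 * explicitly bound sockets release the port. */
	if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)))
		perror("connect");

	close(fd);
	return 0;
}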
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -316,8 +316,9 @@ int inet_csk_listen_start(struct sock *sk);
 void inet_csk_listen_stop(struct sock *sk);
 
 /* update the fast reuse flag when adding a socket */
-void inet_csk_update_fastreuse(struct inet_bind_bucket *tb,
-			       struct sock *sk);
+void inet_csk_update_fastreuse(const struct sock *sk,
+			       struct inet_bind_bucket *tb,
+			       struct inet_bind2_bucket *tb2);
 
 struct dst_entry *inet_csk_update_pmtu(struct sock *sk, u32 mtu);
 
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -108,6 +108,8 @@ struct inet_bind2_bucket {
 	struct hlist_node	bhash_node;
 	/* List of sockets hashed to this bucket */
 	struct hlist_head	owners;
+	signed char		fastreuse;
+	signed char		fastreuseport;
 };
 
 static inline struct net *ib_net(const struct inet_bind_bucket *ib)
--- a/include/net/inet_timewait_sock.h
+++ b/include/net/inet_timewait_sock.h
@@ -70,7 +70,8 @@ struct inet_timewait_sock {
 	unsigned int		tw_transparent  : 1,
 				tw_flowlabel	: 20,
 				tw_usec_ts	: 1,
-				tw_pad		: 2,	/* 2 bits hole */
+				tw_connect_bind	: 1,
+				tw_pad		: 1,	/* 1 bit hole */
 				tw_tos		: 8;
 	u32			tw_txhash;
 	u32			tw_priority;
@ -1494,6 +1494,10 @@ static inline int __sk_prot_rehash(struct sock *sk)
|
||||||
|
|
||||||
#define SOCK_BINDADDR_LOCK 4
|
#define SOCK_BINDADDR_LOCK 4
|
||||||
#define SOCK_BINDPORT_LOCK 8
|
#define SOCK_BINDPORT_LOCK 8
|
||||||
|
/**
|
||||||
|
* define SOCK_CONNECT_BIND - &sock->sk_userlocks flag for auto-bind at connect() time
|
||||||
|
*/
|
||||||
|
#define SOCK_CONNECT_BIND 16
|
||||||
|
|
||||||
struct socket_alloc {
|
struct socket_alloc {
|
||||||
struct socket socket;
|
struct socket socket;
|
||||||
|
|
|
||||||
|
|
@ -423,7 +423,7 @@ inet_csk_find_open_port(const struct sock *sk, struct inet_bind_bucket **tb_ret,
|
||||||
}
|
}
|
||||||
|
|
||||||
static inline int sk_reuseport_match(struct inet_bind_bucket *tb,
|
static inline int sk_reuseport_match(struct inet_bind_bucket *tb,
|
||||||
struct sock *sk)
|
const struct sock *sk)
|
||||||
{
|
{
|
||||||
if (tb->fastreuseport <= 0)
|
if (tb->fastreuseport <= 0)
|
||||||
return 0;
|
return 0;
|
||||||
|
|
@ -453,8 +453,9 @@ static inline int sk_reuseport_match(struct inet_bind_bucket *tb,
|
||||||
ipv6_only_sock(sk), true, false);
|
ipv6_only_sock(sk), true, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
void inet_csk_update_fastreuse(struct inet_bind_bucket *tb,
|
void inet_csk_update_fastreuse(const struct sock *sk,
|
||||||
struct sock *sk)
|
struct inet_bind_bucket *tb,
|
||||||
|
struct inet_bind2_bucket *tb2)
|
||||||
{
|
{
|
||||||
bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
|
bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
|
||||||
|
|
||||||
|
|
@ -501,6 +502,9 @@ void inet_csk_update_fastreuse(struct inet_bind_bucket *tb,
|
||||||
tb->fastreuseport = 0;
|
tb->fastreuseport = 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
tb2->fastreuse = tb->fastreuse;
|
||||||
|
tb2->fastreuseport = tb->fastreuseport;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Obtain a reference to a local port for the given sock,
|
/* Obtain a reference to a local port for the given sock,
|
||||||
|
|
@ -582,7 +586,7 @@ int inet_csk_get_port(struct sock *sk, unsigned short snum)
|
||||||
}
|
}
|
||||||
|
|
||||||
success:
|
success:
|
||||||
inet_csk_update_fastreuse(tb, sk);
|
inet_csk_update_fastreuse(sk, tb, tb2);
|
||||||
|
|
||||||
if (!inet_csk(sk)->icsk_bind_hash)
|
if (!inet_csk(sk)->icsk_bind_hash)
|
||||||
inet_bind_hash(sk, tb, tb2, port);
|
inet_bind_hash(sk, tb, tb2, port);
|
||||||
|
|
|
||||||
|
|
@ -58,6 +58,14 @@ static u32 sk_ehashfn(const struct sock *sk)
|
||||||
sk->sk_daddr, sk->sk_dport);
|
sk->sk_daddr, sk->sk_dport);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static bool sk_is_connect_bind(const struct sock *sk)
|
||||||
|
{
|
||||||
|
if (sk->sk_state == TCP_TIME_WAIT)
|
||||||
|
return inet_twsk(sk)->tw_connect_bind;
|
||||||
|
else
|
||||||
|
return sk->sk_userlocks & SOCK_CONNECT_BIND;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Allocate and initialize a new local port bind bucket.
|
* Allocate and initialize a new local port bind bucket.
|
||||||
* The bindhash mutex for snum's hash chain must be held here.
|
* The bindhash mutex for snum's hash chain must be held here.
|
||||||
|
|
@ -87,10 +95,22 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
|
||||||
*/
|
*/
|
||||||
void inet_bind_bucket_destroy(struct inet_bind_bucket *tb)
|
void inet_bind_bucket_destroy(struct inet_bind_bucket *tb)
|
||||||
{
|
{
|
||||||
|
const struct inet_bind2_bucket *tb2;
|
||||||
|
|
||||||
if (hlist_empty(&tb->bhash2)) {
|
if (hlist_empty(&tb->bhash2)) {
|
||||||
hlist_del_rcu(&tb->node);
|
hlist_del_rcu(&tb->node);
|
||||||
kfree_rcu(tb, rcu);
|
kfree_rcu(tb, rcu);
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (tb->fastreuse == -1 && tb->fastreuseport == -1)
|
||||||
|
return;
|
||||||
|
hlist_for_each_entry(tb2, &tb->bhash2, bhash_node) {
|
||||||
|
if (tb2->fastreuse != -1 || tb2->fastreuseport != -1)
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
tb->fastreuse = -1;
|
||||||
|
tb->fastreuseport = -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, const struct net *net,
|
bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, const struct net *net,
|
||||||
|
|
@ -121,6 +141,8 @@ static void inet_bind2_bucket_init(struct inet_bind2_bucket *tb2,
|
||||||
#else
|
#else
|
||||||
tb2->rcv_saddr = sk->sk_rcv_saddr;
|
tb2->rcv_saddr = sk->sk_rcv_saddr;
|
||||||
#endif
|
#endif
|
||||||
|
tb2->fastreuse = 0;
|
||||||
|
tb2->fastreuseport = 0;
|
||||||
INIT_HLIST_HEAD(&tb2->owners);
|
INIT_HLIST_HEAD(&tb2->owners);
|
||||||
hlist_add_head(&tb2->node, &head->chain);
|
hlist_add_head(&tb2->node, &head->chain);
|
||||||
hlist_add_head(&tb2->bhash_node, &tb->bhash2);
|
hlist_add_head(&tb2->bhash_node, &tb->bhash2);
|
||||||
|
|
@ -143,11 +165,23 @@ struct inet_bind2_bucket *inet_bind2_bucket_create(struct kmem_cache *cachep,
|
||||||
/* Caller must hold hashbucket lock for this tb with local BH disabled */
|
/* Caller must hold hashbucket lock for this tb with local BH disabled */
|
||||||
void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb)
|
void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb)
|
||||||
{
|
{
|
||||||
|
const struct sock *sk;
|
||||||
|
|
||||||
if (hlist_empty(&tb->owners)) {
|
if (hlist_empty(&tb->owners)) {
|
||||||
__hlist_del(&tb->node);
|
__hlist_del(&tb->node);
|
||||||
__hlist_del(&tb->bhash_node);
|
__hlist_del(&tb->bhash_node);
|
||||||
kmem_cache_free(cachep, tb);
|
kmem_cache_free(cachep, tb);
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (tb->fastreuse == -1 && tb->fastreuseport == -1)
|
||||||
|
return;
|
||||||
|
sk_for_each_bound(sk, &tb->owners) {
|
||||||
|
if (!sk_is_connect_bind(sk))
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
tb->fastreuse = -1;
|
||||||
|
tb->fastreuseport = -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool inet_bind2_bucket_addr_match(const struct inet_bind2_bucket *tb2,
|
static bool inet_bind2_bucket_addr_match(const struct inet_bind2_bucket *tb2,
|
||||||
|
|
@ -191,6 +225,7 @@ static void __inet_put_port(struct sock *sk)
|
||||||
tb = inet_csk(sk)->icsk_bind_hash;
|
tb = inet_csk(sk)->icsk_bind_hash;
|
||||||
inet_csk(sk)->icsk_bind_hash = NULL;
|
inet_csk(sk)->icsk_bind_hash = NULL;
|
||||||
inet_sk(sk)->inet_num = 0;
|
inet_sk(sk)->inet_num = 0;
|
||||||
|
sk->sk_userlocks &= ~SOCK_CONNECT_BIND;
|
||||||
|
|
||||||
spin_lock(&head2->lock);
|
spin_lock(&head2->lock);
|
||||||
if (inet_csk(sk)->icsk_bind2_hash) {
|
if (inet_csk(sk)->icsk_bind2_hash) {
|
||||||
|
|
@ -277,7 +312,7 @@ int __inet_inherit_port(const struct sock *sk, struct sock *child)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (update_fastreuse)
|
if (update_fastreuse)
|
||||||
inet_csk_update_fastreuse(tb, child);
|
inet_csk_update_fastreuse(child, tb, tb2);
|
||||||
inet_bind_hash(child, tb, tb2, port);
|
inet_bind_hash(child, tb, tb2, port);
|
||||||
spin_unlock(&head2->lock);
|
spin_unlock(&head2->lock);
|
||||||
spin_unlock(&head->lock);
|
spin_unlock(&head->lock);
|
||||||
|
|
@ -950,6 +985,10 @@ static int __inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family,
|
||||||
if (!tb2) {
|
if (!tb2) {
|
||||||
tb2 = new_tb2;
|
tb2 = new_tb2;
|
||||||
inet_bind2_bucket_init(tb2, net, head2, inet_csk(sk)->icsk_bind_hash, sk);
|
inet_bind2_bucket_init(tb2, net, head2, inet_csk(sk)->icsk_bind_hash, sk);
|
||||||
|
if (sk_is_connect_bind(sk)) {
|
||||||
|
tb2->fastreuse = -1;
|
||||||
|
tb2->fastreuseport = -1;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
inet_csk(sk)->icsk_bind2_hash = tb2;
|
inet_csk(sk)->icsk_bind2_hash = tb2;
|
||||||
sk_add_bind_node(sk, &tb2->owners);
|
sk_add_bind_node(sk, &tb2->owners);
|
||||||
|
|
@ -1120,6 +1159,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
|
||||||
head2, tb, sk);
|
head2, tb, sk);
|
||||||
if (!tb2)
|
if (!tb2)
|
||||||
goto error;
|
goto error;
|
||||||
|
tb2->fastreuse = -1;
|
||||||
|
tb2->fastreuseport = -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Here we want to add a little bit of randomness to the next source
|
/* Here we want to add a little bit of randomness to the next source
|
||||||
|
|
@ -1132,6 +1173,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
|
||||||
|
|
||||||
/* Head lock still held and bh's disabled */
|
/* Head lock still held and bh's disabled */
|
||||||
inet_bind_hash(sk, tb, tb2, port);
|
inet_bind_hash(sk, tb, tb2, port);
|
||||||
|
sk->sk_userlocks |= SOCK_CONNECT_BIND;
|
||||||
|
|
||||||
if (sk_unhashed(sk)) {
|
if (sk_unhashed(sk)) {
|
||||||
inet_sk(sk)->inet_sport = htons(port);
|
inet_sk(sk)->inet_sport = htons(port);
|
||||||
|
|
|
||||||
|
|
@ -208,6 +208,7 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
|
||||||
tw->tw_hash = sk->sk_hash;
|
tw->tw_hash = sk->sk_hash;
|
||||||
tw->tw_ipv6only = 0;
|
tw->tw_ipv6only = 0;
|
||||||
tw->tw_transparent = inet_test_bit(TRANSPARENT, sk);
|
tw->tw_transparent = inet_test_bit(TRANSPARENT, sk);
|
||||||
|
tw->tw_connect_bind = !!(sk->sk_userlocks & SOCK_CONNECT_BIND);
|
||||||
tw->tw_prot = sk->sk_prot_creator;
|
tw->tw_prot = sk->sk_prot_creator;
|
||||||
atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie));
|
atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie));
|
||||||
twsk_net_set(tw, sock_net(sk));
|
twsk_net_set(tw, sock_net(sk));
|
||||||
|
|
|
||||||