/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Generic socket support routines. Memory allocators, socket lock/release
 *		handler for protocols to use and generic option handler.
 *
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Alan Cox, <A.Cox@swansea.ac.uk>
 *
 * Fixes:
 *		Alan Cox	: 	Numerous verify_area() problems
 *		Alan Cox	:	Connecting on a connecting socket
 *					now returns an error for tcp.
 *		Alan Cox	:	sock->protocol is set correctly.
 *					and is not sometimes left as 0.
 *		Alan Cox	:	connect handles icmp errors on a
 *					connect properly. Unfortunately there
 *					is a restart syscall nasty there. I
 *					can't match BSD without hacking the C
 *					library. Ideas urgently sought!
 *		Alan Cox	:	Disallow bind() to addresses that are
 *					not ours - especially broadcast ones!!
 *		Alan Cox	:	Socket 1024 _IS_ ok for users. (fencepost)
 *		Alan Cox	:	sock_wfree/sock_rfree don't destroy sockets,
 *					instead they leave that for the DESTROY timer.
 *		Alan Cox	:	Clean up error flag in accept
 *		Alan Cox	:	TCP ack handling is buggy, the DESTROY timer
 *					was buggy. Put a remove_sock() in the handler
 *					for memory when we hit 0. Also altered the timer
 *					code. The ACK stuff can wait and needs major
 *					TCP layer surgery.
 *		Alan Cox	:	Fixed TCP ack bug, removed remove sock
 *					and fixed timer/inet_bh race.
 *		Alan Cox	:	Added zapped flag for TCP
 *		Alan Cox	:	Move kfree_skb into skbuff.c and tidied up surplus code
 *		Alan Cox	:	for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
 *		Alan Cox	:	kfree_s calls now are kfree_skbmem so we can track skb resources
 *		Alan Cox	:	Supports socket option broadcast now as does udp. Packet and raw need fixing.
 *		Alan Cox	:	Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
 *		Rick Sladkey	:	Relaxed UDP rules for matching packets.
 *		C.E.Hawkins	:	IFF_PROMISC/SIOCGHWADDR support
 *	Pauline Middelink	:	identd support
 *		Alan Cox	:	Fixed connect() taking signals I think.
 *		Alan Cox	:	SO_LINGER supported
 *		Alan Cox	:	Error reporting fixes
 *		Anonymous	:	inet_create tidied up (sk->reuse setting)
 *		Alan Cox	:	inet sockets don't set sk->type!
 *		Alan Cox	:	Split socket option code
 *		Alan Cox	:	Callbacks
 *		Alan Cox	:	Nagle flag for Charles & Johannes stuff
 *		Alex		:	Removed restriction on inet fioctl
 *		Alan Cox	:	Splitting INET from NET core
 *		Alan Cox	:	Fixed bogus SO_TYPE handling in getsockopt()
 *		Adam Caldwell	:	Missing return in SO_DONTROUTE/SO_DEBUG code
 *		Alan Cox	:	Split IP from generic code
 *		Alan Cox	:	New kfree_skbmem()
 *		Alan Cox	:	Make SO_DEBUG superuser only.
 *		Alan Cox	:	Allow anyone to clear SO_DEBUG
 *					(compatibility fix)
 *		Alan Cox	:	Added optimistic memory grabbing for AF_UNIX throughput.
 *		Alan Cox	:	Allocator for a socket is settable.
 *		Alan Cox	:	SO_ERROR includes soft errors.
 *		Alan Cox	:	Allow NULL arguments on some SO_ opts
 *		Alan Cox	: 	Generic socket allocation to make hooks
 *					easier (suggested by Craig Metz).
 *		Michael Pall	:	SO_ERROR returns positive errno again
 *              Steve Whitehouse:       Added default destructor to free
 *                                      protocol private data.
 *              Steve Whitehouse:       Added various other default routines
 *                                      common to several socket families.
 *              Chris Evans     :       Call suser() check last on F_SETOWN
 *		Jay Schulist	:	Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
 *		Andi Kleen	:	Add sock_kmalloc()/sock_kfree_s()
 *		Andi Kleen	:	Fix write_space callback
 *		Chris Evans	:	Security fixes - signedness again
 *		Arnaldo C. Melo :       cleanups, use skb_queue_purge
 *
 * To Fix:
 *
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/types.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sched.h>
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/poll.h>
#include <linux/tcp.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/user_namespace.h>
#include <linux/static_key.h>
#include <linux/memcontrol.h>
#include <linux/prefetch.h>

#include <asm/uaccess.h>

#include <linux/netdevice.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
#include <net/netprio_cgroup.h>

#include <linux/filter.h>

#include <trace/events/sock.h>

#ifdef CONFIG_INET
#include <net/tcp.h>
#endif

static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

#ifdef CONFIG_MEMCG_KMEM
int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
{
	struct proto *proto;
	int ret = 0;

	mutex_lock(&proto_list_mutex);
	list_for_each_entry(proto, &proto_list, node) {
		if (proto->init_cgroup) {
			ret = proto->init_cgroup(memcg, ss);
			if (ret)
				goto out;
		}
	}

	mutex_unlock(&proto_list_mutex);
	return ret;
out:
	list_for_each_entry_continue_reverse(proto, &proto_list, node)
		if (proto->destroy_cgroup)
			proto->destroy_cgroup(memcg);
	mutex_unlock(&proto_list_mutex);
	return ret;
}

void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
{
	struct proto *proto;

	mutex_lock(&proto_list_mutex);
	list_for_each_entry_reverse(proto, &proto_list, node)
		if (proto->destroy_cgroup)
			proto->destroy_cgroup(memcg);
	mutex_unlock(&proto_list_mutex);
}
#endif

/*
 * Each address family might have different locking rules, so we have
 * one slock key per address family:
 */
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];

struct static_key memcg_socket_limit_enabled;
EXPORT_SYMBOL(memcg_socket_limit_enabled);

/*
 * Make lock validator output more readable. (we pre-construct these
 * strings build-time, so that runtime initialization of socket
 * locks is fast):
 */
static const char *const af_family_key_strings[AF_MAX+1] = {
  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
  "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
  "sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG"      ,
  "sk_lock-AF_NFC"   , "sk_lock-AF_MAX"
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
  "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
  "slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG"      ,
  "slock-AF_NFC"   , "slock-AF_MAX"
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
  "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
  "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
  "clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG"      ,
  "clock-AF_NFC"   , "clock-AF_MAX"
};

/*
 * sk_callback_lock locking rules are per-address-family,
 * so split the lock classes by using a per-AF key:
 */
static struct lock_class_key af_callback_keys[AF_MAX];

/* Take into consideration the size of the struct sk_buff overhead in the
 * determination of these values, since that is non-constant across
 * platforms.  This makes socket queueing behavior and performance
 * not depend upon such differences.
 */
#define _SK_MEM_PACKETS		256
#define _SK_MEM_OVERHEAD	SKB_TRUESIZE(256)
#define SK_WMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX		(_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)

/* Run time adjustable parameters. */
__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
EXPORT_SYMBOL(sysctl_wmem_max);
__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
EXPORT_SYMBOL(sysctl_rmem_max);
__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;

/* Maximal space eaten by iovec or ancillary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);

struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE;
EXPORT_SYMBOL_GPL(memalloc_socks);

/**
 * sk_set_memalloc - sets %SOCK_MEMALLOC
 * @sk: socket to set it on
 *
 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 * It's the responsibility of the admin to adjust min_free_kbytes
 * to meet the requirements
 */
void sk_set_memalloc(struct sock *sk)
{
	sock_set_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation |= __GFP_MEMALLOC;
	static_key_slow_inc(&memalloc_socks);
}
EXPORT_SYMBOL_GPL(sk_set_memalloc);

void sk_clear_memalloc(struct sock *sk)
{
	sock_reset_flag(sk, SOCK_MEMALLOC);
	sk->sk_allocation &= ~__GFP_MEMALLOC;
	static_key_slow_dec(&memalloc_socks);

	/*
	 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
	 * progress of swapping. However, if SOCK_MEMALLOC is cleared while
	 * it has rmem allocations there is a risk that the user of the
	 * socket cannot make forward progress due to exceeding the rmem
	 * limits. By rights, sk_clear_memalloc() should only be called
	 * on sockets being torn down but warn and reset the accounting if
	 * that assumption breaks.
	 */
	if (WARN_ON(sk->sk_forward_alloc))
		sk_mem_reclaim(sk);
}
EXPORT_SYMBOL_GPL(sk_clear_memalloc);
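
/*
 * Illustrative sketch (not part of the original file): a kernel-side user
 * that backs swapping over the network would mark its transport socket
 * before use and clear the flag again on teardown. Here `sock' is assumed
 * to be a struct socket obtained via sock_create_kern():
 *
 *	sk_set_memalloc(sock->sk);
 *	...
 *	sk_clear_memalloc(sock->sk);
 *
 * While the flag is set, the socket's allocations may dip into the
 * emergency reserves, as described in the kernel-doc above.
 */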

int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
{
	int ret;
	unsigned long pflags = current->flags;

	/* these should have been dropped before queueing */
	BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));

	current->flags |= PF_MEMALLOC;
	ret = sk->sk_backlog_rcv(sk, skb);
	tsk_restore_flags(current, pflags, PF_MEMALLOC);

	return ret;
}
EXPORT_SYMBOL(__sk_backlog_rcv);

static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
{
	struct timeval tv;

	if (optlen < sizeof(tv))
		return -EINVAL;
	if (copy_from_user(&tv, optval, sizeof(tv)))
		return -EFAULT;
	if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
		return -EDOM;

	if (tv.tv_sec < 0) {
		static int warned __read_mostly;

		*timeo_p = 0;
		if (warned < 10 && net_ratelimit()) {
			warned++;
			pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
				__func__, current->comm, task_pid_nr(current));
		}
		return 0;
	}
	*timeo_p = MAX_SCHEDULE_TIMEOUT;
	if (tv.tv_sec == 0 && tv.tv_usec == 0)
		return 0;
	if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
		*timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
	return 0;
}
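
/*
 * Illustrative sketch (userspace, not part of the original file): the
 * timeval handled by sock_set_timeout() normally comes from a plain
 * setsockopt() call such as:
 *
 *	struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) < 0)
 *		perror("setsockopt(SO_RCVTIMEO)");
 *
 * A tv_usec outside [0, USEC_PER_SEC) is rejected with -EDOM above, and a
 * negative tv_sec is clamped to an immediate (zero) timeout.
 */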

static void sock_warn_obsolete_bsdism(const char *name)
{
	static int warned;
	static char warncomm[TASK_COMM_LEN];
	if (strcmp(warncomm, current->comm) && warned < 5) {
		strcpy(warncomm,  current->comm);
		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
			warncomm, name);
		warned++;
	}
}

#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))

static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
{
	if (sk->sk_flags & flags) {
		sk->sk_flags &= ~flags;
		if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
			net_disable_timestamp();
	}
}


int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
{
	int err;
	int skb_len;
	unsigned long flags;
	struct sk_buff_head *list = &sk->sk_receive_queue;

	if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
		atomic_inc(&sk->sk_drops);
		trace_sock_rcvqueue_full(sk, skb);
		return -ENOMEM;
	}

	err = sk_filter(sk, skb);
	if (err)
		return err;

	if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
		atomic_inc(&sk->sk_drops);
		return -ENOBUFS;
	}

	skb->dev = NULL;
	skb_set_owner_r(skb, sk);

	/* Cache the SKB length before we tack it onto the receive
	 * queue.  Once it is added it no longer belongs to us and
	 * may be freed by other threads of control pulling packets
	 * from the queue.
	 */
	skb_len = skb->len;

	/* We escape from the RCU-protected region, make sure we don't leak
	 * a non-refcounted dst.
	 */
	skb_dst_force(skb);

	spin_lock_irqsave(&list->lock, flags);
	skb->dropcount = atomic_read(&sk->sk_drops);
	__skb_queue_tail(list, skb);
	spin_unlock_irqrestore(&list->lock, flags);

	if (!sock_flag(sk, SOCK_DEAD))
		sk->sk_data_ready(sk, skb_len);
	return 0;
}
EXPORT_SYMBOL(sock_queue_rcv_skb);
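
/*
 * Illustrative sketch (not part of the original file): protocol delivery
 * paths hand ownership of the skb to sock_queue_rcv_skb() and free it
 * themselves only when queueing fails, roughly:
 *
 *	err = sock_queue_rcv_skb(sk, skb);
 *	if (err < 0)
 *		kfree_skb(skb);
 *
 * On success the skb sits on sk_receive_queue and may already have been
 * consumed by a reader, which is why its length is cached above before
 * it is queued.
 */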

int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
{
	int rc = NET_RX_SUCCESS;

	if (sk_filter(sk, skb))
		goto discard_and_relse;

	skb->dev = NULL;

	if (sk_rcvqueues_full(sk, skb, sk->sk_rcvbuf)) {
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}
	if (nested)
		bh_lock_sock_nested(sk);
	else
		bh_lock_sock(sk);
	if (!sock_owned_by_user(sk)) {
		/*
		 * trylock + unlock semantics:
		 */
		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);

		rc = sk_backlog_rcv(sk, skb);

		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
	} else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
		bh_unlock_sock(sk);
		atomic_inc(&sk->sk_drops);
		goto discard_and_relse;
	}

	bh_unlock_sock(sk);
out:
	sock_put(sk);
	return rc;
discard_and_relse:
	kfree_skb(skb);
	goto out;
}
EXPORT_SYMBOL(sk_receive_skb);

void sk_reset_txq(struct sock *sk)
{
	sk_tx_queue_clear(sk);
}
EXPORT_SYMBOL(sk_reset_txq);

struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = __sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_tx_queue_clear(sk);
		RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(__sk_dst_check);

struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
{
	struct dst_entry *dst = sk_dst_get(sk);

	if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
		sk_dst_reset(sk);
		dst_release(dst);
		return NULL;
	}

	return dst;
}
EXPORT_SYMBOL(sk_dst_check);

static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
{
	int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
	struct net *net = sock_net(sk);
	char devname[IFNAMSIZ];
	int index;

	/* Sorry... */
	ret = -EPERM;
	if (!capable(CAP_NET_RAW))
		goto out;

	ret = -EINVAL;
	if (optlen < 0)
		goto out;

	/* Bind this socket to a particular device like "eth0",
	 * as specified in the passed interface name. If the
	 * name is "" or the option length is zero the socket
	 * is not bound.
	 */
	if (optlen > IFNAMSIZ - 1)
		optlen = IFNAMSIZ - 1;
	memset(devname, 0, sizeof(devname));

	ret = -EFAULT;
	if (copy_from_user(devname, optval, optlen))
		goto out;

	index = 0;
	if (devname[0] != '\0') {
		struct net_device *dev;

		rcu_read_lock();
		dev = dev_get_by_name_rcu(net, devname);
		if (dev)
			index = dev->ifindex;
		rcu_read_unlock();
		ret = -ENODEV;
		if (!dev)
			goto out;
	}

	lock_sock(sk);
	sk->sk_bound_dev_if = index;
	sk_dst_reset(sk);
	release_sock(sk);

	ret = 0;

out:
#endif

	return ret;
}
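
/*
 * Illustrative sketch (userspace, not part of the original file): binding
 * a socket to one interface needs CAP_NET_RAW and passes the interface
 * name, for example:
 *
 *	const char ifname[] = "eth0";
 *
 *	if (setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
 *		       ifname, sizeof(ifname)) < 0)
 *		perror("setsockopt(SO_BINDTODEVICE)");
 *
 * Passing an empty name (or a zero option length) removes the binding
 * again, as handled by sock_bindtodevice() above.
 */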

static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
{
	if (valbool)
		sock_set_flag(sk, bit);
	else
		sock_reset_flag(sk, bit);
}

/*
 *	This is meant for all protocols to use and covers goings on
 *	at the socket level. Everything here is generic.
 */

int sock_setsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	int val;
	int valbool;
	struct linger ling;
	int ret = 0;

	/*
	 *	Options without arguments
	 */

	if (optname == SO_BINDTODEVICE)
		return sock_bindtodevice(sk, optval, optlen);

	if (optlen < sizeof(int))
		return -EINVAL;

	if (get_user(val, (int __user *)optval))
		return -EFAULT;

	valbool = val ? 1 : 0;

	lock_sock(sk);

	switch (optname) {
	case SO_DEBUG:
		if (val && !capable(CAP_NET_ADMIN))
			ret = -EACCES;
		else
			sock_valbool_flag(sk, SOCK_DBG, valbool);
		break;
	case SO_REUSEADDR:
		sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
		break;
	case SO_TYPE:
	case SO_PROTOCOL:
	case SO_DOMAIN:
	case SO_ERROR:
		ret = -ENOPROTOOPT;
		break;
	case SO_DONTROUTE:
		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
		break;
	case SO_BROADCAST:
		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
		break;
	case SO_SNDBUF:
		/* Don't error on this; BSD doesn't, and if you think
		 * about it this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints.
		 */
		val = min_t(u32, val, sysctl_wmem_max);
set_sndbuf:
		sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
		sk->sk_sndbuf = max_t(u32, val * 2, SOCK_MIN_SNDBUF);
		/* Wake up sending tasks if we upped the value. */
		sk->sk_write_space(sk);
		break;

	case SO_SNDBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_sndbuf;

	case SO_RCVBUF:
		/* Don't error on this; BSD doesn't, and if you think
		 * about it this is right. Otherwise apps have to
		 * play 'guess the biggest size' games. RCVBUF/SNDBUF
		 * are treated in BSD as hints.
		 */
		val = min_t(u32, val, sysctl_rmem_max);
set_rcvbuf:
		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
		/*
		 * We double it on the way in to account for
		 * "struct sk_buff" etc. overhead.   Applications
		 * assume that the SO_RCVBUF setting they make will
		 * allow that much actual data to be received on that
		 * socket.
		 *
		 * Applications are unaware that "struct sk_buff" and
		 * other overheads allocate from the receive buffer
		 * during socket buffer allocation.
		 *
		 * And after considering the possible alternatives,
		 * returning the value we actually used in getsockopt
		 * is the most desirable behavior.
		 */
		sk->sk_rcvbuf = max_t(u32, val * 2, SOCK_MIN_RCVBUF);
		break;

	case SO_RCVBUFFORCE:
		if (!capable(CAP_NET_ADMIN)) {
			ret = -EPERM;
			break;
		}
		goto set_rcvbuf;

	case SO_KEEPALIVE:
#ifdef CONFIG_INET
		if (sk->sk_protocol == IPPROTO_TCP &&
		    sk->sk_type == SOCK_STREAM)
			tcp_set_keepalive(sk, valbool);
#endif
		sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
		break;

	case SO_OOBINLINE:
		sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
		break;

	case SO_NO_CHECK:
		sk->sk_no_check = valbool;
		break;

	case SO_PRIORITY:
		if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
			sk->sk_priority = val;
		else
			ret = -EPERM;
		break;

	case SO_LINGER:
		if (optlen < sizeof(ling)) {
			ret = -EINVAL;	/* 1003.1g */
			break;
		}
		if (copy_from_user(&ling, optval, sizeof(ling))) {
			ret = -EFAULT;
			break;
		}
		if (!ling.l_onoff)
			sock_reset_flag(sk, SOCK_LINGER);
		else {
#if (BITS_PER_LONG == 32)
			if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
				sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
			else
#endif
				sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
			sock_set_flag(sk, SOCK_LINGER);
		}
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("setsockopt");
		break;

	case SO_PASSCRED:
		if (valbool)
			set_bit(SOCK_PASSCRED, &sock->flags);
		else
			clear_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_TIMESTAMP:
	case SO_TIMESTAMPNS:
		if (valbool)  {
			if (optname == SO_TIMESTAMP)
				sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
			else
				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
			sock_set_flag(sk, SOCK_RCVTSTAMP);
			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
		} else {
			sock_reset_flag(sk, SOCK_RCVTSTAMP);
			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
		}
		break;

	case SO_TIMESTAMPING:
		if (val & ~SOF_TIMESTAMPING_MASK) {
			ret = -EINVAL;
			break;
		}
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
				  val & SOF_TIMESTAMPING_TX_HARDWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
				  val & SOF_TIMESTAMPING_TX_SOFTWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
				  val & SOF_TIMESTAMPING_RX_HARDWARE);
		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
			sock_enable_timestamp(sk,
					      SOCK_TIMESTAMPING_RX_SOFTWARE);
		else
			sock_disable_timestamp(sk,
					       (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
				  val & SOF_TIMESTAMPING_SOFTWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
				  val & SOF_TIMESTAMPING_SYS_HARDWARE);
		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
				  val & SOF_TIMESTAMPING_RAW_HARDWARE);
		break;

	case SO_RCVLOWAT:
		if (val < 0)
			val = INT_MAX;
		sk->sk_rcvlowat = val ? : 1;
		break;

	case SO_RCVTIMEO:
		ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
		break;

	case SO_SNDTIMEO:
		ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
		break;

	case SO_ATTACH_FILTER:
		ret = -EINVAL;
		if (optlen == sizeof(struct sock_fprog)) {
			struct sock_fprog fprog;

			ret = -EFAULT;
			if (copy_from_user(&fprog, optval, sizeof(fprog)))
				break;

			ret = sk_attach_filter(&fprog, sk);
		}
		break;

	case SO_DETACH_FILTER:
		ret = sk_detach_filter(sk);
		break;

	case SO_PASSSEC:
		if (valbool)
			set_bit(SOCK_PASSSEC, &sock->flags);
		else
			clear_bit(SOCK_PASSSEC, &sock->flags);
		break;
	case SO_MARK:
		if (!capable(CAP_NET_ADMIN))
			ret = -EPERM;
		else
			sk->sk_mark = val;
		break;

		/* We implement the SO_SNDLOWAT etc to
		   not be settable (1003.1g 5.3) */
	case SO_RXQ_OVFL:
		sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
		break;

	case SO_WIFI_STATUS:
		sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
		break;

	case SO_PEEK_OFF:
		if (sock->ops->set_peek_off)
			sock->ops->set_peek_off(sk, val);
		else
			ret = -EOPNOTSUPP;
		break;

	case SO_NOFCS:
		sock_valbool_flag(sk, SOCK_NOFCS, valbool);
		break;

	default:
		ret = -ENOPROTOOPT;
		break;
	}
	release_sock(sk);
	return ret;
}
EXPORT_SYMBOL(sock_setsockopt);
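
/*
 * Illustrative sketch (userspace, not part of the original file): because
 * the SO_SNDBUF/SO_RCVBUF cases above double the requested value to cover
 * struct sk_buff overhead, a later getsockopt() reports roughly twice the
 * requested size (clamped by sysctl_rmem_max and SOCK_MIN_RCVBUF):
 *
 *	int req = 65536, eff = 0;
 *	socklen_t len = sizeof(eff);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &req, sizeof(req));
 *	getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &eff, &len);
 *	printf("requested %d, kernel uses %d\n", req, eff);
 */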


void cred_to_ucred(struct pid *pid, const struct cred *cred,
		   struct ucred *ucred)
{
	ucred->pid = pid_vnr(pid);
	ucred->uid = ucred->gid = -1;
	if (cred) {
		struct user_namespace *current_ns = current_user_ns();

		ucred->uid = from_kuid_munged(current_ns, cred->euid);
		ucred->gid = from_kgid_munged(current_ns, cred->egid);
	}
}
EXPORT_SYMBOL_GPL(cred_to_ucred);

int sock_getsockopt(struct socket *sock, int level, int optname,
		    char __user *optval, int __user *optlen)
{
	struct sock *sk = sock->sk;

	union {
		int val;
		struct linger ling;
		struct timeval tm;
	} v;

	int lv = sizeof(int);
	int len;

	if (get_user(len, optlen))
		return -EFAULT;
	if (len < 0)
		return -EINVAL;

	memset(&v, 0, sizeof(v));

	switch (optname) {
	case SO_DEBUG:
		v.val = sock_flag(sk, SOCK_DBG);
		break;

	case SO_DONTROUTE:
		v.val = sock_flag(sk, SOCK_LOCALROUTE);
		break;

	case SO_BROADCAST:
		v.val = sock_flag(sk, SOCK_BROADCAST);
		break;

	case SO_SNDBUF:
		v.val = sk->sk_sndbuf;
		break;

	case SO_RCVBUF:
		v.val = sk->sk_rcvbuf;
		break;

	case SO_REUSEADDR:
		v.val = sk->sk_reuse;
		break;

	case SO_KEEPALIVE:
		v.val = sock_flag(sk, SOCK_KEEPOPEN);
		break;

	case SO_TYPE:
		v.val = sk->sk_type;
		break;

	case SO_PROTOCOL:
		v.val = sk->sk_protocol;
		break;

	case SO_DOMAIN:
		v.val = sk->sk_family;
		break;

	case SO_ERROR:
		v.val = -sock_error(sk);
		if (v.val == 0)
			v.val = xchg(&sk->sk_err_soft, 0);
		break;

	case SO_OOBINLINE:
		v.val = sock_flag(sk, SOCK_URGINLINE);
		break;

	case SO_NO_CHECK:
		v.val = sk->sk_no_check;
		break;

	case SO_PRIORITY:
		v.val = sk->sk_priority;
		break;

	case SO_LINGER:
		lv		= sizeof(v.ling);
		v.ling.l_onoff	= sock_flag(sk, SOCK_LINGER);
		v.ling.l_linger	= sk->sk_lingertime / HZ;
		break;

	case SO_BSDCOMPAT:
		sock_warn_obsolete_bsdism("getsockopt");
		break;

	case SO_TIMESTAMP:
		v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
				!sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPNS:
		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
		break;

	case SO_TIMESTAMPING:
		v.val = 0;
		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
			v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
			v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
			v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
			v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
			v.val |= SOF_TIMESTAMPING_SOFTWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
			v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
		if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
			v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
		break;

	case SO_RCVTIMEO:
		lv = sizeof(struct timeval);
		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
		}
		break;

	case SO_SNDTIMEO:
		lv = sizeof(struct timeval);
		if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
			v.tm.tv_sec = 0;
			v.tm.tv_usec = 0;
		} else {
			v.tm.tv_sec = sk->sk_sndtimeo / HZ;
			v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
		}
		break;

	case SO_RCVLOWAT:
		v.val = sk->sk_rcvlowat;
		break;

	case SO_SNDLOWAT:
		v.val = 1;
		break;

	case SO_PASSCRED:
		v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
		break;

	case SO_PEERCRED:
	{
		struct ucred peercred;
		if (len > sizeof(peercred))
			len = sizeof(peercred);
		cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
		if (copy_to_user(optval, &peercred, len))
			return -EFAULT;
		goto lenout;
	}

	case SO_PEERNAME:
	{
		char address[128];

		if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
			return -ENOTCONN;
		if (lv < len)
			return -EINVAL;
		if (copy_to_user(optval, address, len))
			return -EFAULT;
		goto lenout;
	}

	/* Dubious BSD thing... Probably nobody even uses it, but
	 * the UNIX standard wants it for whatever reason... -DaveM
	 */
	case SO_ACCEPTCONN:
		v.val = sk->sk_state == TCP_LISTEN;
		break;

	case SO_PASSSEC:
		v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
		break;

	case SO_PEERSEC:
		return security_socket_getpeersec_stream(sock, optval, optlen, len);

	case SO_MARK:
		v.val = sk->sk_mark;
		break;

	case SO_RXQ_OVFL:
		v.val = sock_flag(sk, SOCK_RXQ_OVFL);
		break;

	case SO_WIFI_STATUS:
		v.val = sock_flag(sk, SOCK_WIFI_STATUS);
		break;

	case SO_PEEK_OFF:
		if (!sock->ops->set_peek_off)
			return -EOPNOTSUPP;

		v.val = sk->sk_peek_off;
		break;
	case SO_NOFCS:
		v.val = sock_flag(sk, SOCK_NOFCS);
		break;
	case SO_BINDTODEVICE:
		v.val = sk->sk_bound_dev_if;
		break;
	case SO_GET_FILTER:
		len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
		if (len < 0)
			return len;

		goto lenout;
	default:
		return -ENOPROTOOPT;
	}

	if (len > lv)
		len = lv;
	if (copy_to_user(optval, &v, len))
		return -EFAULT;
lenout:
	if (put_user(len, optlen))
		return -EFAULT;
	return 0;
}
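
/*
 * Illustrative sketch (userspace, not part of the original file): the
 * SO_PEERCRED case above is how a UNIX-domain server identifies the
 * process that connected to it:
 *
 *	struct ucred cred;
 *	socklen_t len = sizeof(cred);
 *
 *	if (getsockopt(fd, SOL_SOCKET, SO_PEERCRED, &cred, &len) == 0)
 *		printf("peer pid=%d uid=%d gid=%d\n",
 *		       (int)cred.pid, (int)cred.uid, (int)cred.gid);
 */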

/*
 * Initialize an sk_lock.
 *
 * (We also register the sk_lock with the lock validator.)
 */
static inline void sock_lock_init(struct sock *sk)
{
	sock_lock_init_class_and_name(sk,
			af_family_slock_key_strings[sk->sk_family],
			af_family_slock_keys + sk->sk_family,
			af_family_key_strings[sk->sk_family],
			af_family_keys + sk->sk_family);
}

/*
 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
 * even temporarily, because of RCU lookups. sk_node should also be left as is.
 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end.
 */
static void sock_copy(struct sock *nsk, const struct sock *osk)
{
#ifdef CONFIG_SECURITY_NETWORK
	void *sptr = nsk->sk_security;
#endif
	memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));

	memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
	       osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));

#ifdef CONFIG_SECURITY_NETWORK
	nsk->sk_security = sptr;
	security_sk_clone(osk, nsk);
#endif
}

/*
 * Caches using SLAB_DESTROY_BY_RCU should leave the .next pointer of nulls
 * nodes unmodified. Special care is taken when initializing the object to zero.
 */
static inline void sk_prot_clear_nulls(struct sock *sk, int size)
{
	if (offsetof(struct sock, sk_node.next) != 0)
		memset(sk, 0, offsetof(struct sock, sk_node.next));
	memset(&sk->sk_node.pprev, 0,
	       size - offsetof(struct sock, sk_node.pprev));
}

void sk_prot_clear_portaddr_nulls(struct sock *sk, int size)
{
	unsigned long nulls1, nulls2;

	nulls1 = offsetof(struct sock, __sk_common.skc_node.next);
	nulls2 = offsetof(struct sock, __sk_common.skc_portaddr_node.next);
	if (nulls1 > nulls2)
		swap(nulls1, nulls2);

	if (nulls1 != 0)
		memset((char *)sk, 0, nulls1);
	memset((char *)sk + nulls1 + sizeof(void *), 0,
	       nulls2 - nulls1 - sizeof(void *));
	memset((char *)sk + nulls2 + sizeof(void *), 0,
	       size - nulls2 - sizeof(void *));
}
EXPORT_SYMBOL(sk_prot_clear_portaddr_nulls);

static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
		int family)
{
	struct sock *sk;
	struct kmem_cache *slab;

	slab = prot->slab;
	if (slab != NULL) {
		sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
		if (!sk)
			return sk;
		if (priority & __GFP_ZERO) {
			if (prot->clear_sk)
				prot->clear_sk(sk, prot->obj_size);
			else
				sk_prot_clear_nulls(sk, prot->obj_size);
		}
	} else
		sk = kmalloc(prot->obj_size, priority);

	if (sk != NULL) {
		kmemcheck_annotate_bitfield(sk, flags);

		if (security_sk_alloc(sk, family, priority))
			goto out_free;

		if (!try_module_get(prot->owner))
			goto out_free_sec;
		sk_tx_queue_clear(sk);
	}

	return sk;

out_free_sec:
	security_sk_free(sk);
out_free:
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	return NULL;
}

static void sk_prot_free(struct proto *prot, struct sock *sk)
{
	struct kmem_cache *slab;
	struct module *owner;

	owner = prot->owner;
	slab = prot->slab;

	security_sk_free(sk);
	if (slab != NULL)
		kmem_cache_free(slab, sk);
	else
		kfree(sk);
	module_put(owner);
}

#ifdef CONFIG_CGROUPS
#if IS_ENABLED(CONFIG_NET_CLS_CGROUP)
void sock_update_classid(struct sock *sk, struct task_struct *task)
{
	u32 classid;

	classid = task_cls_classid(task);
	if (classid != sk->sk_classid)
		sk->sk_classid = classid;
}
EXPORT_SYMBOL(sock_update_classid);
#endif

#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
void sock_update_netprioidx(struct sock *sk, struct task_struct *task)
{
	if (in_interrupt())
		return;

	sk->sk_cgrp_prioidx = task_netprioidx(task);
}
EXPORT_SYMBOL_GPL(sock_update_netprioidx);
#endif
#endif

/**
 *	sk_alloc - All socket objects are allocated here
 *	@net: the applicable net namespace
 *	@family: protocol family
 *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
 *	@prot: struct proto associated with this new sock instance
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
		      struct proto *prot)
{
	struct sock *sk;

	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
	if (sk) {
		sk->sk_family = family;
		/*
		 * See comment in struct sock definition to understand
		 * why we need sk_prot_creator -acme
		 */
		sk->sk_prot = sk->sk_prot_creator = prot;
		sock_lock_init(sk);
		sock_net_set(sk, get_net(net));
		atomic_set(&sk->sk_wmem_alloc, 1);

		sock_update_classid(sk, current);
		sock_update_netprioidx(sk, current);
	}

	return sk;
}
EXPORT_SYMBOL(sk_alloc);
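
/*
 * Illustrative sketch (not part of the original file): a protocol
 * family's create() hook allocates its sock through this helper, along
 * the lines of the inet code, where `answer_prot' is whatever struct
 * proto the caller selected for the requested socket type:
 *
 *	sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);
 *	if (!sk)
 *		return -ENOBUFS;
 *	sock_init_data(sock, sk);
 *
 * __sk_free() below is the eventual teardown path once the last
 * reference to such a sock is dropped.
 */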

static void __sk_free(struct sock *sk)
{
	struct sk_filter *filter;

	if (sk->sk_destruct)
		sk->sk_destruct(sk);