/*
 * Pid namespaces
 *
 * Authors:
 *    (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc.
 *    (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM
 *     Many thanks to Oleg Nesterov for comments and help
 *
 */

#include <linux/pid.h>
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
#include <linux/syscalls.h>
#include <linux/cred.h>
#include <linux/err.h>
#include <linux/acct.h>
#include <linux/slab.h>
#include <linux/proc_ns.h>
#include <linux/reboot.h>
#include <linux/export.h>
#include <linux/sched/task.h>
#include <linux/sched/signal.h>
#include <linux/idr.h>

struct pid_cache {
	int nr_ids;
	char name[16];
	struct kmem_cache *cachep;
	struct list_head list;
};

static LIST_HEAD(pid_caches_lh);
static DEFINE_MUTEX(pid_caches_mutex);
static struct kmem_cache *pid_ns_cachep;

/*
 * Creates (or returns an already existing) kmem cache to allocate pids from.
 * @nr_ids: the number of numerical ids a pid allocated from this cache
 *          will have to carry, i.e. one per pid-namespace level.
 */

static struct kmem_cache *create_pid_cachep(int nr_ids)
{
	struct pid_cache *pcache;
	struct kmem_cache *cachep;

	mutex_lock(&pid_caches_mutex);
	list_for_each_entry(pcache, &pid_caches_lh, list)
		if (pcache->nr_ids == nr_ids)
			goto out;

	pcache = kmalloc(sizeof(struct pid_cache), GFP_KERNEL);
	if (pcache == NULL)
		goto err_alloc;

	snprintf(pcache->name, sizeof(pcache->name), "pid_%d", nr_ids);
	cachep = kmem_cache_create(pcache->name,
			sizeof(struct pid) + (nr_ids - 1) * sizeof(struct upid),
			0, SLAB_HWCACHE_ALIGN, NULL);
	if (cachep == NULL)
		goto err_cachep;

	pcache->nr_ids = nr_ids;
	pcache->cachep = cachep;
	list_add(&pcache->list, &pid_caches_lh);
out:
	mutex_unlock(&pid_caches_mutex);
	return pcache->cachep;

err_cachep:
	kfree(pcache);
err_alloc:
	mutex_unlock(&pid_caches_mutex);
	return NULL;
}

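/*
 * Deferred from free_pid() once the namespace has no pids left (see the
 * pid_allocated comment in zap_pid_ns_processes() below); tears down the
 * namespace's procfs state via pid_ns_release_proc() outside of the
 * pid-freeing context.
 */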
static void proc_cleanup_work(struct work_struct *work)
{
	struct pid_namespace *ns = container_of(work, struct pid_namespace, proc_work);
	pid_ns_release_proc(ns);
}

/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */
#define MAX_PID_NS_LEVEL 32

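/*
 * Charge/uncharge a pid namespace against the creating user's
 * UCOUNT_PID_NAMESPACES limit (the user.max_pid_namespaces sysctl);
 * inc_ucount() returns NULL once that limit is reached.
 */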
static struct ucounts *inc_pid_namespaces(struct user_namespace *ns)
{
	return inc_ucount(ns, current_euid(), UCOUNT_PID_NAMESPACES);
}

static void dec_pid_namespaces(struct ucounts *ucounts)
{
	dec_ucount(ucounts, UCOUNT_PID_NAMESPACES);
}

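/*
 * Allocate and set up one new pid-namespace level below @parent_pid_ns.
 * The new level is owned by @user_ns, counts against that user namespace's
 * pid-namespace limit, and gets a pid cache sized for level + 1 numerical
 * ids (one per ancestor level plus its own).
 */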
static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns,
	struct pid_namespace *parent_pid_ns)
{
	struct pid_namespace *ns;
	unsigned int level = parent_pid_ns->level + 1;
	struct ucounts *ucounts;
	int err;

	err = -EINVAL;
	if (!in_userns(parent_pid_ns->user_ns, user_ns))
		goto out;

	err = -ENOSPC;
	if (level > MAX_PID_NS_LEVEL)
		goto out;
	ucounts = inc_pid_namespaces(user_ns);
	if (!ucounts)
		goto out;

	err = -ENOMEM;
	ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
	if (ns == NULL)
		goto out_dec;

	idr_init(&ns->idr);

	ns->pid_cachep = create_pid_cachep(level + 1);
	if (ns->pid_cachep == NULL)
		goto out_free_idr;

	err = ns_alloc_inum(&ns->ns);
	if (err)
		goto out_free_idr;
	ns->ns.ops = &pidns_operations;

	kref_init(&ns->kref);
	ns->level = level;
	ns->parent = get_pid_ns(parent_pid_ns);
	ns->user_ns = get_user_ns(user_ns);
	ns->ucounts = ucounts;
	ns->pid_allocated = PIDNS_ADDING;
	INIT_WORK(&ns->proc_work, proc_cleanup_work);

	return ns;

out_free_idr:
	idr_destroy(&ns->idr);
	kmem_cache_free(pid_ns_cachep, ns);
out_dec:
	dec_pid_namespaces(ucounts);
out:
	return ERR_PTR(err);
}

static void delayed_free_pidns(struct rcu_head *p)
{
	struct pid_namespace *ns = container_of(p, struct pid_namespace, rcu);

	dec_pid_namespaces(ns->ucounts);
	put_user_ns(ns->user_ns);

	kmem_cache_free(pid_ns_cachep, ns);
}

static void destroy_pid_namespace(struct pid_namespace *ns)
{
	ns_free_inum(&ns->ns);

	idr_destroy(&ns->idr);
	call_rcu(&ns->rcu, delayed_free_pidns);
}

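/*
 * Called on clone()/unshare() with CLONE_NEWPID: a new pid namespace is
 * only ever created as a child of the caller's currently active one, so
 * a task that has already switched pid_ns_for_children via setns() cannot
 * stack another CLONE_NEWPID on top of it here.
 */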
struct pid_namespace *copy_pid_ns(unsigned long flags,
	struct user_namespace *user_ns, struct pid_namespace *old_ns)
{
	if (!(flags & CLONE_NEWPID))
		return get_pid_ns(old_ns);
	if (task_active_pid_ns(current) != old_ns)
		return ERR_PTR(-EINVAL);
	return create_pid_namespace(user_ns, old_ns);
}

static void free_pid_ns(struct kref *kref)
{
	struct pid_namespace *ns;

	ns = container_of(kref, struct pid_namespace, kref);
	destroy_pid_namespace(ns);
}

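/*
 * Drop a reference on @ns and, if that was the last one, on each ancestor
 * whose only remaining reference was the one its child held - done
 * iteratively rather than recursively to keep the destruction path shallow.
 */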
void put_pid_ns(struct pid_namespace *ns)
{
	struct pid_namespace *parent;

	while (ns != &init_pid_ns) {
		parent = ns->parent;
		if (!kref_put(&ns->kref, free_pid_ns))
			break;
		ns = parent;
	}
}
EXPORT_SYMBOL_GPL(put_pid_ns);
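
/*
 * Called from the exit path when the namespace's init (->child_reaper) is
 * exiting: kill every remaining task in the namespace and wait until only
 * the reaper's own pids are left before letting init finish dying.
 */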
void zap_pid_ns_processes(struct pid_namespace *pid_ns)
{
	int nr;
	int rc;
	struct task_struct *task, *me = current;
	int init_pids = thread_group_leader(me) ? 1 : 2;
	struct pid *pid;

	/* Don't allow any more processes into the pid namespace */
	disable_pid_allocation(pid_ns);

	/*
	 * Ignore SIGCHLD, causing any terminated children to autoreap.
	 * This speeds up the namespace shutdown; see also the comment
	 * below.
	 */
	spin_lock_irq(&me->sighand->siglock);
	me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN;
	spin_unlock_irq(&me->sighand->siglock);

	/*
	 * The last thread in the namespace-init thread group is terminating.
	 * Find the remaining pids in the namespace, signal them and wait for
	 * them to exit.
	 *
	 * Note:  This signals each thread in the namespace - even those that
	 *	  belong to the same thread group. To avoid this, we would have
	 *	  to walk the entire tasklist looking for processes in this
	 *	  namespace, but that could be unnecessarily expensive if the
	 *	  pid namespace has just a few processes. Or we would need to
	 *	  maintain a tasklist for each pid namespace.
	 */
	rcu_read_lock();
	read_lock(&tasklist_lock);
	nr = 2;
	idr_for_each_entry_continue(&pid_ns->idr, pid, nr) {
		task = pid_task(pid, PIDTYPE_PID);
		if (task && !__fatal_signal_pending(task))
			send_sig_info(SIGKILL, SEND_SIG_FORCED, task);
	}
	read_unlock(&tasklist_lock);
	rcu_read_unlock();

	/*
	 * Reap the EXIT_ZOMBIE children we had before we ignored SIGCHLD.
	 * kernel_wait4() will also block until our children traced from the
	 * parent namespace are detached and become EXIT_DEAD.
	 */
	do {
		clear_thread_flag(TIF_SIGPENDING);
		rc = kernel_wait4(-1, NULL, __WALL, NULL);
	} while (rc != -ECHILD);

	/*
	 * kernel_wait4() above can't reap the EXIT_DEAD children but we do not
	 * really care, we could reparent them to the global init. We could
	 * exit and reap ->child_reaper even if it is not the last thread in
	 * this pid_ns, free_pid(pid_allocated == 0) calls proc_cleanup_work(),
	 * and pid_ns cannot go away until proc_kill_sb() drops the reference.
	 *
	 * But this ns can also have other tasks injected by setns()+fork().
	 * Again, ignoring the user visible semantics we do not really need
	 * to wait until they are all reaped, but they can be reparented to
	 * us and thus we need to ensure that pid->child_reaper stays valid
	 * until they all go away. See free_pid()->wake_up_process().
	 *
	 * We rely on the ignored SIGCHLD: an injected zombie must be
	 * autoreaped if it is reparented to us.
	 */
	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (pid_ns->pid_allocated == init_pids)
			break;
		schedule();
	}
	__set_current_state(TASK_RUNNING);

	if (pid_ns->reboot)
		current->signal->group_exit_code = pid_ns->reboot;

	acct_exit_ns(pid_ns);
	return;
}

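/*
 * kernel.ns_last_pid: reads back the last pid allocated in the caller's
 * active pid namespace; writing N makes the next pid allocated in that
 * namespace be N + 1 (if free). Checkpoint/restore tools such as CRIU
 * use this to recreate tasks with their original pids.
 */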
#ifdef CONFIG_CHECKPOINT_RESTORE
static int pid_ns_ctl_handler(struct ctl_table *table, int write,
		void __user *buffer, size_t *lenp, loff_t *ppos)
{
	struct pid_namespace *pid_ns = task_active_pid_ns(current);
	struct ctl_table tmp = *table;
	int ret, next;

	if (write && !ns_capable(pid_ns->user_ns, CAP_SYS_ADMIN))
		return -EPERM;

	/*
	 * Writing directly to ns' last_pid field is OK, since this field
	 * is volatile in a living namespace anyway and code writing to
	 * it should synchronize its usage by external means.
	 */

	next = idr_get_cursor(&pid_ns->idr) - 1;

	tmp.data = &next;
	ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
	if (!ret && write)
		idr_set_cursor(&pid_ns->idr, next + 1);

	return ret;
}

extern int pid_max;
static int zero = 0;
static struct ctl_table pid_ns_ctl_table[] = {
	{
		.procname = "ns_last_pid",
		.maxlen = sizeof(int),
		.mode = 0666, /* permissions are checked in the handler */
		.proc_handler = pid_ns_ctl_handler,
		.extra1 = &zero,
		.extra2 = &pid_max,
	},
	{ }
};
static struct ctl_path kern_path[] = { { .procname = "kernel", }, { } };
#endif	/* CONFIG_CHECKPOINT_RESTORE */

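/*
 * reboot(2) issued inside a non-initial pid namespace does not reboot the
 * host: the requested command is translated into a signal number stashed
 * in pid_ns->reboot, the namespace's init is killed, and
 * zap_pid_ns_processes() later reports that signal as init's group exit
 * code to the parent namespace.
 */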
int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
{
	if (pid_ns == &init_pid_ns)
		return 0;

	switch (cmd) {
	case LINUX_REBOOT_CMD_RESTART2:
	case LINUX_REBOOT_CMD_RESTART:
		pid_ns->reboot = SIGHUP;
		break;

	case LINUX_REBOOT_CMD_POWER_OFF:
	case LINUX_REBOOT_CMD_HALT:
		pid_ns->reboot = SIGINT;
		break;
	default:
		return -EINVAL;
	}

	read_lock(&tasklist_lock);
	force_sig(SIGKILL, pid_ns->child_reaper);
	read_unlock(&tasklist_lock);

	do_exit(0);

	/* Not reached */
	return 0;
}

static inline struct pid_namespace *to_pid_ns(struct ns_common *ns)
{
	return container_of(ns, struct pid_namespace, ns);
}

static struct ns_common *pidns_get(struct task_struct *task)
{
	struct pid_namespace *ns;

	rcu_read_lock();
	ns = task_active_pid_ns(task);
	if (ns)
		get_pid_ns(ns);
	rcu_read_unlock();

	return ns ? &ns->ns : NULL;
}

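/*
 * Backs /proc/<pid>/ns/pid_for_children: the namespace that children
 * forked by @task will be created in. It is only handed out once that
 * namespace has an init (->child_reaper), i.e. after the first process
 * has actually been forked into it.
 */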
static struct ns_common *pidns_for_children_get(struct task_struct *task)
{
	struct pid_namespace *ns = NULL;

	task_lock(task);
	if (task->nsproxy) {
		ns = task->nsproxy->pid_ns_for_children;
		get_pid_ns(ns);
	}
	task_unlock(task);

	if (ns) {
		read_lock(&tasklist_lock);
		if (!ns->child_reaper) {
			put_pid_ns(ns);
			ns = NULL;
		}
		read_unlock(&tasklist_lock);
	}

	return ns ? &ns->ns : NULL;
}

static void pidns_put(struct ns_common *ns)
{
	put_pid_ns(to_pid_ns(ns));
}

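/*
 * setns(2) on a pid namespace fd: the caller's own pid does not change;
 * only pid_ns_for_children is switched, so it is the caller's future
 * children that are born into the target namespace.
 */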
static int pidns_install(struct nsproxy *nsproxy, struct ns_common *ns)
{
	struct pid_namespace *active = task_active_pid_ns(current);
	struct pid_namespace *ancestor, *new = to_pid_ns(ns);

	if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) ||
	    !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
		return -EPERM;

	/*
	 * Only allow entering the current active pid namespace
	 * or a child of the current active pid namespace.
	 *
	 * This is required for fork to return a usable pid value and
	 * this maintains the property that processes and their
	 * children can not escape their current pid namespace.
	 */
	if (new->level < active->level)
		return -EINVAL;

	ancestor = new;
	while (ancestor->level > active->level)
		ancestor = ancestor->parent;
	if (ancestor != active)
		return -EINVAL;

	put_pid_ns(nsproxy->pid_ns_for_children);
	nsproxy->pid_ns_for_children = get_pid_ns(new);
	return 0;
}

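/*
 * Backs the NS_GET_PARENT ioctl on a pid namespace fd. The parent is only
 * handed out if it is the caller's active pid namespace or one of that
 * namespace's descendants, so a task can never look "up" past its own view.
 */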
static struct ns_common *pidns_get_parent(struct ns_common *ns)
{
	struct pid_namespace *active = task_active_pid_ns(current);
	struct pid_namespace *pid_ns, *p;

	/* See if the parent is in the current namespace */
	pid_ns = p = to_pid_ns(ns)->parent;
	for (;;) {
		if (!p)
			return ERR_PTR(-EPERM);
		if (p == active)
			break;
		p = p->parent;
	}

	return &get_pid_ns(pid_ns)->ns;
}

static struct user_namespace *pidns_owner(struct ns_common *ns)
{
	return to_pid_ns(ns)->user_ns;
}

const struct proc_ns_operations pidns_operations = {
	.name		= "pid",
	.type		= CLONE_NEWPID,
	.get		= pidns_get,
	.put		= pidns_put,
	.install	= pidns_install,
	.owner		= pidns_owner,
	.get_parent	= pidns_get_parent,
};

const struct proc_ns_operations pidns_for_children_operations = {
	.name		= "pid_for_children",
	.real_ns_name	= "pid",
	.type		= CLONE_NEWPID,
	.get		= pidns_for_children_get,
	.put		= pidns_put,
	.install	= pidns_install,
	.owner		= pidns_owner,
	.get_parent	= pidns_get_parent,
};

static __init int pid_namespaces_init(void)
{
	pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC);

#ifdef CONFIG_CHECKPOINT_RESTORE
	register_sysctl_paths(kern_path, pid_ns_ctl_table);
#endif
	return 0;
}

__initcall(pid_namespaces_init);