/*
 *  kernel/sched.c
 *
 *  Kernel scheduler and related syscalls
 *
 *  Copyright (C) 1991-2002  Linus Torvalds
 *
 *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
 *		make semaphores SMP safe
 *  1998-11-19	Implemented schedule_timeout() and related stuff
 *		by Andrea Arcangeli
 *  2002-01-04	New ultra-scalable O(1) scheduler by Ingo Molnar:
 *		hybrid priority-list and round-robin design with
 *		an array-switch method of distributing timeslices
 *		and per-CPU runqueues.  Cleanups and useful suggestions
 *		by Davide Libenzi, preemptible kernel bits by Robert Love.
 *  2003-09-03	Interactivity tuning by Con Kolivas.
 *  2004-04-02	Scheduler domains code by Nick Piggin
 *  2007-04-15  Work begun on replacing all interactivity tuning with a
 *              fair scheduling design by Con Kolivas.
 *  2007-05-05  Load balancing (smp-nice) and other improvements
 *              by Peter Williams
 *  2007-05-06  Interactivity improvements to CFS by Mike Galbraith
 *  2007-07-01  Group scheduling enhancements by Srivatsa Vaddagiri
 *  2007-11-29  RT balancing improvements by Steven Rostedt, Gregory Haskins,
 *              Thomas Gleixner, Mike Kravetz
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/nmi.h>
#include <linux/init.h>
#include <linux/uaccess.h>
#include <linux/highmem.h>
#include <linux/smp_lock.h>
#include <asm/mmu_context.h>
#include <linux/interrupt.h>
#include <linux/capability.h>
#include <linux/completion.h>
#include <linux/kernel_stat.h>
#include <linux/debug_locks.h>
#include <linux/perf_event.h>
#include <linux/security.h>
#include <linux/notifier.h>
#include <linux/profile.h>
#include <linux/freezer.h>
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/pid_namespace.h>
#include <linux/smp.h>
#include <linux/threads.h>
#include <linux/timer.h>
#include <linux/rcupdate.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/percpu.h>
#include <linux/kthread.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sysctl.h>
#include <linux/syscalls.h>
#include <linux/times.h>
#include <linux/tsacct_kern.h>
#include <linux/kprobes.h>
#include <linux/delayacct.h>
#include <linux/unistd.h>
#include <linux/pagemap.h>
#include <linux/hrtimer.h>
#include <linux/tick.h>
#include <linux/debugfs.h>
#include <linux/ctype.h>
#include <linux/ftrace.h>

#include <asm/tlb.h>
#include <asm/irq_regs.h>

#include "sched_cpupri.h"

#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>

/*
 * Convert user-nice values [ -20 ... 0 ... 19 ]
 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
 * and back.
 */
#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio)	((prio) - MAX_RT_PRIO - 20)
#define TASK_NICE(p)		PRIO_TO_NICE((p)->static_prio)
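/*
 * e.g. NICE_TO_PRIO(-20) == MAX_RT_PRIO, NICE_TO_PRIO(0) == MAX_RT_PRIO + 20,
 * NICE_TO_PRIO(19) == MAX_PRIO - 1.
 */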

/*
 * 'User priority' is the nice value converted to something we
 * can work with better when scaling various scheduler parameters,
 * it's a [ 0 ... 39 ] range.
 */
#define USER_PRIO(p)		((p)-MAX_RT_PRIO)
#define TASK_USER_PRIO(p)	USER_PRIO((p)->static_prio)
#define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))

/*
 * Helpers for converting nanosecond timing to jiffy resolution
 */
#define NS_TO_JIFFIES(TIME)	((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))

#define NICE_0_LOAD		SCHED_LOAD_SCALE
#define NICE_0_SHIFT		SCHED_LOAD_SHIFT

/*
 * These are the 'tuning knobs' of the scheduler:
 *
 * default timeslice is 100 msecs (used only for SCHED_RR tasks).
 * Timeslices get refilled after they expire.
 */
#define DEF_TIMESLICE		(100 * HZ / 1000)

/*
 * single value that denotes runtime == period, i.e. unlimited time.
 */
#define RUNTIME_INF	((u64)~0ULL)

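/* SCHED_FIFO and SCHED_RR are the two realtime scheduling policies: */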
static inline int rt_policy(int policy)
{
	if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
		return 1;
	return 0;
}

static inline int task_has_rt_policy(struct task_struct *p)
{
	return rt_policy(p->policy);
}

/*
 * This is the priority-queue data structure of the RT scheduling class:
 */
struct rt_prio_array {
	DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
	struct list_head queue[MAX_RT_PRIO];
};

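/*
 * RT bandwidth control: at most rt_runtime worth of RT execution is allowed
 * per rt_period; rt_period_timer replenishes the budget each period.
 */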
struct rt_bandwidth {
	/* nests inside the rq lock: */
	spinlock_t		rt_runtime_lock;
	ktime_t			rt_period;
	u64			rt_runtime;
	struct hrtimer		rt_period_timer;
};

static struct rt_bandwidth def_rt_bandwidth;

static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);

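/*
 * Periodic timer that replenishes RT runtime: forward the timer by rt_period
 * and call do_sched_rt_period_timer(); stop once the bandwidth code reports
 * the group idle.
 */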
static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
{
	struct rt_bandwidth *rt_b =
		container_of(timer, struct rt_bandwidth, rt_period_timer);
	ktime_t now;
	int overrun;
	int idle = 0;

	for (;;) {
		now = hrtimer_cb_get_time(timer);
		overrun = hrtimer_forward(timer, now, rt_b->rt_period);

		if (!overrun)
			break;

		idle = do_sched_rt_period_timer(rt_b, overrun);
	}

	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}

static
void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
{
	rt_b->rt_period = ns_to_ktime(period);
	rt_b->rt_runtime = runtime;

	spin_lock_init(&rt_b->rt_runtime_lock);

	hrtimer_init(&rt_b->rt_period_timer,
			CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	rt_b->rt_period_timer.function = sched_rt_period_timer;
}

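/* A negative sysctl_sched_rt_runtime disables RT bandwidth control entirely. */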
static inline int rt_bandwidth_enabled(void)
{
	return sysctl_sched_rt_runtime >= 0;
}

static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
{
	ktime_t now;

	if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
		return;

	if (hrtimer_active(&rt_b->rt_period_timer))
		return;

	spin_lock(&rt_b->rt_runtime_lock);
	for (;;) {
		unsigned long delta;
		ktime_t soft, hard;

		if (hrtimer_active(&rt_b->rt_period_timer))
			break;

		now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
		hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);

		soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
		hard = hrtimer_get_expires(&rt_b->rt_period_timer);
		delta = ktime_to_ns(ktime_sub(hard, soft));
		__hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
				HRTIMER_MODE_ABS_PINNED, 0);
	}
	spin_unlock(&rt_b->rt_runtime_lock);
}

#ifdef CONFIG_RT_GROUP_SCHED
static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
{
	hrtimer_cancel(&rt_b->rt_period_timer);
}
#endif

/*
 * sched_domains_mutex serializes calls to arch_init_sched_domains,
 * detach_destroy_domains and partition_sched_domains.
 */
static DEFINE_MUTEX(sched_domains_mutex);

#ifdef CONFIG_GROUP_SCHED

#include <linux/cgroup.h>

struct cfs_rq;

static LIST_HEAD(task_groups);

/* task group related information */
struct task_group {
#ifdef CONFIG_CGROUP_SCHED
	struct cgroup_subsys_state css;
#endif

#ifdef CONFIG_USER_SCHED
	uid_t uid;
#endif

#ifdef CONFIG_FAIR_GROUP_SCHED
	/* schedulable entities of this group on each cpu */
	struct sched_entity **se;
	/* runqueue "owned" by this group on each cpu */
	struct cfs_rq **cfs_rq;
	unsigned long shares;
#endif

#ifdef CONFIG_RT_GROUP_SCHED
	struct sched_rt_entity **rt_se;
	struct rt_rq **rt_rq;

	struct rt_bandwidth rt_bandwidth;
#endif

	struct rcu_head rcu;
	struct list_head list;

	struct task_group *parent;
	struct list_head siblings;
	struct list_head children;
};

#ifdef CONFIG_USER_SCHED

/* Helper function to pass uid information to create_sched_user() */
void set_tg_uid(struct user_struct *user)
{
	user->tg->uid = user->uid;
}

/*
 * Root task group.
 *	Every UID task group (including init_task_group aka UID-0) will
 *	be a child to this group.
 */
struct task_group root_task_group;

#ifdef CONFIG_FAIR_GROUP_SCHED
/* Default task group's sched entity on each cpu */
static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
/* Default task group's cfs_rq on each cpu */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq);
#endif /* CONFIG_FAIR_GROUP_SCHED */

#ifdef CONFIG_RT_GROUP_SCHED
static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq);
#endif /* CONFIG_RT_GROUP_SCHED */
#else /* !CONFIG_USER_SCHED */
#define root_task_group init_task_group
#endif /* CONFIG_USER_SCHED */

/* task_group_lock serializes add/remove of task groups and also changes to
 * a task group's cpu shares.
 */
static DEFINE_SPINLOCK(task_group_lock);

#ifdef CONFIG_SMP
static int root_task_group_empty(void)
{
	return list_empty(&root_task_group.children);
}
#endif

#ifdef CONFIG_FAIR_GROUP_SCHED
#ifdef CONFIG_USER_SCHED
# define INIT_TASK_GROUP_LOAD	(2*NICE_0_LOAD)
#else /* !CONFIG_USER_SCHED */
# define INIT_TASK_GROUP_LOAD	NICE_0_LOAD
#endif /* CONFIG_USER_SCHED */

/*
 * A weight of 0 or 1 can cause arithmetic problems.
 * The weight of a cfs_rq is the sum of the weights of the entities
 * queued on it, so the weight of an entity should not be too large,
 * and neither should the shares value of a task group.
 * (The default weight is 1024 - so there's no practical
 *  limitation from this.)
 */
#define MIN_SHARES	2
#define MAX_SHARES	(1UL << 18)

static int init_task_group_load = INIT_TASK_GROUP_LOAD;
#endif

/* Default task group.
 *	Every task in the system belongs to this group at bootup.
 */
struct task_group init_task_group;

/* return group to which a task belongs */
static inline struct task_group *task_group(struct task_struct *p)
{
	struct task_group *tg;

#ifdef CONFIG_USER_SCHED
	rcu_read_lock();
	tg = __task_cred(p)->user->tg;
	rcu_read_unlock();
#elif defined(CONFIG_CGROUP_SCHED)
	tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
				struct task_group, css);
#else
	tg = &init_task_group;
#endif
	return tg;
}

/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
{
#ifdef CONFIG_FAIR_GROUP_SCHED
	p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
	p->se.parent = task_group(p)->se[cpu];
#endif

#ifdef CONFIG_RT_GROUP_SCHED
	p->rt.rt_rq  = task_group(p)->rt_rq[cpu];
	p->rt.parent = task_group(p)->rt_se[cpu];
#endif
}

#else

static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
static inline struct task_group *task_group(struct task_struct *p)
{
	return NULL;
}

#endif	/* CONFIG_GROUP_SCHED */

/* CFS-related fields in a runqueue */
struct cfs_rq {
	struct load_weight load;
	unsigned long nr_running;

	u64 exec_clock;
	u64 min_vruntime;

	struct rb_root tasks_timeline;
	struct rb_node *rb_leftmost;

	struct list_head tasks;
	struct list_head *balance_iterator;

	/*
	 * 'curr' points to currently running entity on this cfs_rq.
	 * It is set to NULL otherwise (i.e. when none are currently running).
	 */
	struct sched_entity *curr, *next, *last;

	unsigned int nr_spread_over;

#ifdef CONFIG_FAIR_GROUP_SCHED
	struct rq *rq;	/* cpu runqueue to which this cfs_rq is attached */

	/*
	 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
	 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
	 * (like users, containers etc.)
	 *
	 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
	 * list is used during load balance.
	 */
	struct list_head leaf_cfs_rq_list;
	struct task_group *tg;	/* group that "owns" this runqueue */

#ifdef CONFIG_SMP
	/*
	 * the part of load.weight contributed by tasks
	 */
	unsigned long task_weight;

	/*
	 *   h_load = weight * f(tg)
	 *
	 * Where f(tg) is the recursive weight fraction assigned to
	 * this group.
	 */
	unsigned long h_load;
	/*
	 * this cpu's part of tg->shares
	 */
	unsigned long shares;

	/*
	 * load.weight at the time we set shares
	 */
	unsigned long rq_weight;
#endif
#endif
};

/* Real-Time classes' related field in a runqueue: */
struct rt_rq {
	struct rt_prio_array active;
	unsigned long rt_nr_running;
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
	struct {
		int curr; /* highest queued rt task prio */
#ifdef CONFIG_SMP
		int next; /* next highest */
#endif
	} highest_prio;
#endif
#ifdef CONFIG_SMP
	unsigned long rt_nr_migratory;
	unsigned long rt_nr_total;
	int overloaded;
	struct plist_head pushable_tasks;
#endif
	int rt_throttled;
	u64 rt_time;
	u64 rt_runtime;
	/* Nests inside the rq lock: */
	spinlock_t rt_runtime_lock;

#ifdef CONFIG_RT_GROUP_SCHED
	unsigned long rt_nr_boosted;

	struct rq *rq;
	struct list_head leaf_rt_rq_list;
	struct task_group *tg;
	struct sched_rt_entity *rt_se;
#endif
};

#ifdef CONFIG_SMP

/*
 * We add the notion of a root-domain which will be used to define per-domain
 * variables. Each exclusive cpuset essentially defines an island domain by
 * fully partitioning the member cpus from any other cpuset. Whenever a new
 * exclusive cpuset is created, we also create and attach a new root-domain
 * object.
 *
 */
struct root_domain {
	atomic_t refcount;
	cpumask_var_t span;
	cpumask_var_t online;

	/*
	 * The "RT overload" flag: it gets set if a CPU has more than
	 * one runnable RT task.
	 */
	cpumask_var_t rto_mask;
	atomic_t rto_count;
#ifdef CONFIG_SMP
	struct cpupri cpupri;
#endif
};

/*
 * By default the system creates a single root-domain with all cpus as
 * members (mimicking the global state we have today).
 */
static struct root_domain def_root_domain;

#endif

/*
 * This is the main, per-CPU runqueue data structure.
 *
 * Locking rule: those places that want to lock multiple runqueues
 * (such as the load balancing or the thread migration code): lock
 * acquire operations must be ordered by ascending runqueue address.
 */
struct rq {
	/* runqueue lock: */
	spinlock_t lock;

	/*
	 * nr_running and cpu_load should be in the same cacheline because
	 * remote CPUs use both these fields when doing load calculation.
	 */
	unsigned long nr_running;
	#define CPU_LOAD_IDX_MAX 5
	unsigned long cpu_load[CPU_LOAD_IDX_MAX];
#ifdef CONFIG_NO_HZ
	unsigned long last_tick_seen;
	unsigned char in_nohz_recently;
#endif
	/* capture load from *all* tasks on this cpu: */
	struct load_weight load;
	unsigned long nr_load_updates;
	u64 nr_switches;
	u64 nr_migrations_in;

	struct cfs_rq cfs;
	struct rt_rq rt;

#ifdef CONFIG_FAIR_GROUP_SCHED
	/* list of leaf cfs_rq on this cpu: */
	struct list_head leaf_cfs_rq_list;
#endif
#ifdef CONFIG_RT_GROUP_SCHED
	struct list_head leaf_rt_rq_list;
#endif

	/*
	 * This is part of a global counter where only the total sum
	 * over all CPUs matters. A task can increase this counter on
	 * one CPU and if it got migrated afterwards it may decrease
	 * it on another CPU. Always updated under the runqueue lock:
	 */
	unsigned long nr_uninterruptible;

	struct task_struct *curr, *idle;
	unsigned long next_balance;
	struct mm_struct *prev_mm;

	u64 clock;

	atomic_t nr_iowait;

#ifdef CONFIG_SMP
	struct root_domain *rd;
	struct sched_domain *sd;

	unsigned char idle_at_tick;
	/* For active balancing */
	int post_schedule;
	int active_balance;
	int push_cpu;
	/* cpu of this runqueue: */
	int cpu;
	int online;

	unsigned long avg_load_per_task;

	struct task_struct *migration_thread;
	struct list_head migration_queue;

	u64 rt_avg;
	u64 age_stamp;
#endif

	/* calc_load related fields */
	unsigned long calc_load_update;
	long calc_load_active;

#ifdef CONFIG_SCHED_HRTICK
#ifdef CONFIG_SMP
	int hrtick_csd_pending;
	struct call_single_data hrtick_csd;
#endif
	struct hrtimer hrtick_timer;
#endif

#ifdef CONFIG_SCHEDSTATS
	/* latency stats */
	struct sched_info rq_sched_info;
	unsigned long long rq_cpu_time;
	/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */

	/* sys_sched_yield() stats */
	unsigned int yld_count;

	/* schedule() stats */
	unsigned int sched_switch;
	unsigned int sched_count;
	unsigned int sched_goidle;

	/* try_to_wake_up() stats */
	unsigned int ttwu_count;
	unsigned int ttwu_local;

	/* BKL stats */
	unsigned int bkl_count;
#endif
};

static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

static inline
void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{
	rq->curr->sched_class->check_preempt_curr(rq, p, flags);
}

static inline int cpu_of(struct rq *rq)
{
#ifdef CONFIG_SMP
	return rq->cpu;
#else
	return 0;
#endif
}

/*
 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
 * See detach_destroy_domains: synchronize_sched for details.
 *
 * The domain tree of any CPU may only be accessed from within
 * preempt-disabled sections.
 */
#define for_each_domain(cpu, __sd) \
	for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)

#define cpu_rq(cpu)		(&per_cpu(runqueues, (cpu)))
#define this_rq()		(&__get_cpu_var(runqueues))
#define task_rq(p)		cpu_rq(task_cpu(p))
#define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
#define raw_rq()		(&__raw_get_cpu_var(runqueues))

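/* rq->clock is the runqueue's view of this CPU's sched_clock. */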
inline void update_rq_clock(struct rq *rq)
{
	rq->clock = sched_clock_cpu(cpu_of(rq));
}

/*
 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
 */
#ifdef CONFIG_SCHED_DEBUG
# define const_debug __read_mostly
#else
# define const_debug static const
#endif

/**
 * runqueue_is_locked
 *
 * Returns true if the current cpu runqueue is locked.
 * This interface allows printk to be called with the runqueue lock
 * held and know whether or not it is OK to wake up the klogd.
 */
int runqueue_is_locked(int cpu)
{
	return spin_is_locked(&cpu_rq(cpu)->lock);
}

/*
 * Debugging: various feature bits
 */

#define SCHED_FEAT(name, enabled)	\
	__SCHED_FEAT_##name ,

enum {
#include "sched_features.h"
};

#undef SCHED_FEAT

#define SCHED_FEAT(name, enabled)	\
	(1UL << __SCHED_FEAT_##name) * enabled |

const_debug unsigned int sysctl_sched_features =
#include "sched_features.h"
	0;

#undef SCHED_FEAT

#ifdef CONFIG_SCHED_DEBUG
#define SCHED_FEAT(name, enabled)	\
	#name ,

static __read_mostly char *sched_feat_names[] = {
#include "sched_features.h"
	NULL
};

#undef SCHED_FEAT

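/* Show every feature bit; disabled features are printed with a NO_ prefix. */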
static int sched_feat_show(struct seq_file *m, void *v)
{
	int i;

	for (i = 0; sched_feat_names[i]; i++) {
		if (!(sysctl_sched_features & (1UL << i)))
			seq_puts(m, "NO_");
		seq_printf(m, "%s ", sched_feat_names[i]);
	}
	seq_puts(m, "\n");

	return 0;
}

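/*
 * Enable a feature by writing its name to the sched_features debugfs file,
 * or disable it by writing the name prefixed with "NO_".
 */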
static ssize_t
sched_feat_write(struct file *filp, const char __user *ubuf,
		size_t cnt, loff_t *ppos)
{
	char buf[64];
	char *cmp = buf;
	int neg = 0;
	int i;

	if (cnt > 63)
		cnt = 63;

	if (copy_from_user(&buf, ubuf, cnt))
		return -EFAULT;

	buf[cnt] = 0;

	if (strncmp(buf, "NO_", 3) == 0) {
		neg = 1;
		cmp += 3;
	}

	for (i = 0; sched_feat_names[i]; i++) {
		int len = strlen(sched_feat_names[i]);

		if (strncmp(cmp, sched_feat_names[i], len) == 0) {
			if (neg)
				sysctl_sched_features &= ~(1UL << i);
			else
				sysctl_sched_features |= (1UL << i);
			break;
		}
	}

	if (!sched_feat_names[i])
		return -EINVAL;

	filp->f_pos += cnt;

	return cnt;
}

static int sched_feat_open(struct inode *inode, struct file *filp)
{
	return single_open(filp, sched_feat_show, NULL);
}

static const struct file_operations sched_feat_fops = {
	.open		= sched_feat_open,
	.write		= sched_feat_write,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};

static __init int sched_init_debug(void)
{
	debugfs_create_file("sched_features", 0644, NULL, NULL,
			&sched_feat_fops);

	return 0;
}
late_initcall(sched_init_debug);

#endif

#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
/*
 * Number of tasks to iterate in a single balance run.
 * Limited because this is done with IRQs disabled.
 */
const_debug unsigned int sysctl_sched_nr_migrate = 32;

/*
 * ratelimit for updating the group shares.
 * default: 0.25ms
 */
unsigned int sysctl_sched_shares_ratelimit = 250000;

/*
 * Inject some fuzzyness into changing the per-cpu group shares
 * this avoids remote rq-locks at the expense of fairness.
 * default: 4
 */
unsigned int sysctl_sched_shares_thresh = 4;

/*
 * period over which we average the RT time consumption, measured
 * in ms.
 *
 * default: 1s
 */
const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;

/*
 * period over which we measure -rt task cpu usage in us.
 * default: 1s
 */
unsigned int sysctl_sched_rt_period = 1000000;
static __read_mostly int scheduler_running;

/*
 * part of the period that we allow rt tasks to run in us.
 * default: 0.95s
 */
int sysctl_sched_rt_runtime = 950000;
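/* The rt_period/rt_runtime sysctls are in microseconds; convert to ns here. */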
static inline u64 global_rt_period(void)
{
	return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
}

static inline u64 global_rt_runtime(void)
{
	if (sysctl_sched_rt_runtime < 0)
		return RUNTIME_INF;

	return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
}
#ifndef prepare_arch_switch
# define prepare_arch_switch(next)	do { } while (0)
#endif
#ifndef finish_arch_switch
# define finish_arch_switch(prev)	do { } while (0)
#endif

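/* Is p the task currently running on this runqueue? */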
static inline int task_current(struct rq *rq, struct task_struct *p)
{
	return rq->curr == p;
}

#ifndef __ARCH_WANT_UNLOCKED_CTXSW
static inline int task_running(struct rq *rq, struct task_struct *p)
{
	return task_current(rq, p);
}

static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
}

static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_DEBUG_SPINLOCK
	/* this is a valid case when another task releases the spinlock */
	rq->lock.owner = current;
#endif
	/*
	 * If we are tracking spinlock dependencies then we have to
	 * fix up the runqueue lock - which gets 'carried over' from
	 * prev into current:
	 */
	spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);

	spin_unlock_irq(&rq->lock);
}

#else /* __ARCH_WANT_UNLOCKED_CTXSW */
static inline int task_running(struct rq *rq, struct task_struct *p)
{
#ifdef CONFIG_SMP
	return p->oncpu;
#else
	return task_current(rq, p);
#endif
}

static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
#ifdef CONFIG_SMP
	/*
	 * We can optimise this out completely for !SMP, because the
	 * SMP rebalancing from interrupt is the only thing that cares
	 * here.
	 */
	next->oncpu = 1;
#endif
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
	spin_unlock_irq(&rq->lock);
#else
	spin_unlock(&rq->lock);
#endif
}

static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_SMP
	/*
	 * After ->oncpu is cleared, the task can be moved to a different CPU.
	 * We must ensure this doesn't happen until the switch is completely
	 * finished.
	 */
	smp_wmb();
	prev->oncpu = 0;
#endif
#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
	local_irq_enable();
#endif
}
#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
/*
 * __task_rq_lock - lock the runqueue a given task resides on.
 * Must be called interrupts disabled.
 */
static inline struct rq *__task_rq_lock(struct task_struct *p)
	__acquires(rq->lock)
{
	for (;;) {
		struct rq *rq = task_rq(p);
		spin_lock(&rq->lock);
		if (likely(rq == task_rq(p)))
			return rq;
		spin_unlock(&rq->lock);
	}
}

/*
 * task_rq_lock - lock the runqueue a given task resides on and disable
 * interrupts. Note the ordering: we can safely lookup the task_rq without
 * explicitly disabling preemption.
 */
static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
	__acquires(rq->lock)
{
	struct rq *rq;

	for (;;) {
		local_irq_save(*flags);
		rq = task_rq(p);
		spin_lock(&rq->lock);
		if (likely(rq == task_rq(p)))
			return rq;
		spin_unlock_irqrestore(&rq->lock, *flags);
	}
}

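/* Wait until p's runqueue lock has been released, without taking it. */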
void task_rq_unlock_wait(struct task_struct *p)
{
	struct rq *rq = task_rq(p);

	smp_mb(); /* spin-unlock-wait is not a full memory barrier */
	spin_unlock_wait(&rq->lock);
}

static void __task_rq_unlock(struct rq *rq)
	__releases(rq->lock)
{
	spin_unlock(&rq->lock);
}

static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
	__releases(rq->lock)
{
	spin_unlock_irqrestore(&rq->lock, *flags);
}

/*
 * this_rq_lock - lock this runqueue and disable interrupts.
 */
static struct rq *this_rq_lock(void)
	__acquires(rq->lock)
{
	struct rq *rq;

	local_irq_disable();
	rq = this_rq();
	spin_lock(&rq->lock);

	return rq;
}

#ifdef CONFIG_SCHED_HRTICK
/*
 * Use HR-timers to deliver accurate preemption points.
 *
 * It's all a bit involved since we cannot program an hrtimer while holding the
 * rq->lock. So what we do is store a state in rq->hrtick_* and ask for a
 * reschedule event.
 *
 * When we get rescheduled we reprogram the hrtick_timer outside of the
 * rq->lock.
 */

/*
 * Use hrtick when:
 *  - enabled by features
 *  - hrtimer is actually high res
 */
static inline int hrtick_enabled(struct rq *rq)
{
	if (!sched_feat(HRTICK))
		return 0;
	if (!cpu_active(cpu_of(rq)))
		return 0;
	return hrtimer_is_hres_active(&rq->hrtick_timer);
}

static void hrtick_clear(struct rq *rq)
{
	if (hrtimer_active(&rq->hrtick_timer))
		hrtimer_cancel(&rq->hrtick_timer);
}

/*
 * High-resolution timer tick.
 * Runs from hardirq context with interrupts disabled.
 */
static enum hrtimer_restart hrtick(struct hrtimer *timer)
{
	struct rq *rq = container_of(timer, struct rq, hrtick_timer);

	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());

	spin_lock(&rq->lock);
	update_rq_clock(rq);
	rq->curr->sched_class->task_tick(rq, rq->curr, 1);
	spin_unlock(&rq->lock);

	return HRTIMER_NORESTART;
}

#ifdef CONFIG_SMP
/*
 * called from hardirq (IPI) context
 */
static void __hrtick_start(void *arg)
{
	struct rq *rq = arg;

	spin_lock(&rq->lock);
	hrtimer_restart(&rq->hrtick_timer);
	rq->hrtick_csd_pending = 0;
	spin_unlock(&rq->lock);
}

/*
 * Called to set the hrtick timer state.
 *
 * called with rq->lock held and irqs disabled
 */
static void hrtick_start(struct rq *rq, u64 delay)
{
	struct hrtimer *timer = &rq->hrtick_timer;
	ktime_t time = ktime_add_ns(timer->base->get_time(), delay);

	hrtimer_set_expires(timer, time);

	if (rq == this_rq()) {
		hrtimer_restart(timer);
	} else if (!rq->hrtick_csd_pending) {
		__smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
		rq->hrtick_csd_pending = 1;
	}
}

static int
hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
	int cpu = (int)(long)hcpu;

	switch (action) {
	case CPU_UP_CANCELED:
	case CPU_UP_CANCELED_FROZEN:
	case CPU_DOWN_PREPARE:
	case CPU_DOWN_PREPARE_FROZEN:
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		hrtick_clear(cpu_rq(cpu));
		return NOTIFY_OK;
	}

	return NOTIFY_DONE;
}

static __init void init_hrtick(void)
{
	hotcpu_notifier(hotplug_hrtick, 0);
}
#else
/*
 * Called to set the hrtick timer state.
 *
 * called with rq->lock held and irqs disabled
 */
static void hrtick_start(struct rq *rq, u64 delay)
{
	__hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
			HRTIMER_MODE_REL_PINNED, 0);
}

static inline void init_hrtick(void)
{
}
#endif /* CONFIG_SMP */

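/*
 * Set up the per-runqueue hrtick timer; on SMP also the call_single_data
 * used to (re)start it from a remote CPU via __hrtick_start().
 */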
static void init_rq_hrtick(struct rq *rq)
{
#ifdef CONFIG_SMP
	rq->hrtick_csd_pending = 0;
	rq->hrtick_csd.flags = 0;
	rq->hrtick_csd.func = __hrtick_start;
	rq->hrtick_csd.info = rq;
#endif
	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	rq->hrtick_timer.function = hrtick;
}
#else	/* CONFIG_SCHED_HRTICK */
static inline void hrtick_clear(struct rq *rq)
{
}

static inline void init_rq_hrtick(struct rq *rq)
{
}

static inline void init_hrtick(void)
{
}
#endif	/* CONFIG_SCHED_HRTICK */
