/*
 * Performance events core code:
 *
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
 *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 *
 * For licensing details see kernel-base/COPYING
 */

#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/cpu.h>
#include <linux/smp.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/sysfs.h>
#include <linux/dcache.h>
#include <linux/percpu.h>
#include <linux/ptrace.h>
#include <linux/vmstat.h>
#include <linux/vmalloc.h>
#include <linux/hardirq.h>
#include <linux/rculist.h>
#include <linux/uaccess.h>
#include <linux/syscalls.h>
#include <linux/anon_inodes.h>
#include <linux/kernel_stat.h>
#include <linux/perf_event.h>
#include <linux/ftrace_event.h>
#include <linux/hw_breakpoint.h>

#include <asm/irq_regs.h>

/*
 * Each CPU has a list of per CPU events:
 */
static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);

int perf_max_events __read_mostly = 1;
static int perf_reserved_percpu __read_mostly;
static int perf_overcommit __read_mostly = 1;

static atomic_t nr_events __read_mostly;
static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
static atomic_t nr_task_events __read_mostly;

/*
 * perf event paranoia level:
 *  -1 - not paranoid at all
 *   0 - disallow raw tracepoint access for unpriv
 *   1 - disallow cpu events for unpriv
 *   2 - disallow kernel profiling for unpriv
 */
int sysctl_perf_event_paranoid __read_mostly = 1;

static inline bool perf_paranoid_tracepoint_raw(void)
{
	return sysctl_perf_event_paranoid > -1;
}

static inline bool perf_paranoid_cpu(void)
{
	return sysctl_perf_event_paranoid > 0;
}

static inline bool perf_paranoid_kernel(void)
{
	return sysctl_perf_event_paranoid > 1;
}

int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */

/*
 * max perf event sample rate
 */
int sysctl_perf_event_sample_rate __read_mostly = 100000;

static atomic64_t perf_event_id;

/*
 * Lock for (sysadmin-configurable) event reservations:
 */
static DEFINE_SPINLOCK(perf_resource_lock);

/*
 * Architecture provided APIs - weak aliases:
 */
extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event)
{
	return NULL;
}

void __weak hw_perf_disable(void)		{ barrier(); }
void __weak hw_perf_enable(void)		{ barrier(); }

void __weak hw_perf_event_setup(int cpu)	{ barrier(); }
void __weak hw_perf_event_setup_online(int cpu)	{ barrier(); }

int __weak
hw_perf_group_sched_in(struct perf_event *group_leader,
	       struct perf_cpu_context *cpuctx,
	       struct perf_event_context *ctx, int cpu)
{
	return 0;
}

void __weak perf_event_print_debug(void)	{ }

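/*
 * perf_disable()/perf_enable() calls may nest; a per-cpu count tracks the
 * nesting depth and the PMU is only re-enabled once the outermost
 * perf_enable() brings the count back to zero.
 */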
static DEFINE_PER_CPU(int, perf_disable_count);

void __perf_disable(void)
{
	__get_cpu_var(perf_disable_count)++;
}

bool __perf_enable(void)
{
	return !--__get_cpu_var(perf_disable_count);
}

void perf_disable(void)
{
	__perf_disable();
	hw_perf_disable();
}

void perf_enable(void)
{
	if (__perf_enable())
		hw_perf_enable();
}

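/*
 * Contexts are reference counted; once the last reference is dropped the
 * context is freed via RCU, so lock-free lookups under rcu_read_lock()
 * remain safe.
 */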
static void get_ctx(struct perf_event_context *ctx)
{
	WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
}

static void free_ctx(struct rcu_head *head)
{
	struct perf_event_context *ctx;

	ctx = container_of(head, struct perf_event_context, rcu_head);
	kfree(ctx);
}

static void put_ctx(struct perf_event_context *ctx)
{
	if (atomic_dec_and_test(&ctx->refcount)) {
		if (ctx->parent_ctx)
			put_ctx(ctx->parent_ctx);
		if (ctx->task)
			put_task_struct(ctx->task);
		call_rcu(&ctx->rcu_head, free_ctx);
	}
}

static void unclone_ctx(struct perf_event_context *ctx)
{
	if (ctx->parent_ctx) {
		put_ctx(ctx->parent_ctx);
		ctx->parent_ctx = NULL;
	}
}

/*
 * If we inherit events we want to return the parent event id
 * to userspace.
 */
static u64 primary_event_id(struct perf_event *event)
{
	u64 id = event->id;

	if (event->parent)
		id = event->parent->id;

	return id;
}

/*
 * Get the perf_event_context for a task and lock it.
 * This has to cope with the fact that until it is locked,
 * the context could get moved to another task.
 */
static struct perf_event_context *
perf_lock_task_context(struct task_struct *task, unsigned long *flags)
{
	struct perf_event_context *ctx;

	rcu_read_lock();
 retry:
	ctx = rcu_dereference(task->perf_event_ctxp);
	if (ctx) {
		/*
		 * If this context is a clone of another, it might
		 * get swapped for another underneath us by
		 * perf_event_task_sched_out, though the
		 * rcu_read_lock() protects us from any context
		 * getting freed.  Lock the context and check if it
		 * got swapped before we could get the lock, and retry
		 * if so.  If we locked the right context, then it
		 * can't get swapped on us any more.
		 */
		raw_spin_lock_irqsave(&ctx->lock, *flags);
		if (ctx != rcu_dereference(task->perf_event_ctxp)) {
			raw_spin_unlock_irqrestore(&ctx->lock, *flags);
			goto retry;
		}

		if (!atomic_inc_not_zero(&ctx->refcount)) {
			raw_spin_unlock_irqrestore(&ctx->lock, *flags);
			ctx = NULL;
		}
	}
	rcu_read_unlock();
	return ctx;
}

/*
 * Get the context for a task and increment its pin_count so it
 * can't get swapped to another task.  This also increments its
 * reference count so that the context can't get freed.
 */
static struct perf_event_context *perf_pin_task_context(struct task_struct *task)
{
	struct perf_event_context *ctx;
	unsigned long flags;

	ctx = perf_lock_task_context(task, &flags);
	if (ctx) {
		++ctx->pin_count;
		raw_spin_unlock_irqrestore(&ctx->lock, flags);
	}
	return ctx;
}

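/*
 * Drop the pin and the reference taken by perf_pin_task_context().
 */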
static void perf_unpin_context(struct perf_event_context *ctx)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&ctx->lock, flags);
	--ctx->pin_count;
	raw_spin_unlock_irqrestore(&ctx->lock, flags);
	put_ctx(ctx);
}

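/*
 * perf_clock() provides the time base used for the context and event
 * timestamps maintained below.
 */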
static inline u64 perf_clock(void)
{
	return cpu_clock(smp_processor_id());
}

/*
 * Update the record of the current time in a context.
 */
static void update_context_time(struct perf_event_context *ctx)
{
	u64 now = perf_clock();

	ctx->time += now - ctx->timestamp;
	ctx->timestamp = now;
}

/*
 * Update the total_time_enabled and total_time_running fields for an event.
 */
static void update_event_times(struct perf_event *event)
{
	struct perf_event_context *ctx = event->ctx;
	u64 run_end;

	if (event->state < PERF_EVENT_STATE_INACTIVE ||
	    event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
		return;

	if (ctx->is_active)
		run_end = ctx->time;
	else
		run_end = event->tstamp_stopped;

	event->total_time_enabled = run_end - event->tstamp_enabled;

	if (event->state == PERF_EVENT_STATE_INACTIVE)
		run_end = event->tstamp_stopped;
	else
		run_end = ctx->time;

	event->total_time_running = run_end - event->tstamp_running;
}

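/*
 * Pick the group list (pinned or flexible) that an event belongs on
 * within its context.
 */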
static struct list_head *
ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
{
	if (event->attr.pinned)
		return &ctx->pinned_groups;
	else
		return &ctx->flexible_groups;
}

/*
 * Add an event to the lists for its context.
 * Must be called with ctx->mutex and ctx->lock held.
 */
static void
list_add_event(struct perf_event *event, struct perf_event_context *ctx)
{
	struct perf_event *group_leader = event->group_leader;

	/*
	 * Depending on whether it is a standalone or sibling event,
	 * add it straight to the context's event list, or to the group
	 * leader's sibling list:
	 */
	if (group_leader == event) {
		struct list_head *list;

		if (is_software_event(event))
			event->group_flags |= PERF_GROUP_SOFTWARE;

		list = ctx_group_list(event, ctx);
		list_add_tail(&event->group_entry, list);
	} else {
		if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
		    !is_software_event(event))
			group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;

		list_add_tail(&event->group_entry, &group_leader->sibling_list);
		group_leader->nr_siblings++;
	}

	list_add_rcu(&event->event_entry, &ctx->event_list);
	ctx->nr_events++;
	if (event->attr.inherit_stat)
		ctx->nr_stat++;
}

/*
 * Remove an event from the lists for its context.
 * Must be called with ctx->mutex and ctx->lock held.
 */
static void
list_del_event(struct perf_event *event, struct perf_event_context *ctx)
{
	struct perf_event *sibling, *tmp;

	if (list_empty(&event->group_entry))
		return;
	ctx->nr_events--;
	if (event->attr.inherit_stat)
		ctx->nr_stat--;

	list_del_init(&event->group_entry);
	list_del_rcu(&event->event_entry);

	if (event->group_leader != event)
		event->group_leader->nr_siblings--;

	update_event_times(event);

	/*
	 * If event was in error state, then keep it
	 * that way, otherwise bogus counts will be
	 * returned on read(). The only way to get out
	 * of error state is by explicit re-enabling
	 * of the event
	 */
	if (event->state > PERF_EVENT_STATE_OFF)
		event->state = PERF_EVENT_STATE_OFF;

	/*
	 * If this was a group event with sibling events then
	 * upgrade the siblings to singleton events by adding them
	 * to the context list directly:
	 */
	list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
		struct list_head *list;

		list = ctx_group_list(event, ctx);
		list_move_tail(&sibling->group_entry, list);
		sibling->group_leader = sibling;

		/* Inherit group flags from the previous leader */
		sibling->group_flags = event->group_flags;
	}
}

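/*
 * Take an active event off the PMU: mark it inactive (or off if a disable
 * was pending), record its stop time and update the context's count of
 * active events.
 */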
static void
event_sched_out(struct perf_event *event,
		  struct perf_cpu_context *cpuctx,
		  struct perf_event_context *ctx)
{
	if (event->state != PERF_EVENT_STATE_ACTIVE)
		return;

	event->state = PERF_EVENT_STATE_INACTIVE;
	if (event->pending_disable) {
		event->pending_disable = 0;
		event->state = PERF_EVENT_STATE_OFF;
	}
	event->tstamp_stopped = ctx->time;
	event->pmu->disable(event);
	event->oncpu = -1;

	if (!is_software_event(event))
		cpuctx->active_oncpu--;
	ctx->nr_active--;
	if (event->attr.exclusive || !cpuctx->active_oncpu)
		cpuctx->exclusive = 0;
}

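/*
 * Deschedule a whole group: the leader and all of its siblings come off
 * the PMU together.
 */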
static void
group_sched_out(struct perf_event *group_event,
		struct perf_cpu_context *cpuctx,
		struct perf_event_context *ctx)
{
	struct perf_event *event;

	if (group_event->state != PERF_EVENT_STATE_ACTIVE)
		return;

	event_sched_out(group_event, cpuctx, ctx);

	/*
	 * Schedule out siblings (if any):
	 */
	list_for_each_entry(event, &group_event->sibling_list, group_entry)
		event_sched_out(event, cpuctx, ctx);

	if (group_event->attr.exclusive)
		cpuctx->exclusive = 0;
}

/*
 * Cross CPU call to remove a performance event
 *
 * We disable the event on the hardware level first. After that we
 * remove it from the context list.
 */
static void __perf_event_remove_from_context(void *info)
{
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_event *event = info;
	struct perf_event_context *ctx = event->ctx;

	/*
	 * If this is a task context, we need to check whether it is
	 * the current task context of this cpu. If not it has been
	 * scheduled out before the smp call arrived.
	 */
	if (ctx->task && cpuctx->task_ctx != ctx)
		return;

	raw_spin_lock(&ctx->lock);
	/*
	 * Protect the list operation against NMI by disabling the
	 * events on a global level.
	 */
	perf_disable();

	event_sched_out(event, cpuctx, ctx);

	list_del_event(event, ctx);

	if (!ctx->task) {
		/*
		 * Allow more per task events with respect to the
		 * reservation:
		 */
		cpuctx->max_pertask =
			min(perf_max_events - ctx->nr_events,
			    perf_max_events - perf_reserved_percpu);
	}

	perf_enable();
	raw_spin_unlock(&ctx->lock);
}


/*
 * Remove the event from a task's (or a CPU's) list of events.
 *
 * Must be called with ctx->mutex held.
 *
 * CPU events are removed with a smp call. For task events we only
 * call when the task is on a CPU.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid.  This is OK when called from perf_release since
 * that only calls us on the top-level context, which can't be a clone.
 * When called from perf_event_exit_task, it's OK because the
 * context has been detached from its task.
 */
static void perf_event_remove_from_context(struct perf_event *event)
{
	struct perf_event_context *ctx = event->ctx;
	struct task_struct *task = ctx->task;

	if (!task) {
		/*
		 * Per cpu events are removed via an smp call and
		 * the removal is always successful.
		 */
		smp_call_function_single(event->cpu,
					 __perf_event_remove_from_context,
					 event, 1);
		return;
	}

retry:
	task_oncpu_function_call(task, __perf_event_remove_from_context,
				 event);

	raw_spin_lock_irq(&ctx->lock);
	/*
	 * If the context is active we need to retry the smp call.
	 */
	if (ctx->nr_active && !list_empty(&event->group_entry)) {
		raw_spin_unlock_irq(&ctx->lock);
		goto retry;
	}

	/*
	 * The lock prevents this context from being scheduled in, so we
	 * can remove the event safely if the call above did not succeed.
	 */
	if (!list_empty(&event->group_entry))
		list_del_event(event, ctx);
	raw_spin_unlock_irq(&ctx->lock);
}

/*
 * Update total_time_enabled and total_time_running for all events in a group.
 */
static void update_group_times(struct perf_event *leader)
{
	struct perf_event *event;

	update_event_times(leader);
	list_for_each_entry(event, &leader->sibling_list, group_entry)
		update_event_times(event);
}

/*
 * Cross CPU call to disable a performance event
 */
static void __perf_event_disable(void *info)
{
	struct perf_event *event = info;
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_event_context *ctx = event->ctx;

	/*
	 * If this is a per-task event, need to check whether this
	 * event's task is the current task on this cpu.
	 */
	if (ctx->task && cpuctx->task_ctx != ctx)
		return;

	raw_spin_lock(&ctx->lock);

	/*
	 * If the event is on, turn it off.
	 * If it is in error state, leave it in error state.
	 */
	if (event->state >= PERF_EVENT_STATE_INACTIVE) {
		update_context_time(ctx);
		update_group_times(event);
		if (event == event->group_leader)
			group_sched_out(event, cpuctx, ctx);
		else
			event_sched_out(event, cpuctx, ctx);
		event->state = PERF_EVENT_STATE_OFF;
	}

	raw_spin_unlock(&ctx->lock);
}

/*
 * Disable an event.
 *
 * If event->ctx is a cloned context, callers must make sure that
 * every task struct that event->ctx->task could possibly point to
 * remains valid.  This condition is satisfied when called through
 * perf_event_for_each_child or perf_event_for_each because they
 * hold the top-level event's child_mutex, so any descendant that
 * goes to exit will block in sync_child_event.
 * When called from perf_pending_event it's OK because event->ctx
 * is the current context on this CPU and preemption is disabled,
 * hence we can't get into perf_event_task_sched_out for this context.
 */
void perf_event_disable(struct perf_event *event)
{
	struct perf_event_context *ctx = event->ctx;
	struct task_struct *task = ctx->task;

	if (!task) {
		/*
		 * Disable the event on the cpu that it's on
		 */
		smp_call_function_single(event->cpu, __perf_event_disable,
					 event, 1);
		return;
	}

 retry:
	task_oncpu_function_call(task, __perf_event_disable, event);

	raw_spin_lock_irq(&ctx->lock);
	/*
	 * If the event is still active, we need to retry the cross-call.
	 */
	if (event->state == PERF_EVENT_STATE_ACTIVE) {
		raw_spin_unlock_irq(&ctx->lock);
		goto retry;
	}

	/*
	 * Since we have the lock this context can't be scheduled
	 * in, so we can change the state safely.
	 */
	if (event->state == PERF_EVENT_STATE_INACTIVE) {
		update_group_times(event);
		event->state = PERF_EVENT_STATE_OFF;
	}

	raw_spin_unlock_irq(&ctx->lock);
}

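/*
 * Put a single event on the PMU: mark it active, enable it in hardware
 * and update the context's bookkeeping.  Returns -EAGAIN if the PMU
 * cannot accommodate it.
 */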
static int
event_sched_in(struct perf_event *event,
		 struct perf_cpu_context *cpuctx,
		 struct perf_event_context *ctx,
		 int cpu)
{
	if (event->state <= PERF_EVENT_STATE_OFF)
		return 0;

	event->state = PERF_EVENT_STATE_ACTIVE;
	event->oncpu = cpu;	/* TODO: put 'cpu' into cpuctx->cpu */
	/*
	 * The new state must be visible before we turn it on in the hardware:
	 */
	smp_wmb();

	if (event->pmu->enable(event)) {
		event->state = PERF_EVENT_STATE_INACTIVE;
		event->oncpu = -1;
		return -EAGAIN;
	}

	event->tstamp_running += ctx->time - event->tstamp_stopped;

	if (!is_software_event(event))
		cpuctx->active_oncpu++;
	ctx->nr_active++;

	if (event->attr.exclusive)
		cpuctx->exclusive = 1;

	return 0;
}

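/*
 * Schedule a group onto the PMU as one unit: either the leader and all
 * of its siblings go on, or the partially scheduled group is rolled back
 * and -EAGAIN is returned.
 */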
static int
group_sched_in(struct perf_event *group_event,
	       struct perf_cpu_context *cpuctx,
	       struct perf_event_context *ctx,
	       int cpu)
{
	struct perf_event *event, *partial_group;
	int ret;

	if (group_event->state == PERF_EVENT_STATE_OFF)
		return 0;

	ret = hw_perf_group_sched_in(group_event, cpuctx, ctx, cpu);
	if (ret)
		return ret < 0 ? ret : 0;

	if (event_sched_in(group_event, cpuctx, ctx, cpu))
		return -EAGAIN;

	/*
	 * Schedule in siblings as one group (if any):
	 */
	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
		if (event_sched_in(event, cpuctx, ctx, cpu)) {
			partial_group = event;
			goto group_error;
		}
	}

	return 0;

group_error:
	/*
	 * Groups can be scheduled in as one unit only, so undo any
	 * partial group before returning:
	 */
	list_for_each_entry(event, &group_event->sibling_list, group_entry) {
		if (event == partial_group)
			break;
		event_sched_out(event, cpuctx, ctx);
	}
	event_sched_out(group_event, cpuctx, ctx);

	return -EAGAIN;
}

/*
 * Work out whether we can put this event group on the CPU now.
 */
static int group_can_go_on(struct perf_event *event,
			   struct perf_cpu_context *cpuctx,
			   int can_add_hw)
{
	/*
	 * Groups consisting entirely of software events can always go on.
	 */
	if (event->group_flags & PERF_GROUP_SOFTWARE)
		return 1;
	/*
	 * If an exclusive group is already on, no other hardware
	 * events can go on.
	 */
	if (cpuctx->exclusive)
		return 0;
	/*
	 * If this group is exclusive and there are already
	 * events on the CPU, it can't go on.
	 */
	if (event->attr.exclusive && cpuctx->active_oncpu)
		return 0;
	/*
	 * Otherwise, try to add it if all previous groups were able
	 * to go on.
	 */
	return can_add_hw;
}

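/*
 * Add an event to its context and initialise its timestamps from the
 * context's current time.
 */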
static void add_event_to_ctx(struct perf_event *event,
			       struct perf_event_context *ctx)
{
	list_add_event(event, ctx);
	event->tstamp_enabled = ctx->time;
	event->tstamp_running = ctx->time;
	event->tstamp_stopped = ctx->time;
}

/*
 * Cross CPU call to install and enable a performance event
 *
 * Must be called with ctx->mutex held
 */
static void __perf_install_in_context(void *info)
{
	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
	struct perf_event *event = info;
	struct perf_event_context *ctx = event->ctx;
	struct perf_event *leader = event->group_leader;
	int cpu = smp_processor_id();
	int err;

	/*
	 * If this is a task context, we need to check whether it is
	 * the current task context of this cpu. If not it has been
	 * scheduled out before the smp call arrived.
	 * Or possibly this is the right context but it isn't
	 * on this cpu because it had no events.
	 */
	if (ctx->task && cpuctx->task_ctx != ctx) {
		if (cpuctx->task_ctx || ctx->task != current)
			return;
		cpuctx->task_ctx = ctx;
	}

	raw_spin_lock(&ctx->lock);
	ctx->is_active = 1;
	update_context_time(ctx);

	/*
	 * Protect the list operation against NMI by disabling the
	 * events on a global level. NOP for non NMI based events.
	 */
	perf_disable();

	add_event_to_ctx(event, ctx);

	if (event->cpu != -1 && event->cpu != smp_processor_id())
		goto unlock;

	/*
	 * Don't put the event on if it is disabled or if
	 * it is in a group and the group isn't on.
	 */
	if (event->state != PERF_EVENT_STATE_INACTIVE ||
	    (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE))
		goto unlock;

	/*
	 * An exclusive event can't go on if there are already active
	 * hardware events, and no hardware event can go on if there
	 * is already an exclusive event on.
	 */
	if (!group_can_go_on(event, cpuctx, 1))
		err = -EEXIST;
	else
		err = event_sched_in(event, cpuctx, ctx, cpu);

	if (err) {
		/*
		 * This event couldn't go on.  If it is in a group
		 * then we have to pull the whole group off.
		 * If the event group is pinned then put it in error state.
		 */
		if (leader != event