/*
 * Performance events x86 architecture code
 *
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2009 Jaswinder Singh Rajput
 *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
 *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra
 *  Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
 *  Copyright (C) 2009 Google, Inc., Stephane Eranian
 *
 *  For licencing details see kernel-base/COPYING
 */

#include <linux/perf_event.h>
#include <linux/capability.h>
#include <linux/notifier.h>
#include <linux/hardirq.h>
#include <linux/kprobes.h>
#include <linux/export.h>
#include <linux/init.h>
#include <linux/kdebug.h>
#include <linux/sched.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/bitops.h>
#include <linux/device.h>

#include <asm/apic.h>
#include <asm/stacktrace.h>
#include <asm/nmi.h>
#include <asm/smp.h>
#include <asm/alternative.h>
#include <asm/mmu_context.h>
#include <asm/tlbflush.h>
#include <asm/timer.h>
#include <asm/desc.h>
#include <asm/ldt.h>
#include <asm/unwind.h>

#include "perf_event.h"

struct x86_pmu x86_pmu __read_mostly;

DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
	.enabled = 1,
};

struct static_key rdpmc_always_available = STATIC_KEY_INIT_FALSE;

u64 __read_mostly hw_cache_event_ids
				[PERF_COUNT_HW_CACHE_MAX]
				[PERF_COUNT_HW_CACHE_OP_MAX]
				[PERF_COUNT_HW_CACHE_RESULT_MAX];
u64 __read_mostly hw_cache_extra_regs
				[PERF_COUNT_HW_CACHE_MAX]
				[PERF_COUNT_HW_CACHE_OP_MAX]
				[PERF_COUNT_HW_CACHE_RESULT_MAX];

/*
 * Propagate event elapsed time into the generic event.
 * Can only be executed on the CPU where the event is active.
 * Returns the delta events processed.
 */
u64 x86_perf_event_update(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	int shift = 64 - x86_pmu.cntval_bits;
	u64 prev_raw_count, new_raw_count;
	int idx = hwc->idx;
	u64 delta;

	if (idx == INTEL_PMC_IDX_FIXED_BTS)
		return 0;

	/*
	 * Careful: an NMI might modify the previous event value.
	 *
	 * Our tactic to handle this is to first atomically read and
	 * exchange a new raw count - then add that new-prev delta
	 * count to the generic event atomically:
	 */
again:
	prev_raw_count = local64_read(&hwc->prev_count);
	rdpmcl(hwc->event_base_rdpmc, new_raw_count);

	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
					new_raw_count) != prev_raw_count)
		goto again;

	/*
	 * Now we have the new raw value and have updated the prev
	 * timestamp already. We can now calculate the elapsed delta
	 * (event-)time and add that to the generic event.
	 *
	 * Careful, not all hw sign-extends above the physical width
	 * of the count.
	 */
	delta = (new_raw_count << shift) - (prev_raw_count << shift);
	delta >>= shift;

	local64_add(delta, &event->count);
	local64_sub(delta, &hwc->period_left);

	return new_raw_count;
}

/*
 * Find and validate any extra registers to set up.
 */
static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
{
	struct hw_perf_event_extra *reg;
	struct extra_reg *er;

	reg = &event->hw.extra_reg;

	if (!x86_pmu.extra_regs)
		return 0;

	for (er = x86_pmu.extra_regs; er->msr; er++) {
		if (er->event != (config & er->config_mask))
			continue;
		if (event->attr.config1 & ~er->valid_mask)
			return -EINVAL;
		/* Check if the extra MSRs can be safely accessed */
		if (!er->extra_msr_access)
			return -ENXIO;

		reg->idx = er->idx;
		reg->config = event->attr.config1;
		reg->reg = er->msr;
		break;
	}
	return 0;
}

static atomic_t active_events;
static atomic_t pmc_refcount;
static DEFINE_MUTEX(pmc_reserve_mutex);

#ifdef CONFIG_X86_LOCAL_APIC

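/*
 * Reserve the performance counter and event select MSRs of all counters,
 * backing out of any partial reservation on failure.
 */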
static bool reserve_pmc_hardware(void)
{
	int i;

	for (i = 0; i < x86_pmu.num_counters; i++) {
		if (!reserve_perfctr_nmi(x86_pmu_event_addr(i)))
			goto perfctr_fail;
	}

	for (i = 0; i < x86_pmu.num_counters; i++) {
		if (!reserve_evntsel_nmi(x86_pmu_config_addr(i)))
			goto eventsel_fail;
	}

	return true;

eventsel_fail:
	for (i--; i >= 0; i--)
		release_evntsel_nmi(x86_pmu_config_addr(i));

	i = x86_pmu.num_counters;

perfctr_fail:
	for (i--; i >= 0; i--)
		release_perfctr_nmi(x86_pmu_event_addr(i));

	return false;
}

static void release_pmc_hardware(void)
{
	int i;

	for (i = 0; i < x86_pmu.num_counters; i++) {
		release_perfctr_nmi(x86_pmu_event_addr(i));
		release_evntsel_nmi(x86_pmu_config_addr(i));
	}
}

#else

static bool reserve_pmc_hardware(void) { return true; }
static void release_pmc_hardware(void) {}

#endif

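/*
 * Check that the PMU is really there: complain if the BIOS left counters
 * enabled, and verify that a counter MSR can be written and read back
 * (hardware emulators such as qemu/kvm may silently ignore the writes).
 */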
static bool check_hw_exists(void)
{
	u64 val, val_fail, val_new = ~0;
	int i, reg, reg_fail, ret = 0;
	int bios_fail = 0;
	int reg_safe = -1;

	/*
	 * Check to see if the BIOS enabled any of the counters, if so
	 * complain and bail.
	 */
	for (i = 0; i < x86_pmu.num_counters; i++) {
		reg = x86_pmu_config_addr(i);
		ret = rdmsrl_safe(reg, &val);
		if (ret)
			goto msr_fail;
		if (val & ARCH_PERFMON_EVENTSEL_ENABLE) {
			bios_fail = 1;
			val_fail = val;
			reg_fail = reg;
		} else {
			reg_safe = i;
		}
	}

	if (x86_pmu.num_counters_fixed) {
		reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
		ret = rdmsrl_safe(reg, &val);
		if (ret)
			goto msr_fail;
		for (i = 0; i < x86_pmu.num_counters_fixed; i++) {
			if (val & (0x03 << i*4)) {
				bios_fail = 1;
				val_fail = val;
				reg_fail = reg;
			}
		}
	}

	/*
	 * If all the counters are enabled, the below test will always
	 * fail.  The tools will also become useless in this scenario.
	 * Just fail and disable the hardware counters.
	 */

	if (reg_safe == -1) {
		reg = reg_safe;
		goto msr_fail;
	}

	/*
	 * Read the current value, change it and read it back to see if it
	 * matches; this is needed to detect certain hardware emulators
	 * (qemu/kvm) that don't trap on the MSR access and always return 0s.
	 */
	reg = x86_pmu_event_addr(reg_safe);
	if (rdmsrl_safe(reg, &val))
		goto msr_fail;
	val ^= 0xffffUL;
	ret = wrmsrl_safe(reg, val);
	ret |= rdmsrl_safe(reg, &val_new);
	if (ret || val != val_new)
		goto msr_fail;

	/*
	 * We still allow the PMU driver to operate:
	 */
	if (bios_fail) {
		pr_cont("Broken BIOS detected, complain to your hardware vendor.\n");
		pr_err(FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n",
			      reg_fail, val_fail);
	}

	return true;

msr_fail:
	if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
		pr_cont("PMU not available due to virtualization, using software events only.\n");
	} else {
		pr_cont("Broken PMU hardware detected, using software events only.\n");
		pr_err("Failed to access perfctr msr (MSR %x is %Lx)\n",
		       reg, val_new);
	}

	return false;
}

static void hw_perf_event_destroy(struct perf_event *event)
{
	x86_release_hardware();
	atomic_dec(&active_events);
}

void hw_perf_lbr_event_destroy(struct perf_event *event)
{
	hw_perf_event_destroy(event);

	/* undo the lbr/bts event accounting */
	x86_del_exclusive(x86_lbr_exclusive_lbr);
}

static inline int x86_pmu_initialized(void)
{
	return x86_pmu.handle_irq != NULL;
}

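/*
 * Translate a PERF_TYPE_HW_CACHE config (cache type, op, result) into the
 * model-specific encoding from hw_cache_event_ids/hw_cache_extra_regs.
 */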
static inline int
set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
{
	struct perf_event_attr *attr = &event->attr;
	unsigned int cache_type, cache_op, cache_result;
	u64 config, val;

	config = attr->config;

	cache_type = (config >>  0) & 0xff;
	if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
		return -EINVAL;

	cache_op = (config >>  8) & 0xff;
	if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
		return -EINVAL;

	cache_result = (config >> 16) & 0xff;
	if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
		return -EINVAL;

	val = hw_cache_event_ids[cache_type][cache_op][cache_result];

	if (val == 0)
		return -ENOENT;

	if (val == -1)
		return -EINVAL;

	hwc->config |= val;
	attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result];
	return x86_pmu_extra_regs(val, event);
}

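/*
 * Refcounted reservation of the PMC hardware and the DS buffers: the first
 * user reserves them, the last user releases them again.
 */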
int x86_reserve_hardware(void)
{
	int err = 0;

	if (!atomic_inc_not_zero(&pmc_refcount)) {
		mutex_lock(&pmc_reserve_mutex);
		if (atomic_read(&pmc_refcount) == 0) {
			if (!reserve_pmc_hardware())
				err = -EBUSY;
			else
				reserve_ds_buffers();
		}
		if (!err)
			atomic_inc(&pmc_refcount);
		mutex_unlock(&pmc_reserve_mutex);
	}

	return err;
}

void x86_release_hardware(void)
{
	if (atomic_dec_and_mutex_lock(&pmc_refcount, &pmc_reserve_mutex)) {
		release_pmc_hardware();
		release_ds_buffers();
		mutex_unlock(&pmc_reserve_mutex);
	}
}

/*
 * Check if we can create an event of a certain type (i.e. that no
 * conflicting events are present).
 */
int x86_add_exclusive(unsigned int what)
{
	int i;

	/*
	 * When lbr_pt_coexist is set, we allow PT to coexist with either LBR
	 * or BTS. LBR and BTS are still mutually exclusive.
	 */
	if (x86_pmu.lbr_pt_coexist && what == x86_lbr_exclusive_pt)
		return 0;

	if (!atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what])) {
		mutex_lock(&pmc_reserve_mutex);
		for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) {
			if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i]))
				goto fail_unlock;
		}
		atomic_inc(&x86_pmu.lbr_exclusive[what]);
		mutex_unlock(&pmc_reserve_mutex);
	}

	atomic_inc(&active_events);
	return 0;

fail_unlock:
	mutex_unlock(&pmc_reserve_mutex);
	return -EBUSY;
}

void x86_del_exclusive(unsigned int what)
{
	if (x86_pmu.lbr_pt_coexist && what == x86_lbr_exclusive_pt)
		return;

	atomic_dec(&x86_pmu.lbr_exclusive[what]);
	atomic_dec(&active_events);
}

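/*
 * Set up the sample period and translate the generic, cache or raw event
 * config into the hardware event selector value.
 */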
int x86_setup_perfctr(struct perf_event *event)
{
	struct perf_event_attr *attr = &event->attr;
	struct hw_perf_event *hwc = &event->hw;
	u64 config;

	if (!is_sampling_event(event)) {
		hwc->sample_period = x86_pmu.max_period;
		hwc->last_period = hwc->sample_period;
		local64_set(&hwc->period_left, hwc->sample_period);
	}

	if (attr->type == PERF_TYPE_RAW)
		return x86_pmu_extra_regs(event->attr.config, event);

	if (attr->type == PERF_TYPE_HW_CACHE)
		return set_ext_hw_attr(hwc, event);

	if (attr->config >= x86_pmu.max_events)
		return -EINVAL;

	/*
	 * The generic map:
	 */
	config = x86_pmu.event_map(attr->config);

	if (config == 0)
		return -ENOENT;

	if (config == -1LL)
		return -EINVAL;

	/*
	 * Branch tracing:
	 */
	if (attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS &&
	    !attr->freq && hwc->sample_period == 1) {
		/* BTS is not supported by this architecture. */
		if (!x86_pmu.bts_active)
			return -EOPNOTSUPP;

		/* BTS is currently only allowed for user-mode. */
		if (!attr->exclude_kernel)
			return -EOPNOTSUPP;

		/* disallow bts if conflicting events are present */
		if (x86_add_exclusive(x86_lbr_exclusive_lbr))
			return -EBUSY;

		event->destroy = hw_perf_lbr_event_destroy;
	}

	hwc->config |= config;

	return 0;
}

/*
 * Check that branch_sample_type is compatible with the settings needed
 * for precise_ip > 1, which implies using the LBR to capture ALL taken
 * branches at the priv levels of the measurement.
 */
static inline int precise_br_compat(struct perf_event *event)
{
	u64 m = event->attr.branch_sample_type;
	u64 b = 0;

	/* must capture all branches */
	if (!(m & PERF_SAMPLE_BRANCH_ANY))
		return 0;

	m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER;

	if (!event->attr.exclude_user)
		b |= PERF_SAMPLE_BRANCH_USER;

	if (!event->attr.exclude_kernel)
		b |= PERF_SAMPLE_BRANCH_KERNEL;

	/*
	 * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86
	 */

	return m == b;
}

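/*
 * Validate the precise_ip and branch sampling settings and build the base
 * hardware config (interrupt, privilege level and raw event bits).
 */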
int x86_pmu_hw_config(struct perf_event *event)
{
	if (event->attr.precise_ip) {
		int precise = 0;

		/* Support for constant skid */
		if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) {
			precise++;

			/* Support for IP fixup */
			if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2)
				precise++;

			if (x86_pmu.pebs_prec_dist)
				precise++;
		}

		if (event->attr.precise_ip > precise)
			return -EOPNOTSUPP;
	}
	/*
	 * check that PEBS LBR correction does not conflict with
	 * whatever the user is asking with attr->branch_sample_type
	 */
	if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) {
		u64 *br_type = &event->attr.branch_sample_type;

		if (has_branch_stack(event)) {
			if (!precise_br_compat(event))
				return -EOPNOTSUPP;

			/* branch_sample_type is compatible */

		} else {
			/*
			 * user did not specify branch_sample_type
			 *
			 * For PEBS fixups, we capture all
			 * the branches at the priv level of the
			 * event.
			 */
			*br_type = PERF_SAMPLE_BRANCH_ANY;

			if (!event->attr.exclude_user)
				*br_type |= PERF_SAMPLE_BRANCH_USER;

			if (!event->attr.exclude_kernel)
				*br_type |= PERF_SAMPLE_BRANCH_KERNEL;
		}
	}

	if (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK)
		event->attach_state |= PERF_ATTACH_TASK_DATA;

	/*
	 * Generate PMC IRQs:
	 * (keep 'enabled' bit clear for now)
	 */
	event->hw.config = ARCH_PERFMON_EVENTSEL_INT;

	/*
	 * Count user and OS events unless requested not to
	 */
	if (!event->attr.exclude_user)
		event->hw.config |= ARCH_PERFMON_EVENTSEL_USR;
	if (!event->attr.exclude_kernel)
		event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;

	if (event->attr.type == PERF_TYPE_RAW)
		event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;

	if (event->attr.sample_period && x86_pmu.limit_period) {
		if (x86_pmu.limit_period(event, event->attr.sample_period) >
				event->attr.sample_period)
			return -EINVAL;
	}

	return x86_setup_perfctr(event);
}

/*
 * Set up the hardware configuration for a given attr_type
 */
static int __x86_pmu_event_init(struct perf_event *event)
{
	int err;

	if (!x86_pmu_initialized())
		return -ENODEV;

	err = x86_reserve_hardware();
	if (err)
		return err;

	atomic_inc(&active_events);
	event->destroy = hw_perf_event_destroy;

	event->hw.idx = -1;
	event->hw.last_cpu = -1;
	event->hw.last_tag = ~0ULL;

	/* mark unused */
	event->hw.extra_reg.idx = EXTRA_REG_NONE;
	event->hw.branch_reg.idx = EXTRA_REG_NONE;

	return x86_pmu.hw_config(event);
}

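/*
 * Clear the enable bit in the event select MSR of every active counter.
 */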
void x86_pmu_disable_all(void)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	int idx;

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		u64 val;

		if (!test_bit(idx, cpuc->active_mask))
			continue;
		rdmsrl(x86_pmu_config_addr(idx), val);
		if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
			continue;
		val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
		wrmsrl(x86_pmu_config_addr(idx), val);
	}
}

/*
 * A PMI may still land after enabled=0; it can hit either before or after
 * disable_all.
 *
 * If the PMI hits before disable_all, the PMU will be disabled in the NMI
 * handler. It will not be re-enabled there, because enabled=0. After handling
 * the NMI, disable_all will be called, which does not change the state
 * either. If the PMI hits after disable_all, the PMU is already disabled
 * before entering the NMI handler, and the NMI handler will not change the
 * state either.
 *
 * Either way, this is harmless.
 */
static void x86_pmu_disable(struct pmu *pmu)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);

	if (!x86_pmu_initialized())
		return;

	if (!cpuc->enabled)
		return;

	cpuc->n_added = 0;
	cpuc->enabled = 0;
	barrier();

	x86_pmu.disable_all();
}

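/*
 * Set the enable bit again in the event select MSR of every active counter.
 */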
void x86_pmu_enable_all(int added)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	int idx;

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		struct hw_perf_event *hwc = &cpuc->events[idx]->hw;

		if (!test_bit(idx, cpuc->active_mask))
			continue;

		__x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
	}
}

static struct pmu pmu;

static inline int is_x86_event(struct perf_event *event)
{
	return event->pmu == &pmu;
}

/*
 * Event scheduler state:
 *
 * Assign events iterating over all events and counters, beginning
 * with events with least weights first. Keep the current iterator
 * state in struct sched_state.
 */
struct sched_state {
	int	weight;
	int	event;		/* event index */
	int	counter;	/* counter index */
	int	unassigned;	/* number of events to be assigned left */
	int	nr_gp;		/* number of GP counters used */
	unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
};

/* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */
#define	SCHED_STATES_MAX	2

struct perf_sched {
	int			max_weight;
	int			max_events;
	int			max_gp;
	int			saved_states;
	struct event_constraint	**constraints;
	struct sched_state	state;
	struct sched_state	saved[SCHED_STATES_MAX];
};

/*
 * Initialize the iterator that runs through all events and counters.
 */
static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints,
			    int num, int wmin, int wmax, int gpmax)
{
	int idx;

	memset(sched, 0, sizeof(*sched));
	sched->max_events	= num;
	sched->max_weight	= wmax;
	sched->max_gp		= gpmax;
	sched->constraints	= constraints;

	for (idx = 0; idx < num; idx++) {
		if (constraints[idx]->weight == wmin)
			break;
	}

	sched->state.event	= idx;		/* start with min weight */
	sched->state.weight	= wmin;
	sched->state.unassigned	= num;
}

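/*
 * Save/restore the scheduler state so that the counter assignment can be
 * backtracked when an overlapping constraint leads into a dead end.
 */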
static void perf_sched_save_state(struct perf_sched *sched)
{
	if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX))
		return;

	sched->saved[sched->saved_states] = sched->state;
	sched->saved_states++;
}

static bool perf_sched_restore_state(struct perf_sched *sched)
{
	if (!sched->saved_states)
		return false;

	sched->saved_states--;
	sched->state = sched->saved[sched->saved_states];

	/* continue with next counter: */
	clear_bit(sched->state.counter++, sched->state.used);

	return true;
}

/*
 * Select a counter for the current event to schedule. Return true on
 * success.
 */
static bool __perf_sched_find_counter(struct perf_sched *sched)
{
	struct event_constraint *c;
	int idx;

	if (!sched->state.unassigned)
		return false;

	if (sched->state.event >= sched->max_events)
		return false;

	c = sched->constraints[sched->state.event];
	/* Prefer fixed purpose counters */
	if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) {
		idx = INTEL_PMC_IDX_FIXED;
		for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) {
			if (!__test_and_set_bit(idx, sched->state.used))
				goto done;
		}
	}

	/* Grab the first unused counter starting with idx */
	idx = sched->state.counter;
	for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) {
		if (!__test_and_set_bit(idx, sched->state.used)) {
			if (sched->state.nr_gp++ >= sched->max_gp)
				return false;

			goto done;
		}
	}

	return false;

done:
	sched->state.counter = idx;

	if (c->overlap)
		perf_sched_save_state(sched);

	return true;
}

static bool perf_sched_find_counter(struct perf_sched *sched)
{
	while (!__perf_sched_find_counter(sched)) {
		if (!perf_sched_restore_state(sched))
			return false;
	}

	return true;
}

/*
 * Go through all unassigned events and find the next one to schedule.
 * Take events with the least weight first. Return true on success.
 */
static bool perf_sched_next_event(struct perf_sched *sched)
{
	struct event_constraint *c;

	if (!sched->state.unassigned || !--sched->state.unassigned)
		return false;

	do {
		/* next event */
		sched->state.event++;
		if (sched->state.event >= sched->max_events) {
			/* next weight */
			sched->state.event = 0;
			sched->state.weight++;
			if (sched->state.weight > sched->max_weight)
				return false;
		}
		c = sched->constraints[sched->state.event];
	} while (c->weight != sched->state.weight);

	sched->state.counter = 0;	/* start with first counter */

	return true;
}

/*
 * Assign a counter for each event.
 */
int perf_assign_events(struct event_constraint **constraints, int n,
			int wmin, int wmax, int gpmax, int *assign)
{
	struct perf_sched sched;

	perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax);

	do {
		if (!perf_sched_find_counter(&sched))
			break;	/* failed */
		if (assign)
			assign[sched.state.event] = sched.state.counter;
	} while (perf_sched_next_event(&sched));

	return sched.state.unassigned;
}
EXPORT_SYMBOL_GPL(perf_assign_events);

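/*
 * Assign the collected events to hardware counters: gather constraints,
 * try the fastpath of reusing the previous assignment, and fall back to
 * the full constraint scheduler otherwise.
 */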
int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
{
	struct event_constraint *c;
	unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
	struct perf_event *e;
	int i, wmin, wmax, unsched = 0;
	struct hw_perf_event *hwc;

	bitmap_zero(used_mask, X86_PMC_IDX_MAX);

	if (x86_pmu.start_scheduling)
		x86_pmu.start_scheduling(cpuc);

	for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
		cpuc->event_constraint[i] = NULL;
		c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]);
		cpuc->event_constraint[i] = c;

		wmin = min(wmin, c->weight);
		wmax = max(wmax, c->weight);
	}

	/*
	 * fastpath, try to reuse previous register
	 */
	for (i = 0; i < n; i++) {
		hwc = &cpuc->event_list[i]->hw;
		c = cpuc->event_constraint[i];

		/* never assigned */
		if (hwc->idx == -1)
			break;

		/* constraint still honored */
		if (!test_bit(hwc->idx, c->idxmsk))
			break;

		/* not already used */
		if (test_bit(hwc->idx, used_mask))
			break;

		__set_bit(hwc->idx, used_mask);
		if (assign)
			assign[i] = hwc->idx;
	}

	/* slow path */
	if (i != n) {
		int gpmax = x86_pmu.num_counters;

		/*
		 * Do not allow scheduling of more than half the available
		 * generic counters.
		 *
		 * This helps avoid counter starvation of the sibling thread by
		 * ensuring at most half the counters cannot be in exclusive
		 * mode. There are no designated counters for the limits. Any
		 * N/2 counters can be used. This helps with events with
		 * specific counter constraints.
		 */
		if (is_ht_workaround_enabled() && !cpuc->is_fake &&
		    READ_ONCE(cpuc->excl_cntrs->exclusive_present))
			gpmax /= 2;

		unsched = perf_assign_events(cpuc->event_constraint, n, wmin,
					     wmax, gpmax, assign);
	}

	/*
	 * In case of success (unsched = 0), mark events as committed,
	 * so we do not put_constraint() in case new events are added
	 * and fail to be scheduled
	 *
	 * We invoke the lower level commit callback to lock the resource
	 *
	 * We do not need to do all of this in case we are called to
	 * validate an event group (assign == NULL)
	 */
	if (!unsched && assign) {
		for (i = 0; i < n; i++) {
			e = cpuc->event_list[i];
			e->hw.flags |= PERF_X86_EVENT_COMMITTED;
			if (x86_pmu.commit_scheduling)
				x86_pmu.commit_scheduling(cpuc, i, assign[i]);
		}
	} else {
		for (i = 0; i < n; i++) {
			e = cpuc->event_list[i];
			/*
			 * do not put_constraint() on committed events,
			 * because they are good to go
			 */
			if ((e->hw.flags & PERF_X86_EVENT_COMMITTED))
				continue;

			/*
			 * release events that failed scheduling
			 */
			if (x86_pmu.put_event_constraints)
				x86_pmu.put_event_constraints(cpuc, e);
		}
	}

	if (x86_pmu.stop_scheduling)
		x86_pmu.stop_scheduling(cpuc);

	return unsched ? -EINVAL : 0;
}

/*
 * dogrp: true if we must collect sibling events (group)
 * returns the total number of events, or a negative error code
 */
static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
{
	struct perf_event *event;
	int n, max_count;

	max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed;

	/* current number of events already accepted */
	n = cpuc->n_events;

	if (is_x86_event(leader)) {
		if (n >= max_count)
			return -EINVAL;
		cpuc->event_list[n] = leader;
		n++;
	}
	if (!dogrp)
		return n;

	list_for_each_entry(event, &leader->sibling_list, group_entry) {
		if (!is_x86_event(event) ||
		    event->state <= PERF_EVENT_STATE_OFF)
			continue;

		if (n >= max_count)
			return -EINVAL;

		cpuc->event_list[n] = event;
		n++;
	}
	return n;
}

static inline void x86_assign_hw_event(struct perf_event *event,
				struct cpu_hw_events *cpuc, int i)