/*
 * Performance events x86 architecture code
 *
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2009 Jaswinder Singh Rajput
 *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
 *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra
 *  Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
 *  Copyright (C) 2009 Google, Inc., Stephane Eranian
 *
 *  For licencing details see kernel-base/COPYING
 */

#include <linux/perf_event.h>
#include <linux/capability.h>
#include <linux/notifier.h>
#include <linux/hardirq.h>
#include <linux/kprobes.h>
#include <linux/module.h>
#include <linux/kdebug.h>
#include <linux/sched.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/bitops.h>
#include <linux/device.h>

#include <asm/apic.h>
#include <asm/stacktrace.h>
#include <asm/nmi.h>
#include <asm/smp.h>
#include <asm/alternative.h>
#include <asm/mmu_context.h>
#include <asm/tlbflush.h>
#include <asm/timer.h>
#include <asm/desc.h>
#include <asm/ldt.h>

#include "perf_event.h"

struct x86_pmu x86_pmu __read_mostly;

DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
	.enabled = 1,
};

struct static_key rdpmc_always_available = STATIC_KEY_INIT_FALSE;

u64 __read_mostly hw_cache_event_ids
				[PERF_COUNT_HW_CACHE_MAX]
				[PERF_COUNT_HW_CACHE_OP_MAX]
				[PERF_COUNT_HW_CACHE_RESULT_MAX];
u64 __read_mostly hw_cache_extra_regs
				[PERF_COUNT_HW_CACHE_MAX]
				[PERF_COUNT_HW_CACHE_OP_MAX]
				[PERF_COUNT_HW_CACHE_RESULT_MAX];

/*
 * Propagate event elapsed time into the generic event.
 * Can only be executed on the CPU where the event is active.
 * Returns the new raw count value.
 */
u64 x86_perf_event_update(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	int shift = 64 - x86_pmu.cntval_bits;
	u64 prev_raw_count, new_raw_count;
	int idx = hwc->idx;
	s64 delta;

	if (idx == INTEL_PMC_IDX_FIXED_BTS)
		return 0;

	/*
	 * Careful: an NMI might modify the previous event value.
	 *
	 * Our tactic to handle this is to first atomically read and
	 * exchange a new raw count - then add that new-prev delta
	 * count to the generic event atomically:
	 */
again:
	prev_raw_count = local64_read(&hwc->prev_count);
	rdpmcl(hwc->event_base_rdpmc, new_raw_count);

	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
					new_raw_count) != prev_raw_count)
		goto again;

	/*
	 * Now we have the new raw value and have updated the prev
	 * timestamp already. We can now calculate the elapsed delta
	 * (event-)time and add that to the generic event.
	 *
	 * Careful, not all hw sign-extends above the physical width
	 * of the count.
	 */
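	/*
	 * Worked example (illustrative only, assuming 48-bit counters and
	 * therefore shift == 16): prev_raw_count == 0xffffffffff00 is -256
	 * as a 48-bit signed value.  If new_raw_count == 0x40, shifting
	 * both up by 16 and the difference back down sign-extends them, so
	 * delta comes out as 320 instead of a large negative number.
	 */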
	delta = (new_raw_count << shift) - (prev_raw_count << shift);
	delta >>= shift;

	local64_add(delta, &event->count);
	local64_sub(delta, &hwc->period_left);

	return new_raw_count;
}

/*
 * Find and validate any extra registers to set up.
 */
static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
{
	struct hw_perf_event_extra *reg;
	struct extra_reg *er;

	reg = &event->hw.extra_reg;

	if (!x86_pmu.extra_regs)
		return 0;

	for (er = x86_pmu.extra_regs; er->msr; er++) {
		if (er->event != (config & er->config_mask))
			continue;
		if (event->attr.config1 & ~er->valid_mask)
			return -EINVAL;
		/* Check if the extra msrs can be safely accessed */
		if (!er->extra_msr_access)
			return -ENXIO;

		reg->idx = er->idx;
		reg->config = event->attr.config1;
		reg->reg = er->msr;
		break;
	}
	return 0;
}

static atomic_t active_events;
static atomic_t pmc_refcount;
static DEFINE_MUTEX(pmc_reserve_mutex);

#ifdef CONFIG_X86_LOCAL_APIC

static bool reserve_pmc_hardware(void)
{
	int i;

	for (i = 0; i < x86_pmu.num_counters; i++) {
		if (!reserve_perfctr_nmi(x86_pmu_event_addr(i)))
			goto perfctr_fail;
	}

	for (i = 0; i < x86_pmu.num_counters; i++) {
		if (!reserve_evntsel_nmi(x86_pmu_config_addr(i)))
			goto eventsel_fail;
	}

	return true;

eventsel_fail:
	for (i--; i >= 0; i--)
		release_evntsel_nmi(x86_pmu_config_addr(i));

	i = x86_pmu.num_counters;

perfctr_fail:
	for (i--; i >= 0; i--)
		release_perfctr_nmi(x86_pmu_event_addr(i));

	return false;
}

static void release_pmc_hardware(void)
{
	int i;

	for (i = 0; i < x86_pmu.num_counters; i++) {
		release_perfctr_nmi(x86_pmu_event_addr(i));
		release_evntsel_nmi(x86_pmu_config_addr(i));
	}
}

#else

static bool reserve_pmc_hardware(void) { return true; }
static void release_pmc_hardware(void) {}

#endif

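/*
 * Sanity-check that the PMU counters are really there and writable before
 * advertising hardware events: this catches both firmware that has already
 * claimed counters and hypervisors that expose the MSRs but ignore writes.
 */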
static bool check_hw_exists(void)
{
	u64 val, val_fail, val_new = ~0;
	int i, reg, reg_fail, ret = 0;
	int bios_fail = 0;
	int reg_safe = -1;

	/*
	 * Check to see if the BIOS enabled any of the counters; if so,
	 * remember the offending register so we can complain below (we
	 * keep going rather than bailing out immediately).
	 */
	for (i = 0; i < x86_pmu.num_counters; i++) {
		reg = x86_pmu_config_addr(i);
		ret = rdmsrl_safe(reg, &val);
		if (ret)
			goto msr_fail;
		if (val & ARCH_PERFMON_EVENTSEL_ENABLE) {
			bios_fail = 1;
			val_fail = val;
			reg_fail = reg;
		} else {
			reg_safe = i;
		}
	}

	if (x86_pmu.num_counters_fixed) {
		reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
		ret = rdmsrl_safe(reg, &val);
		if (ret)
			goto msr_fail;
		for (i = 0; i < x86_pmu.num_counters_fixed; i++) {
			if (val & (0x03 << i*4)) {
				bios_fail = 1;
				val_fail = val;
				reg_fail = reg;
			}
		}
	}

	/*
	 * If all the counters are enabled, the below test will always
	 * fail.  The tools will also become useless in this scenario.
	 * Just fail and disable the hardware counters.
	 */

	if (reg_safe == -1) {
		reg = reg_safe;
		goto msr_fail;
	}

	/*
	 * Read the current value, change it and read it back to see if it
	 * matches, this is needed to detect certain hardware emulators
	 * (qemu/kvm) that don't trap on the MSR access and always return 0s.
	 */
	reg = x86_pmu_event_addr(reg_safe);
	if (rdmsrl_safe(reg, &val))
		goto msr_fail;
	val ^= 0xffffUL;
	ret = wrmsrl_safe(reg, val);
	ret |= rdmsrl_safe(reg, &val_new);
	if (ret || val != val_new)
		goto msr_fail;

	/*
	 * We still allow the PMU driver to operate:
	 */
	if (bios_fail) {
		pr_cont("Broken BIOS detected, complain to your hardware vendor.\n");
		pr_err(FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n",
			      reg_fail, val_fail);
	}

	return true;

msr_fail:
	pr_cont("Broken PMU hardware detected, using software events only.\n");
	pr_info("%sFailed to access perfctr msr (MSR %x is %Lx)\n",
		boot_cpu_has(X86_FEATURE_HYPERVISOR) ? KERN_INFO : KERN_ERR,
		reg, val_new);

	return false;
}

static void hw_perf_event_destroy(struct perf_event *event)
{
	x86_release_hardware();
	atomic_dec(&active_events);
}

void hw_perf_lbr_event_destroy(struct perf_event *event)
{
	hw_perf_event_destroy(event);

	/* undo the lbr/bts event accounting */
	x86_del_exclusive(x86_lbr_exclusive_lbr);
}

static inline int x86_pmu_initialized(void)
{
	return x86_pmu.handle_irq != NULL;
}

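/*
 * Decode a PERF_TYPE_HW_CACHE config: bits 0-7 select the cache, bits 8-15
 * the operation and bits 16-23 the result, which index the
 * hw_cache_event_ids/hw_cache_extra_regs tables above.
 */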
static inline int
set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
{
	struct perf_event_attr *attr = &event->attr;
	unsigned int cache_type, cache_op, cache_result;
	u64 config, val;

	config = attr->config;

	cache_type = (config >>  0) & 0xff;
	if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
		return -EINVAL;

	cache_op = (config >>  8) & 0xff;
	if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
		return -EINVAL;

	cache_result = (config >> 16) & 0xff;
	if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
		return -EINVAL;

	val = hw_cache_event_ids[cache_type][cache_op][cache_result];

	if (val == 0)
		return -ENOENT;

	if (val == -1)
		return -EINVAL;

	hwc->config |= val;
	attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result];
	return x86_pmu_extra_regs(val, event);
}

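/*
 * pmc_refcount counts active users of the counter hardware.  The first
 * user takes pmc_reserve_mutex and does the real reservation (the perfctr
 * and eventsel MSRs plus the DS buffers); subsequent users only bump the
 * refcount through the lock-free atomic_inc_not_zero() fast path.
 */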
int x86_reserve_hardware(void)
{
	int err = 0;

	if (!atomic_inc_not_zero(&pmc_refcount)) {
		mutex_lock(&pmc_reserve_mutex);
		if (atomic_read(&pmc_refcount) == 0) {
			if (!reserve_pmc_hardware())
				err = -EBUSY;
			else
				reserve_ds_buffers();
		}
		if (!err)
			atomic_inc(&pmc_refcount);
		mutex_unlock(&pmc_reserve_mutex);
	}

	return err;
}

void x86_release_hardware(void)
{
	if (atomic_dec_and_mutex_lock(&pmc_refcount, &pmc_reserve_mutex)) {
		release_pmc_hardware();
		release_ds_buffers();
		mutex_unlock(&pmc_reserve_mutex);
	}
}

/*
 * Check whether we can create an event of a certain type (i.e., that no
 * conflicting exclusive events are already present).
 */
int x86_add_exclusive(unsigned int what)
{
	int i;

	if (!atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what])) {
		mutex_lock(&pmc_reserve_mutex);
		for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) {
			if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i]))
				goto fail_unlock;
		}
		atomic_inc(&x86_pmu.lbr_exclusive[what]);
		mutex_unlock(&pmc_reserve_mutex);
	}

	atomic_inc(&active_events);
	return 0;

fail_unlock:
	mutex_unlock(&pmc_reserve_mutex);
	return -EBUSY;
}

void x86_del_exclusive(unsigned int what)
{
	atomic_dec(&x86_pmu.lbr_exclusive[what]);
	atomic_dec(&active_events);
}

int x86_setup_perfctr(struct perf_event *event)
{
	struct perf_event_attr *attr = &event->attr;
	struct hw_perf_event *hwc = &event->hw;
	u64 config;

	if (!is_sampling_event(event)) {
		hwc->sample_period = x86_pmu.max_period;
		hwc->last_period = hwc->sample_period;
		local64_set(&hwc->period_left, hwc->sample_period);
	}

	if (attr->type == PERF_TYPE_RAW)
		return x86_pmu_extra_regs(event->attr.config, event);

	if (attr->type == PERF_TYPE_HW_CACHE)
		return set_ext_hw_attr(hwc, event);

	if (attr->config >= x86_pmu.max_events)
		return -EINVAL;

	/*
	 * The generic map:
	 */
	config = x86_pmu.event_map(attr->config);

	if (config == 0)
		return -ENOENT;

	if (config == -1LL)
		return -EINVAL;

	/*
	 * Branch tracing:
	 */
	if (attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS &&
	    !attr->freq && hwc->sample_period == 1) {
		/* BTS is not supported by this architecture. */
		if (!x86_pmu.bts_active)
			return -EOPNOTSUPP;

		/* BTS is currently only allowed for user-mode. */
		if (!attr->exclude_kernel)
			return -EOPNOTSUPP;

		/* disallow bts if conflicting events are present */
		if (x86_add_exclusive(x86_lbr_exclusive_lbr))
			return -EBUSY;

		event->destroy = hw_perf_lbr_event_destroy;
	}

	hwc->config |= config;

	return 0;
}

/*
 * check that branch_sample_type is compatible with
 * settings needed for precise_ip > 1 which implies
 * using the LBR to capture ALL taken branches at the
 * priv levels of the measurement
 */
static inline int precise_br_compat(struct perf_event *event)
{
	u64 m = event->attr.branch_sample_type;
	u64 b = 0;

	/* must capture all branches */
	if (!(m & PERF_SAMPLE_BRANCH_ANY))
		return 0;

	m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER;

	if (!event->attr.exclude_user)
		b |= PERF_SAMPLE_BRANCH_USER;

	if (!event->attr.exclude_kernel)
		b |= PERF_SAMPLE_BRANCH_KERNEL;

	/*
	 * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86
	 */

	return m == b;
}

int x86_pmu_hw_config(struct perf_event *event)
{
	if (event->attr.precise_ip) {
		int precise = 0;

		/* Support for constant skid */
		if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) {
			precise++;

			/* Support for IP fixup */
			if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2)
				precise++;

			if (x86_pmu.pebs_prec_dist)
				precise++;
		}
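		/*
		 * 'precise' now holds the highest precise_ip level this PMU
		 * can honour: 1 requires working PEBS, 2 additionally needs
		 * an LBR or PEBS format >= 2 for the IP fixup, and 3 needs
		 * precise-distribution (pebs_prec_dist) support.
		 */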

		if (event->attr.precise_ip > precise)
			return -EOPNOTSUPP;
	}
	/*
	 * check that PEBS LBR correction does not conflict with
	 * whatever the user is asking with attr->branch_sample_type
	 */
	if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) {
		u64 *br_type = &event->attr.branch_sample_type;

		if (has_branch_stack(event)) {
			if (!precise_br_compat(event))
				return -EOPNOTSUPP;

			/* branch_sample_type is compatible */

		} else {
			/*
			 * user did not specify branch_sample_type
			 *
			 * For PEBS fixups, we capture all
			 * the branches at the priv level of the
			 * event.
			 */
			*br_type = PERF_SAMPLE_BRANCH_ANY;

			if (!event->attr.exclude_user)
				*br_type |= PERF_SAMPLE_BRANCH_USER;

			if (!event->attr.exclude_kernel)
				*br_type |= PERF_SAMPLE_BRANCH_KERNEL;
		}
	}

	if (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK)
		event->attach_state |= PERF_ATTACH_TASK_DATA;

	/*
	 * Generate PMC IRQs:
	 * (keep 'enabled' bit clear for now)
	 */
	event->hw.config = ARCH_PERFMON_EVENTSEL_INT;

	/*
	 * Count user and OS events unless requested not to
	 */
	if (!event->attr.exclude_user)
		event->hw.config |= ARCH_PERFMON_EVENTSEL_USR;
	if (!event->attr.exclude_kernel)
		event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;

	if (event->attr.type == PERF_TYPE_RAW)
		event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;

	if (event->attr.sample_period && x86_pmu.limit_period) {
		if (x86_pmu.limit_period(event, event->attr.sample_period) >
				event->attr.sample_period)
			return -EINVAL;
	}

	return x86_setup_perfctr(event);
}

/*
 * Setup the hardware configuration for a given attr_type
 */
static int __x86_pmu_event_init(struct perf_event *event)
{
	int err;

	if (!x86_pmu_initialized())
		return -ENODEV;

	err = x86_reserve_hardware();
	if (err)
		return err;

	atomic_inc(&active_events);
	event->destroy = hw_perf_event_destroy;

	event->hw.idx = -1;
	event->hw.last_cpu = -1;
	event->hw.last_tag = ~0ULL;

	/* mark unused */
	event->hw.extra_reg.idx = EXTRA_REG_NONE;
	event->hw.branch_reg.idx = EXTRA_REG_NONE;

	return x86_pmu.hw_config(event);
}

void x86_pmu_disable_all(void)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	int idx;

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		u64 val;

		if (!test_bit(idx, cpuc->active_mask))
			continue;
		rdmsrl(x86_pmu_config_addr(idx), val);
		if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
			continue;
		val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
		wrmsrl(x86_pmu_config_addr(idx), val);
	}
}

/*
 * There may be PMI landing after enabled=0. The PMI hitting could be before or
 * after disable_all.
 *
 * If PMI hits before disable_all, the PMU will be disabled in the NMI handler.
 * It will not be re-enabled in the NMI handler again, because enabled=0. After
 * handling the NMI, disable_all will be called, which will not change the
 * state either. If PMI hits after disable_all, the PMU is already disabled
 * before entering NMI handler. The NMI handler will not change the state
 * either.
 *
 * So either situation is harmless.
 */
static void x86_pmu_disable(struct pmu *pmu)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);

	if (!x86_pmu_initialized())
		return;

	if (!cpuc->enabled)
		return;

	cpuc->n_added = 0;
	cpuc->enabled = 0;
	barrier();

	x86_pmu.disable_all();
}

void x86_pmu_enable_all(int added)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	int idx;

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		struct hw_perf_event *hwc = &cpuc->events[idx]->hw;

		if (!test_bit(idx, cpuc->active_mask))
			continue;

		__x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
	}
}

static struct pmu pmu;

static inline int is_x86_event(struct perf_event *event)
{
	return event->pmu == &pmu;
}

/*
 * Event scheduler state:
 *
 * Assign events by iterating over all events and counters, beginning
 * with the events that have the smallest weights. Keep the current iterator
 * state in struct sched_state.
 */
struct sched_state {
	int	weight;
	int	event;		/* event index */
	int	counter;	/* counter index */
	int	unassigned;	/* number of events to be assigned left */
	int	nr_gp;		/* number of GP counters used */
	unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
};

/* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */
#define	SCHED_STATES_MAX	2

struct perf_sched {
	int			max_weight;
	int			max_events;
	int			max_gp;
	int			saved_states;
	struct event_constraint	**constraints;
	struct sched_state	state;
	struct sched_state	saved[SCHED_STATES_MAX];
};

/*
 * Initialize the iterator that runs through all events and counters.
 */
static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints,
			    int num, int wmin, int wmax, int gpmax)
{
	int idx;

	memset(sched, 0, sizeof(*sched));
	sched->max_events	= num;
	sched->max_weight	= wmax;
	sched->max_gp		= gpmax;
	sched->constraints	= constraints;

	for (idx = 0; idx < num; idx++) {
		if (constraints[idx]->weight == wmin)
			break;
	}

	sched->state.event	= idx;		/* start with min weight */
	sched->state.weight	= wmin;
	sched->state.unassigned	= num;
}

static void perf_sched_save_state(struct perf_sched *sched)
{
	if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX))
		return;

	sched->saved[sched->saved_states] = sched->state;
	sched->saved_states++;
}

static bool perf_sched_restore_state(struct perf_sched *sched)
{
	if (!sched->saved_states)
		return false;

	sched->saved_states--;
	sched->state = sched->saved[sched->saved_states];

	/* continue with next counter: */
	clear_bit(sched->state.counter++, sched->state.used);

	return true;
}

/*
 * Select a counter for the current event to schedule. Return true on
 * success.
 */
static bool __perf_sched_find_counter(struct perf_sched *sched)
{
	struct event_constraint *c;
	int idx;

	if (!sched->state.unassigned)
		return false;

	if (sched->state.event >= sched->max_events)
		return false;

	c = sched->constraints[sched->state.event];
	/* Prefer fixed purpose counters */
	if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) {
		idx = INTEL_PMC_IDX_FIXED;
		for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) {
			if (!__test_and_set_bit(idx, sched->state.used))
				goto done;
		}
	}

	/* Grab the first unused counter starting with idx */
	idx = sched->state.counter;
	for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) {
		if (!__test_and_set_bit(idx, sched->state.used)) {
			if (sched->state.nr_gp++ >= sched->max_gp)
				return false;

			goto done;
		}
	}

	return false;

done:
	sched->state.counter = idx;
768

769 770 771 772 773 774 775 776 777 778 779 780 781
	if (c->overlap)
		perf_sched_save_state(sched);

	return true;
}

static bool perf_sched_find_counter(struct perf_sched *sched)
{
	while (!__perf_sched_find_counter(sched)) {
		if (!perf_sched_restore_state(sched))
			return false;
	}

	return true;
}

/*
 * Go through all unassigned events and find the next one to schedule.
 * Take events with the least weight first. Return true on success.
 */
static bool perf_sched_next_event(struct perf_sched *sched)
{
	struct event_constraint *c;

	if (!sched->state.unassigned || !--sched->state.unassigned)
		return false;

	do {
		/* next event */
		sched->state.event++;
		if (sched->state.event >= sched->max_events) {
			/* next weight */
			sched->state.event = 0;
			sched->state.weight++;
			if (sched->state.weight > sched->max_weight)
				return false;
		}
		c = sched->constraints[sched->state.event];
	} while (c->weight != sched->state.weight);

	sched->state.counter = 0;	/* start with first counter */

	return true;
}

/*
 * Assign a counter for each event.
 */
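/*
 * Returns the number of events that could not be assigned (0 on complete
 * success); when 'assign' is non-NULL it receives the chosen counter index
 * for each event.
 */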
int perf_assign_events(struct event_constraint **constraints, int n,
			int wmin, int wmax, int gpmax, int *assign)
{
	struct perf_sched sched;

	perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax);

	do {
		if (!perf_sched_find_counter(&sched))
			break;	/* failed */
		if (assign)
			assign[sched.state.event] = sched.state.counter;
	} while (perf_sched_next_event(&sched));

	return sched.state.unassigned;
}
EXPORT_SYMBOL_GPL(perf_assign_events);

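/*
 * Schedule the first n events in cpuc->event_list onto counters: the fast
 * path tries to keep every event on the counter it already occupied, and
 * only falls back to the generic constraint solver above when that fails.
 * Returns 0 on success or -EINVAL if the events cannot all be scheduled.
 */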
int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
{
	struct event_constraint *c;
	unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
	struct perf_event *e;
	int i, wmin, wmax, unsched = 0;
	struct hw_perf_event *hwc;

	bitmap_zero(used_mask, X86_PMC_IDX_MAX);

	if (x86_pmu.start_scheduling)
		x86_pmu.start_scheduling(cpuc);

	for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
		cpuc->event_constraint[i] = NULL;
		c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]);
		cpuc->event_constraint[i] = c;

		wmin = min(wmin, c->weight);
		wmax = max(wmax, c->weight);
	}

	/*
	 * fastpath, try to reuse previous register
	 */
	for (i = 0; i < n; i++) {
		hwc = &cpuc->event_list[i]->hw;
		c = cpuc->event_constraint[i];

		/* never assigned */
		if (hwc->idx == -1)
			break;

		/* constraint still honored */
		if (!test_bit(hwc->idx, c->idxmsk))
			break;

		/* not already used */
		if (test_bit(hwc->idx, used_mask))
			break;

		__set_bit(hwc->idx, used_mask);
		if (assign)
			assign[i] = hwc->idx;
	}

	/* slow path */
	if (i != n) {
		int gpmax = x86_pmu.num_counters;

		/*
		 * Do not allow scheduling of more than half the available
		 * generic counters.
		 *
		 * This helps avoid counter starvation of the sibling thread by
		 * ensuring that at most half of the counters can be taken in
		 * exclusive mode. There are no designated counters for this
		 * limit: any N/2 counters can be used, which helps with events
		 * that have specific counter constraints.
		 */
		if (is_ht_workaround_enabled() && !cpuc->is_fake &&
		    READ_ONCE(cpuc->excl_cntrs->exclusive_present))
			gpmax /= 2;

		unsched = perf_assign_events(cpuc->event_constraint, n, wmin,
					     wmax, gpmax, assign);
	}

	/*
	 * In case of success (unsched = 0), mark events as committed,
	 * so we do not put_constraint() in case new events are added
	 * and fail to be scheduled
	 *
	 * We invoke the lower level commit callback to lock the resource
	 *
	 * We do not need to do all of this in case we are called to
	 * validate an event group (assign == NULL)
	 */
	if (!unsched && assign) {
		for (i = 0; i < n; i++) {
			e = cpuc->event_list[i];
			e->hw.flags |= PERF_X86_EVENT_COMMITTED;
			if (x86_pmu.commit_scheduling)
				x86_pmu.commit_scheduling(cpuc, i, assign[i]);
		}
	} else {
		for (i = 0; i < n; i++) {
			e = cpuc->event_list[i];
			/*
			 * do not put_constraint() on committed events,
			 * because they are good to go
			 */
			if ((e->hw.flags & PERF_X86_EVENT_COMMITTED))
				continue;

			/*
			 * release events that failed scheduling
			 */
			if (x86_pmu.put_event_constraints)
				x86_pmu.put_event_constraints(cpuc, e);
		}
	}

	if (x86_pmu.stop_scheduling)
		x86_pmu.stop_scheduling(cpuc);

	return unsched ? -EINVAL : 0;
}

/*
 * dogrp: true if we must also collect the sibling events of the group leader.
 * Returns the total number of events, or a negative error code.
 */
static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
{
	struct perf_event *event;
	int n, max_count;

	max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed;

	/* current number of events already accepted */
	n = cpuc->n_events;

	if (is_x86_event(leader)) {
		if (n >= max_count)
			return -EINVAL;
		cpuc->event_list[n] = leader;
		n++;
	}
	if (!dogrp)
		return n;

	list_for_each_entry(event, &leader->sibling_list, group_entry) {
		if (!is_x86_event(event) ||
		    event->state <= PERF_EVENT_STATE_OFF)
			continue;

		if (n >= max_count)
			return -EINVAL;

		cpuc->event_list[n] = event;
		n++;
	}
	return n;
}

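/*
 * Copy the scheduling decision for slot i (counter index, CPU, tag) into
 * the event's hw_perf_event and derive the config/event MSR bases from the
 * chosen counter.
 */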
static inline void x86_assign_hw_event(struct perf_event *event,
				struct cpu_hw_events *cpuc, int i)
{
	struct hw_perf_event *hwc = &event->hw;

	hwc->idx = cpuc->assign[i];
	hwc->last_cpu = smp_processor_id();
	hwc->last_tag = ++cpuc->tags[i];

	if (hwc->idx == INTEL_PMC_IDX_FIXED_BTS) {
		hwc->config_base = 0;
		hwc->event_base	= 0;
	} else if (hwc->idx >= INTEL_PMC_IDX_FIXED) {