core.c
/*
 * Performance events x86 architecture code
 *
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2009 Jaswinder Singh Rajput
 *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
 *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra
 *  Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
 *  Copyright (C) 2009 Google, Inc., Stephane Eranian
 *
 *  For licensing details see kernel-base/COPYING
 */

#include <linux/perf_event.h>
#include <linux/capability.h>
#include <linux/notifier.h>
#include <linux/hardirq.h>
#include <linux/kprobes.h>
#include <linux/module.h>
#include <linux/kdebug.h>
#include <linux/sched.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/bitops.h>
#include <linux/device.h>

#include <asm/apic.h>
#include <asm/stacktrace.h>
#include <asm/nmi.h>
#include <asm/smp.h>
#include <asm/alternative.h>
#include <asm/mmu_context.h>
#include <asm/tlbflush.h>
#include <asm/timer.h>
#include <asm/desc.h>
#include <asm/ldt.h>

#include "perf_event.h"

struct x86_pmu x86_pmu __read_mostly;

DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
	.enabled = 1,
};

struct static_key rdpmc_always_available = STATIC_KEY_INIT_FALSE;

u64 __read_mostly hw_cache_event_ids
				[PERF_COUNT_HW_CACHE_MAX]
				[PERF_COUNT_HW_CACHE_OP_MAX]
				[PERF_COUNT_HW_CACHE_RESULT_MAX];
u64 __read_mostly hw_cache_extra_regs
				[PERF_COUNT_HW_CACHE_MAX]
				[PERF_COUNT_HW_CACHE_OP_MAX]
				[PERF_COUNT_HW_CACHE_RESULT_MAX];

/*
 * Propagate event elapsed time into the generic event.
 * Can only be executed on the CPU where the event is active.
 * Returns the new raw count.
 */
u64 x86_perf_event_update(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	int shift = 64 - x86_pmu.cntval_bits;
	u64 prev_raw_count, new_raw_count;
	int idx = hwc->idx;
	s64 delta;

	if (idx == INTEL_PMC_IDX_FIXED_BTS)
		return 0;

	/*
	 * Careful: an NMI might modify the previous event value.
	 *
	 * Our tactic to handle this is to first atomically read and
	 * exchange a new raw count - then add that new-prev delta
	 * count to the generic event atomically:
	 */
again:
	prev_raw_count = local64_read(&hwc->prev_count);
	rdpmcl(hwc->event_base_rdpmc, new_raw_count);

	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
					new_raw_count) != prev_raw_count)
		goto again;

	/*
	 * Now we have the new raw value and have updated the prev
	 * timestamp already. We can now calculate the elapsed delta
	 * (event-)time and add that to the generic event.
	 *
	 * Careful, not all hw sign-extends above the physical width
	 * of the count.
	 */
	delta = (new_raw_count << shift) - (prev_raw_count << shift);
	delta >>= shift;

	local64_add(delta, &event->count);
	local64_sub(delta, &hwc->period_left);

	return new_raw_count;
}

/*
 * Find and validate any extra registers to set up.
 */
static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
{
	struct hw_perf_event_extra *reg;
	struct extra_reg *er;

	reg = &event->hw.extra_reg;

	if (!x86_pmu.extra_regs)
		return 0;

	for (er = x86_pmu.extra_regs; er->msr; er++) {
		if (er->event != (config & er->config_mask))
			continue;
		if (event->attr.config1 & ~er->valid_mask)
			return -EINVAL;
		/* Check if the extra MSRs can be safely accessed */
		if (!er->extra_msr_access)
			return -ENXIO;

		reg->idx = er->idx;
		reg->config = event->attr.config1;
		reg->reg = er->msr;
		break;
	}
	return 0;
}

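/*
 * active_events counts every event currently using the PMU (including the
 * BTS/LBR exclusive users below); pmc_refcount tracks how many users hold
 * the reserved counter hardware.
 */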
static atomic_t active_events;
static atomic_t pmc_refcount;
static DEFINE_MUTEX(pmc_reserve_mutex);

#ifdef CONFIG_X86_LOCAL_APIC

static bool reserve_pmc_hardware(void)
{
	int i;

	for (i = 0; i < x86_pmu.num_counters; i++) {
		if (!reserve_perfctr_nmi(x86_pmu_event_addr(i)))
			goto perfctr_fail;
	}

	for (i = 0; i < x86_pmu.num_counters; i++) {
		if (!reserve_evntsel_nmi(x86_pmu_config_addr(i)))
			goto eventsel_fail;
	}

	return true;

eventsel_fail:
	for (i--; i >= 0; i--)
		release_evntsel_nmi(x86_pmu_config_addr(i));

	i = x86_pmu.num_counters;

perfctr_fail:
	for (i--; i >= 0; i--)
		release_perfctr_nmi(x86_pmu_event_addr(i));

	return false;
}

static void release_pmc_hardware(void)
{
	int i;

	for (i = 0; i < x86_pmu.num_counters; i++) {
		release_perfctr_nmi(x86_pmu_event_addr(i));
		release_evntsel_nmi(x86_pmu_config_addr(i));
	}
}

#else

static bool reserve_pmc_hardware(void) { return true; }
static void release_pmc_hardware(void) {}

#endif

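/*
 * Sanity-check the PMU: make sure the counters are not already claimed by
 * the BIOS and that writes to the counter MSRs actually stick (some
 * emulators silently ignore them).
 */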
static bool check_hw_exists(void)
{
	u64 val, val_fail, val_new = ~0;
	int i, reg, reg_fail, ret = 0;
	int bios_fail = 0;
	int reg_safe = -1;

	/*
	 * Check to see if the BIOS enabled any of the counters; if so,
	 * complain and bail.
	 */
	for (i = 0; i < x86_pmu.num_counters; i++) {
		reg = x86_pmu_config_addr(i);
		ret = rdmsrl_safe(reg, &val);
		if (ret)
			goto msr_fail;
		if (val & ARCH_PERFMON_EVENTSEL_ENABLE) {
			bios_fail = 1;
			val_fail = val;
			reg_fail = reg;
		} else {
			reg_safe = i;
		}
	}

	if (x86_pmu.num_counters_fixed) {
		reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
		ret = rdmsrl_safe(reg, &val);
		if (ret)
			goto msr_fail;
		for (i = 0; i < x86_pmu.num_counters_fixed; i++) {
			if (val & (0x03 << i*4)) {
				bios_fail = 1;
				val_fail = val;
				reg_fail = reg;
			}
		}
	}

	/*
	 * If all the counters are enabled, the below test will always
	 * fail.  The tools will also become useless in this scenario.
	 * Just fail and disable the hardware counters.
	 */

	if (reg_safe == -1) {
		reg = reg_safe;
		goto msr_fail;
	}

	/*
	 * Read the current value, change it and read it back to see if it
	 * matches; this is needed to detect certain hardware emulators
	 * (qemu/kvm) that don't trap on the MSR access and always return 0s.
	 */
	reg = x86_pmu_event_addr(reg_safe);
	if (rdmsrl_safe(reg, &val))
		goto msr_fail;
	val ^= 0xffffUL;
	ret = wrmsrl_safe(reg, val);
	ret |= rdmsrl_safe(reg, &val_new);
	if (ret || val != val_new)
		goto msr_fail;

	/*
	 * We still allow the PMU driver to operate:
	 */
	if (bios_fail) {
		pr_cont("Broken BIOS detected, complain to your hardware vendor.\n");
		pr_err(FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n",
			      reg_fail, val_fail);
	}

	return true;

msr_fail:
	pr_cont("Broken PMU hardware detected, using software events only.\n");
	pr_info("%sFailed to access perfctr msr (MSR %x is %Lx)\n",
		boot_cpu_has(X86_FEATURE_HYPERVISOR) ? KERN_INFO : KERN_ERR,
		reg, val_new);

	return false;
}

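/* event->destroy() callback: drop the hardware reservation and event count. */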
static void hw_perf_event_destroy(struct perf_event *event)
{
	x86_release_hardware();
	atomic_dec(&active_events);
}

void hw_perf_lbr_event_destroy(struct perf_event *event)
{
	hw_perf_event_destroy(event);

	/* undo the lbr/bts event accounting */
	x86_del_exclusive(x86_lbr_exclusive_lbr);
}

static inline int x86_pmu_initialized(void)
{
	return x86_pmu.handle_irq != NULL;
}

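/*
 * Decode a PERF_TYPE_HW_CACHE config (cache type/op/result bytes) into the
 * model-specific encoding from the hw_cache_event_ids tables.
 */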
static inline int
set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
{
	struct perf_event_attr *attr = &event->attr;
	unsigned int cache_type, cache_op, cache_result;
	u64 config, val;

	config = attr->config;

	cache_type = (config >>  0) & 0xff;
	if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
		return -EINVAL;

	cache_op = (config >>  8) & 0xff;
	if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
		return -EINVAL;

	cache_result = (config >> 16) & 0xff;
	if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
		return -EINVAL;

	val = hw_cache_event_ids[cache_type][cache_op][cache_result];

	if (val == 0)
		return -ENOENT;

	if (val == -1)
		return -EINVAL;

	hwc->config |= val;
	attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result];
	return x86_pmu_extra_regs(val, event);
}

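/*
 * Reserve the counter hardware (and DS buffers) on first use; the
 * reservation is refcounted through pmc_refcount.
 */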
int x86_reserve_hardware(void)
{
	int err = 0;

	if (!atomic_inc_not_zero(&pmc_refcount)) {
		mutex_lock(&pmc_reserve_mutex);
		if (atomic_read(&pmc_refcount) == 0) {
			if (!reserve_pmc_hardware())
				err = -EBUSY;
			else
				reserve_ds_buffers();
		}
		if (!err)
			atomic_inc(&pmc_refcount);
		mutex_unlock(&pmc_reserve_mutex);
	}

	return err;
}

void x86_release_hardware(void)
{
	if (atomic_dec_and_mutex_lock(&pmc_refcount, &pmc_reserve_mutex)) {
		release_pmc_hardware();
		release_ds_buffers();
		mutex_unlock(&pmc_reserve_mutex);
	}
}

/*
 * Check if we can create an event of a certain type (i.e. that no
 * conflicting events are present).
 */
int x86_add_exclusive(unsigned int what)
{
	int i;

	if (x86_pmu.lbr_pt_coexist)
		return 0;

	if (!atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what])) {
		mutex_lock(&pmc_reserve_mutex);
		for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) {
			if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i]))
				goto fail_unlock;
		}
		atomic_inc(&x86_pmu.lbr_exclusive[what]);
		mutex_unlock(&pmc_reserve_mutex);
	}

	atomic_inc(&active_events);
	return 0;

fail_unlock:
	mutex_unlock(&pmc_reserve_mutex);
	return -EBUSY;
}

void x86_del_exclusive(unsigned int what)
{
	if (x86_pmu.lbr_pt_coexist)
		return;

	atomic_dec(&x86_pmu.lbr_exclusive[what]);
	atomic_dec(&active_events);
}

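/*
 * Translate the generic attr into this PMU's config bits: raw events,
 * hw-cache events, the generic event map and the BTS special case.
 */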
int x86_setup_perfctr(struct perf_event *event)
{
	struct perf_event_attr *attr = &event->attr;
	struct hw_perf_event *hwc = &event->hw;
	u64 config;

	if (!is_sampling_event(event)) {
		hwc->sample_period = x86_pmu.max_period;
		hwc->last_period = hwc->sample_period;
		local64_set(&hwc->period_left, hwc->sample_period);
	}

	if (attr->type == PERF_TYPE_RAW)
		return x86_pmu_extra_regs(event->attr.config, event);

	if (attr->type == PERF_TYPE_HW_CACHE)
		return set_ext_hw_attr(hwc, event);

	if (attr->config >= x86_pmu.max_events)
		return -EINVAL;

	/*
	 * The generic map:
	 */
	config = x86_pmu.event_map(attr->config);

	if (config == 0)
		return -ENOENT;

	if (config == -1LL)
		return -EINVAL;

	/*
	 * Branch tracing:
	 */
	if (attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS &&
	    !attr->freq && hwc->sample_period == 1) {
		/* BTS is not supported by this architecture. */
		if (!x86_pmu.bts_active)
			return -EOPNOTSUPP;

		/* BTS is currently only allowed for user-mode. */
		if (!attr->exclude_kernel)
			return -EOPNOTSUPP;

		/* disallow bts if conflicting events are present */
		if (x86_add_exclusive(x86_lbr_exclusive_lbr))
			return -EBUSY;

		event->destroy = hw_perf_lbr_event_destroy;
	}

	hwc->config |= config;

	return 0;
}

/*
 * check that branch_sample_type is compatible with
 * settings needed for precise_ip > 1 which implies
 * using the LBR to capture ALL taken branches at the
 * priv levels of the measurement
 */
static inline int precise_br_compat(struct perf_event *event)
{
	u64 m = event->attr.branch_sample_type;
	u64 b = 0;

	/* must capture all branches */
	if (!(m & PERF_SAMPLE_BRANCH_ANY))
		return 0;

	m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER;

	if (!event->attr.exclude_user)
		b |= PERF_SAMPLE_BRANCH_USER;

	if (!event->attr.exclude_kernel)
		b |= PERF_SAMPLE_BRANCH_KERNEL;

	/*
	 * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86
	 */

	return m == b;
}

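/*
 * Validate the precise_ip/PEBS and branch-sampling requests and build the
 * base event-select config (interrupt enable plus user/kernel filtering).
 */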
int x86_pmu_hw_config(struct perf_event *event)
{
	if (event->attr.precise_ip) {
		int precise = 0;

		/* Support for constant skid */
		if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) {
			precise++;

			/* Support for IP fixup */
			if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2)
				precise++;

			if (x86_pmu.pebs_prec_dist)
				precise++;
		}

		if (event->attr.precise_ip > precise)
			return -EOPNOTSUPP;
	}
	/*
	 * check that PEBS LBR correction does not conflict with
	 * whatever the user is asking with attr->branch_sample_type
	 */
	if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) {
		u64 *br_type = &event->attr.branch_sample_type;

		if (has_branch_stack(event)) {
			if (!precise_br_compat(event))
				return -EOPNOTSUPP;

			/* branch_sample_type is compatible */

		} else {
			/*
			 * user did not specify branch_sample_type
			 *
			 * For PEBS fixups, we capture all
			 * the branches at the priv level of the
			 * event.
			 */
			*br_type = PERF_SAMPLE_BRANCH_ANY;

			if (!event->attr.exclude_user)
				*br_type |= PERF_SAMPLE_BRANCH_USER;

			if (!event->attr.exclude_kernel)
				*br_type |= PERF_SAMPLE_BRANCH_KERNEL;
		}
	}

	if (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK)
		event->attach_state |= PERF_ATTACH_TASK_DATA;

	/*
	 * Generate PMC IRQs:
	 * (keep 'enabled' bit clear for now)
	 */
	event->hw.config = ARCH_PERFMON_EVENTSEL_INT;

	/*
	 * Count user and OS events unless requested not to
	 */
	if (!event->attr.exclude_user)
		event->hw.config |= ARCH_PERFMON_EVENTSEL_USR;
	if (!event->attr.exclude_kernel)
		event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;

	if (event->attr.type == PERF_TYPE_RAW)
		event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;

	if (event->attr.sample_period && x86_pmu.limit_period) {
		if (x86_pmu.limit_period(event, event->attr.sample_period) >
				event->attr.sample_period)
			return -EINVAL;
	}

	return x86_setup_perfctr(event);
}

/*
 * Setup the hardware configuration for a given attr_type
 */
static int __x86_pmu_event_init(struct perf_event *event)
{
	int err;

	if (!x86_pmu_initialized())
		return -ENODEV;

	err = x86_reserve_hardware();
	if (err)
		return err;

	atomic_inc(&active_events);
	event->destroy = hw_perf_event_destroy;

	event->hw.idx = -1;
	event->hw.last_cpu = -1;
	event->hw.last_tag = ~0ULL;

	/* mark unused */
	event->hw.extra_reg.idx = EXTRA_REG_NONE;
	event->hw.branch_reg.idx = EXTRA_REG_NONE;

	return x86_pmu.hw_config(event);
}

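/* Clear the enable bit in every active counter's event-select MSR. */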
void x86_pmu_disable_all(void)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	int idx;

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		u64 val;

		if (!test_bit(idx, cpuc->active_mask))
			continue;
		rdmsrl(x86_pmu_config_addr(idx), val);
		if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
			continue;
		val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
		wrmsrl(x86_pmu_config_addr(idx), val);
	}
}

/*
 * There may be a PMI landing after enabled=0. The PMI could hit before or
 * after disable_all.
 *
 * If PMI hits before disable_all, the PMU will be disabled in the NMI handler.
 * It will not be re-enabled in the NMI handler again, because enabled=0. After
 * handling the NMI, disable_all will be called, which will not change the
 * state either. If PMI hits after disable_all, the PMU is already disabled
 * before entering NMI handler. The NMI handler will not change the state
 * either.
 *
 * So either situation is harmless.
 */
static void x86_pmu_disable(struct pmu *pmu)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);

	if (!x86_pmu_initialized())
		return;

	if (!cpuc->enabled)
		return;

	cpuc->n_added = 0;
	cpuc->enabled = 0;
	barrier();

	x86_pmu.disable_all();
}

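/* Set the enable bit again for every active counter. */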
void x86_pmu_enable_all(int added)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	int idx;

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		struct hw_perf_event *hwc = &cpuc->events[idx]->hw;

		if (!test_bit(idx, cpuc->active_mask))
			continue;

		__x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
	}
}

static struct pmu pmu;

static inline int is_x86_event(struct perf_event *event)
{
	return event->pmu == &pmu;
}

/*
 * Event scheduler state:
 *
 * Assign events iterating over all events and counters, beginning
 * with events with least weights first. Keep the current iterator
 * state in struct sched_state.
 */
struct sched_state {
	int	weight;
	int	event;		/* event index */
	int	counter;	/* counter index */
	int	unassigned;	/* number of events to be assigned left */
	int	nr_gp;		/* number of GP counters used */
	unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
};

/* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */
#define	SCHED_STATES_MAX	2

struct perf_sched {
	int			max_weight;
	int			max_events;
	int			max_gp;
	int			saved_states;
	struct event_constraint	**constraints;
	struct sched_state	state;
	struct sched_state	saved[SCHED_STATES_MAX];
};

/*
 * Initialize the iterator that runs through all events and counters.
 */
static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints,
			    int num, int wmin, int wmax, int gpmax)
{
	int idx;

	memset(sched, 0, sizeof(*sched));
	sched->max_events	= num;
	sched->max_weight	= wmax;
	sched->max_gp		= gpmax;
	sched->constraints	= constraints;

	for (idx = 0; idx < num; idx++) {
		if (constraints[idx]->weight == wmin)
			break;
	}

	sched->state.event	= idx;		/* start with min weight */
	sched->state.weight	= wmin;
	sched->state.unassigned	= num;
}

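/*
 * Save/restore the scheduler state so counters taken by overlapping
 * constraints can be backtracked (bounded by SCHED_STATES_MAX).
 */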
static void perf_sched_save_state(struct perf_sched *sched)
{
	if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX))
		return;

	sched->saved[sched->saved_states] = sched->state;
	sched->saved_states++;
}

static bool perf_sched_restore_state(struct perf_sched *sched)
{
	if (!sched->saved_states)
		return false;

	sched->saved_states--;
	sched->state = sched->saved[sched->saved_states];

	/* continue with next counter: */
	clear_bit(sched->state.counter++, sched->state.used);

	return true;
}

/*
 * Select a counter for the current event to schedule. Return true on
 * success.
 */
static bool __perf_sched_find_counter(struct perf_sched *sched)
{
	struct event_constraint *c;
	int idx;

	if (!sched->state.unassigned)
		return false;

	if (sched->state.event >= sched->max_events)
		return false;

	c = sched->constraints[sched->state.event];
	/* Prefer fixed purpose counters */
	if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) {
		idx = INTEL_PMC_IDX_FIXED;
		for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) {
			if (!__test_and_set_bit(idx, sched->state.used))
				goto done;
		}
	}

	/* Grab the first unused counter starting with idx */
	idx = sched->state.counter;
	for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) {
		if (!__test_and_set_bit(idx, sched->state.used)) {
			if (sched->state.nr_gp++ >= sched->max_gp)
				return false;

			goto done;
		}
	}

	return false;

done:
	sched->state.counter = idx;

	if (c->overlap)
		perf_sched_save_state(sched);

	return true;
}

static bool perf_sched_find_counter(struct perf_sched *sched)
{
	while (!__perf_sched_find_counter(sched)) {
		if (!perf_sched_restore_state(sched))
			return false;
	}

	return true;
}

/*
 * Go through all unassigned events and find the next one to schedule.
 * Take events with the least weight first. Return true on success.
 */
static bool perf_sched_next_event(struct perf_sched *sched)
{
	struct event_constraint *c;

	if (!sched->state.unassigned || !--sched->state.unassigned)
		return false;

	do {
		/* next event */
		sched->state.event++;
		if (sched->state.event >= sched->max_events) {
			/* next weight */
			sched->state.event = 0;
			sched->state.weight++;
			if (sched->state.weight > sched->max_weight)
				return false;
		}
		c = sched->constraints[sched->state.event];
	} while (c->weight != sched->state.weight);

	sched->state.counter = 0;	/* start with first counter */

	return true;
}

/*
 * Assign a counter for each event.
 */
int perf_assign_events(struct event_constraint **constraints, int n,
			int wmin, int wmax, int gpmax, int *assign)
{
	struct perf_sched sched;

	perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax);

	do {
		if (!perf_sched_find_counter(&sched))
			break;	/* failed */
		if (assign)
			assign[sched.state.event] = sched.state.counter;
	} while (perf_sched_next_event(&sched));

	return sched.state.unassigned;
}
EXPORT_SYMBOL_GPL(perf_assign_events);

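/*
 * Assign a hardware counter to each of the n collected events: the fast
 * path re-uses previous assignments, the slow path runs the constraint
 * scheduler above.
 */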
int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
{
	struct event_constraint *c;
	unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
	struct perf_event *e;
	int i, wmin, wmax, unsched = 0;
	struct hw_perf_event *hwc;

	bitmap_zero(used_mask, X86_PMC_IDX_MAX);

	if (x86_pmu.start_scheduling)
		x86_pmu.start_scheduling(cpuc);

	for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
		cpuc->event_constraint[i] = NULL;
		c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]);
		cpuc->event_constraint[i] = c;

		wmin = min(wmin, c->weight);
		wmax = max(wmax, c->weight);
	}

	/*
	 * fastpath, try to reuse previous register
	 */
	for (i = 0; i < n; i++) {
		hwc = &cpuc->event_list[i]->hw;
		c = cpuc->event_constraint[i];

		/* never assigned */
		if (hwc->idx == -1)
			break;

		/* constraint still honored */
		if (!test_bit(hwc->idx, c->idxmsk))
			break;

		/* not already used */
		if (test_bit(hwc->idx, used_mask))
			break;

		__set_bit(hwc->idx, used_mask);
		if (assign)
			assign[i] = hwc->idx;
	}

	/* slow path */
	if (i != n) {
		int gpmax = x86_pmu.num_counters;

		/*
		 * Do not allow scheduling of more than half the available
		 * generic counters.
		 *
		 * This helps avoid counter starvation of the sibling thread by
		 * ensuring at most half the counters cannot be in exclusive
		 * mode. There are no designated counters for the limits. Any
		 * N/2 counters can be used. This helps with events with
		 * specific counter constraints.
		 */
		if (is_ht_workaround_enabled() && !cpuc->is_fake &&
		    READ_ONCE(cpuc->excl_cntrs->exclusive_present))
			gpmax /= 2;

		unsched = perf_assign_events(cpuc->event_constraint, n, wmin,
					     wmax, gpmax, assign);
	}

	/*
	 * In case of success (unsched = 0), mark events as committed,
	 * so we do not put_constraint() in case new events are added
	 * and fail to be scheduled
	 *
	 * We invoke the lower level commit callback to lock the resource
	 *
	 * We do not need to do all of this in case we are called to
	 * validate an event group (assign == NULL)
	 */
	if (!unsched && assign) {
		for (i = 0; i < n; i++) {
			e = cpuc->event_list[i];
			e->hw.flags |= PERF_X86_EVENT_COMMITTED;
			if (x86_pmu.commit_scheduling)
				x86_pmu.commit_scheduling(cpuc, i, assign[i]);
		}
	} else {
		for (i = 0; i < n; i++) {
			e = cpuc->event_list[i];
			/*
			 * do not put_constraint() on committed events,
			 * because they are good to go
			 */
			if ((e->hw.flags & PERF_X86_EVENT_COMMITTED))
				continue;

			/*
			 * release events that failed scheduling
			 */
			if (x86_pmu.put_event_constraints)
				x86_pmu.put_event_constraints(cpuc, e);
		}
	}

	if (x86_pmu.stop_scheduling)
		x86_pmu.stop_scheduling(cpuc);

	return unsched ? -EINVAL : 0;
}

/*
 * dogrp: true if we must collect sibling events (the whole group)
 * returns the total number of events, or a negative error code
 */
static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
{
	struct perf_event *event;
	int n, max_count;

	max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed;

	/* current number of events already accepted */
	n = cpuc->n_events;

	if (is_x86_event(leader)) {
		if (n >= max_count)
			return -EINVAL;
		cpuc->event_list[n] = leader;
		n++;
	}
	if (!dogrp)
		return n;

	list_for_each_entry(event, &leader->sibling_list, group_entry) {
		if (!is_x86_event(event) ||
		    event->state <= PERF_EVENT_STATE_OFF)
			continue;

		if (n >= max_count)
			return -EINVAL;

		cpuc->event_list[n] = event;
		n++;
	}
	return n;
}

static inline void x86_assign_hw_event(struct perf_event *event,
				struct cpu_hw_events *cpuc, int i)
{
	struct hw_perf_event *hwc = &event->hw;

	hwc->idx = cpuc->assign[i];
	hwc->last_cpu = smp_processor_id();
	hwc->last_tag = ++cpuc->tags[i];
