/*
 * Performance events x86 architecture code
 *
 *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
 *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
 *  Copyright (C) 2009 Jaswinder Singh Rajput
 *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
 *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra
 *  Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
 *  Copyright (C) 2009 Google, Inc., Stephane Eranian
 *
 *  For licencing details see kernel-base/COPYING
 */

#include <linux/perf_event.h>
#include <linux/capability.h>
#include <linux/notifier.h>
#include <linux/hardirq.h>
#include <linux/kprobes.h>
#include <linux/export.h>
#include <linux/init.h>
#include <linux/kdebug.h>
#include <linux/sched.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/bitops.h>
#include <linux/device.h>

#include <asm/apic.h>
#include <asm/stacktrace.h>
#include <asm/nmi.h>
#include <asm/smp.h>
#include <asm/alternative.h>
#include <asm/mmu_context.h>
#include <asm/tlbflush.h>
#include <asm/timer.h>
#include <asm/desc.h>
#include <asm/ldt.h>

#include "perf_event.h"

struct x86_pmu x86_pmu __read_mostly;

DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
	.enabled = 1,
};

struct static_key rdpmc_always_available = STATIC_KEY_INIT_FALSE;

u64 __read_mostly hw_cache_event_ids
				[PERF_COUNT_HW_CACHE_MAX]
				[PERF_COUNT_HW_CACHE_OP_MAX]
				[PERF_COUNT_HW_CACHE_RESULT_MAX];
u64 __read_mostly hw_cache_extra_regs
				[PERF_COUNT_HW_CACHE_MAX]
				[PERF_COUNT_HW_CACHE_OP_MAX]
				[PERF_COUNT_HW_CACHE_RESULT_MAX];

/*
 * Propagate event elapsed time into the generic event.
 * Can only be executed on the CPU where the event is active.
 * Returns the delta events processed.
 */
u64 x86_perf_event_update(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	int shift = 64 - x86_pmu.cntval_bits;
	u64 prev_raw_count, new_raw_count;
	int idx = hwc->idx;
	s64 delta;

	if (idx == INTEL_PMC_IDX_FIXED_BTS)
		return 0;

	/*
	 * Careful: an NMI might modify the previous event value.
	 *
	 * Our tactic to handle this is to first atomically read and
	 * exchange a new raw count - then add that new-prev delta
	 * count to the generic event atomically:
	 */
again:
	prev_raw_count = local64_read(&hwc->prev_count);
	rdpmcl(hwc->event_base_rdpmc, new_raw_count);

	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
					new_raw_count) != prev_raw_count)
		goto again;

	/*
	 * Now we have the new raw value and have updated the prev
	 * timestamp already. We can now calculate the elapsed delta
	 * (event-)time and add that to the generic event.
	 *
	 * Careful, not all hw sign-extends above the physical width
	 * of the count.
	 */
	delta = (new_raw_count << shift) - (prev_raw_count << shift);
	delta >>= shift;

	local64_add(delta, &event->count);
	local64_sub(delta, &hwc->period_left);

	return new_raw_count;
}

/*
 * Find and validate any extra registers to set up.
 */
static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
{
	struct hw_perf_event_extra *reg;
	struct extra_reg *er;

	reg = &event->hw.extra_reg;

	if (!x86_pmu.extra_regs)
		return 0;

	for (er = x86_pmu.extra_regs; er->msr; er++) {
		if (er->event != (config & er->config_mask))
			continue;
		if (event->attr.config1 & ~er->valid_mask)
			return -EINVAL;
		/* Check if the extra msrs can be safely accessed */
		if (!er->extra_msr_access)
			return -ENXIO;

		reg->idx = er->idx;
		reg->config = event->attr.config1;
		reg->reg = er->msr;
		break;
	}
	return 0;
}

static atomic_t active_events;
static atomic_t pmc_refcount;
static DEFINE_MUTEX(pmc_reserve_mutex);

#ifdef CONFIG_X86_LOCAL_APIC

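/*
 * Reserve every counter and event-select MSR through the perfctr-NMI
 * reservation layer so another in-kernel user cannot claim them; roll
 * back any registers already taken if one reservation fails.
 */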
static bool reserve_pmc_hardware(void)
{
	int i;

	for (i = 0; i < x86_pmu.num_counters; i++) {
		if (!reserve_perfctr_nmi(x86_pmu_event_addr(i)))
			goto perfctr_fail;
	}

	for (i = 0; i < x86_pmu.num_counters; i++) {
		if (!reserve_evntsel_nmi(x86_pmu_config_addr(i)))
			goto eventsel_fail;
	}

	return true;

eventsel_fail:
	for (i--; i >= 0; i--)
		release_evntsel_nmi(x86_pmu_config_addr(i));

	i = x86_pmu.num_counters;

perfctr_fail:
	for (i--; i >= 0; i--)
		release_perfctr_nmi(x86_pmu_event_addr(i));

	return false;
}

static void release_pmc_hardware(void)
{
	int i;

	for (i = 0; i < x86_pmu.num_counters; i++) {
		release_perfctr_nmi(x86_pmu_event_addr(i));
		release_evntsel_nmi(x86_pmu_config_addr(i));
	}
}

#else

static bool reserve_pmc_hardware(void) { return true; }
static void release_pmc_hardware(void) {}

#endif

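/*
 * Sanity-check the PMU: complain if the BIOS left any counter enabled,
 * and verify that the counter MSRs can actually be read back after a
 * write (hypervisors that silently swallow the writes mean "no PMU").
 */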
static bool check_hw_exists(void)
{
	u64 val, val_fail, val_new = ~0;
	int i, reg, reg_fail, ret = 0;
	int bios_fail = 0;
	int reg_safe = -1;

	/*
	 * Check to see if the BIOS enabled any of the counters, if so
	 * complain and bail.
	 */
	for (i = 0; i < x86_pmu.num_counters; i++) {
		reg = x86_pmu_config_addr(i);
		ret = rdmsrl_safe(reg, &val);
		if (ret)
			goto msr_fail;
		if (val & ARCH_PERFMON_EVENTSEL_ENABLE) {
			bios_fail = 1;
			val_fail = val;
			reg_fail = reg;
		} else {
			reg_safe = i;
		}
	}

	if (x86_pmu.num_counters_fixed) {
		reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
		ret = rdmsrl_safe(reg, &val);
		if (ret)
			goto msr_fail;
		for (i = 0; i < x86_pmu.num_counters_fixed; i++) {
			if (val & (0x03 << i*4)) {
				bios_fail = 1;
				val_fail = val;
				reg_fail = reg;
			}
		}
	}

	/*
	 * If all the counters are enabled, the below test will always
	 * fail.  The tools will also become useless in this scenario.
	 * Just fail and disable the hardware counters.
	 */

	if (reg_safe == -1) {
		reg = reg_safe;
		goto msr_fail;
	}

	/*
	 * Read the current value, change it and read it back to see if it
	 * matches, this is needed to detect certain hardware emulators
	 * (qemu/kvm) that don't trap on the MSR access and always return 0s.
	 */
	reg = x86_pmu_event_addr(reg_safe);
	if (rdmsrl_safe(reg, &val))
		goto msr_fail;
	val ^= 0xffffUL;
	ret = wrmsrl_safe(reg, val);
	ret |= rdmsrl_safe(reg, &val_new);
	if (ret || val != val_new)
		goto msr_fail;

	/*
	 * We still allow the PMU driver to operate:
	 */
	if (bios_fail) {
		pr_cont("Broken BIOS detected, complain to your hardware vendor.\n");
		pr_err(FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n",
			      reg_fail, val_fail);
	}

	return true;

msr_fail:
	if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) {
		pr_cont("PMU not available due to virtualization, using software events only.\n");
	} else {
		pr_cont("Broken PMU hardware detected, using software events only.\n");
		pr_err("Failed to access perfctr msr (MSR %x is %Lx)\n",
		       reg, val_new);
	}

	return false;
}

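/*
 * Event teardown: drop the hardware reservation and the active-event
 * count taken when the event was initialized.
 */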
static void hw_perf_event_destroy(struct perf_event *event)
{
	x86_release_hardware();
	atomic_dec(&active_events);
}

void hw_perf_lbr_event_destroy(struct perf_event *event)
{
	hw_perf_event_destroy(event);

	/* undo the lbr/bts event accounting */
	x86_del_exclusive(x86_lbr_exclusive_lbr);
}

static inline int x86_pmu_initialized(void)
{
	return x86_pmu.handle_irq != NULL;
}

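/*
 * Decode a PERF_TYPE_HW_CACHE config (cache type, op and result packed
 * into the low 24 bits) into the model-specific encoding taken from
 * hw_cache_event_ids, plus any extra register from hw_cache_extra_regs.
 */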
static inline int
set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
{
	struct perf_event_attr *attr = &event->attr;
	unsigned int cache_type, cache_op, cache_result;
	u64 config, val;

	config = attr->config;

	cache_type = (config >>  0) & 0xff;
	if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
		return -EINVAL;

	cache_op = (config >>  8) & 0xff;
	if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
		return -EINVAL;

	cache_result = (config >> 16) & 0xff;
	if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
		return -EINVAL;

	val = hw_cache_event_ids[cache_type][cache_op][cache_result];

	if (val == 0)
		return -ENOENT;

	if (val == -1)
		return -EINVAL;

	hwc->config |= val;
	attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result];
	return x86_pmu_extra_regs(val, event);
}

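/*
 * Take a reference on the PMC hardware; the first user reserves the
 * counter MSRs and the DS (PEBS/BTS) buffers, later users only bump
 * pmc_refcount.
 */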
int x86_reserve_hardware(void)
{
	int err = 0;

	if (!atomic_inc_not_zero(&pmc_refcount)) {
		mutex_lock(&pmc_reserve_mutex);
		if (atomic_read(&pmc_refcount) == 0) {
			if (!reserve_pmc_hardware())
				err = -EBUSY;
			else
				reserve_ds_buffers();
		}
		if (!err)
			atomic_inc(&pmc_refcount);
		mutex_unlock(&pmc_reserve_mutex);
	}

	return err;
}

void x86_release_hardware(void)
{
	if (atomic_dec_and_mutex_lock(&pmc_refcount, &pmc_reserve_mutex)) {
		release_pmc_hardware();
		release_ds_buffers();
		mutex_unlock(&pmc_reserve_mutex);
	}
}

/*
 * Check whether we can create an event of a certain type (i.e. that no
 * conflicting events are present).
 */
int x86_add_exclusive(unsigned int what)
{
	int i;

	if (x86_pmu.lbr_pt_coexist)
		return 0;

	if (!atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what])) {
		mutex_lock(&pmc_reserve_mutex);
		for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++) {
			if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i]))
				goto fail_unlock;
		}
		atomic_inc(&x86_pmu.lbr_exclusive[what]);
		mutex_unlock(&pmc_reserve_mutex);
	}

	atomic_inc(&active_events);
	return 0;

fail_unlock:
	mutex_unlock(&pmc_reserve_mutex);
	return -EBUSY;
}

void x86_del_exclusive(unsigned int what)
{
	if (x86_pmu.lbr_pt_coexist)
		return;

	atomic_dec(&x86_pmu.lbr_exclusive[what]);
	atomic_dec(&active_events);
}

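/*
 * Translate the generic attr->config into the hardware event-select
 * encoding, set the default sampling period, and handle the RAW,
 * HW_CACHE and BTS special cases.
 */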
int x86_setup_perfctr(struct perf_event *event)
{
	struct perf_event_attr *attr = &event->attr;
	struct hw_perf_event *hwc = &event->hw;
	u64 config;

	if (!is_sampling_event(event)) {
		hwc->sample_period = x86_pmu.max_period;
		hwc->last_period = hwc->sample_period;
		local64_set(&hwc->period_left, hwc->sample_period);
	}

	if (attr->type == PERF_TYPE_RAW)
		return x86_pmu_extra_regs(event->attr.config, event);

	if (attr->type == PERF_TYPE_HW_CACHE)
		return set_ext_hw_attr(hwc, event);

	if (attr->config >= x86_pmu.max_events)
		return -EINVAL;

	/*
	 * The generic map:
	 */
	config = x86_pmu.event_map(attr->config);

	if (config == 0)
		return -ENOENT;

	if (config == -1LL)
		return -EINVAL;

	/*
	 * Branch tracing:
	 */
	if (attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS &&
	    !attr->freq && hwc->sample_period == 1) {
		/* BTS is not supported by this architecture. */
		if (!x86_pmu.bts_active)
			return -EOPNOTSUPP;

		/* BTS is currently only allowed for user-mode. */
		if (!attr->exclude_kernel)
			return -EOPNOTSUPP;

		/* disallow bts if conflicting events are present */
		if (x86_add_exclusive(x86_lbr_exclusive_lbr))
			return -EBUSY;

		event->destroy = hw_perf_lbr_event_destroy;
	}

	hwc->config |= config;

	return 0;
}

/*
 * Check that branch_sample_type is compatible with the settings needed
 * for precise_ip > 1, which implies using the LBR to capture ALL taken
 * branches at the privilege levels of the measurement.
 */
static inline int precise_br_compat(struct perf_event *event)
{
	u64 m = event->attr.branch_sample_type;
	u64 b = 0;

	/* must capture all branches */
	if (!(m & PERF_SAMPLE_BRANCH_ANY))
		return 0;

	m &= PERF_SAMPLE_BRANCH_KERNEL | PERF_SAMPLE_BRANCH_USER;

	if (!event->attr.exclude_user)
		b |= PERF_SAMPLE_BRANCH_USER;

	if (!event->attr.exclude_kernel)
		b |= PERF_SAMPLE_BRANCH_KERNEL;

	/*
	 * ignore PERF_SAMPLE_BRANCH_HV, not supported on x86
	 */

	return m == b;
}

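/*
 * Common attribute validation and base config bits: check precise_ip
 * against the PEBS capabilities, reconcile branch sampling with PEBS
 * fixups, and build the initial event-select value (interrupt enable,
 * USR/OS bits, raw event mask).
 */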
int x86_pmu_hw_config(struct perf_event *event)
{
	if (event->attr.precise_ip) {
		int precise = 0;

		/* Support for constant skid */
		if (x86_pmu.pebs_active && !x86_pmu.pebs_broken) {
			precise++;

			/* Support for IP fixup */
			if (x86_pmu.lbr_nr || x86_pmu.intel_cap.pebs_format >= 2)
				precise++;

			if (x86_pmu.pebs_prec_dist)
				precise++;
		}

		if (event->attr.precise_ip > precise)
			return -EOPNOTSUPP;
	}
	/*
	 * check that PEBS LBR correction does not conflict with
	 * whatever the user is asking with attr->branch_sample_type
	 */
	if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) {
		u64 *br_type = &event->attr.branch_sample_type;

		if (has_branch_stack(event)) {
			if (!precise_br_compat(event))
				return -EOPNOTSUPP;

			/* branch_sample_type is compatible */

		} else {
			/*
			 * user did not specify  branch_sample_type
			 *
			 * For PEBS fixups, we capture all
			 * the branches at the priv level of the
			 * event.
			 */
			*br_type = PERF_SAMPLE_BRANCH_ANY;

			if (!event->attr.exclude_user)
				*br_type |= PERF_SAMPLE_BRANCH_USER;

			if (!event->attr.exclude_kernel)
				*br_type |= PERF_SAMPLE_BRANCH_KERNEL;
		}
	}

	if (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK)
		event->attach_state |= PERF_ATTACH_TASK_DATA;

	/*
	 * Generate PMC IRQs:
	 * (keep 'enabled' bit clear for now)
	 */
	event->hw.config = ARCH_PERFMON_EVENTSEL_INT;

	/*
	 * Count user and OS events unless requested not to
	 */
	if (!event->attr.exclude_user)
		event->hw.config |= ARCH_PERFMON_EVENTSEL_USR;
	if (!event->attr.exclude_kernel)
		event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;

	if (event->attr.type == PERF_TYPE_RAW)
		event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;

	if (event->attr.sample_period && x86_pmu.limit_period) {
		if (x86_pmu.limit_period(event, event->attr.sample_period) >
				event->attr.sample_period)
			return -EINVAL;
	}

	return x86_setup_perfctr(event);
}

/*
 * Setup the hardware configuration for a given attr_type
 */
static int __x86_pmu_event_init(struct perf_event *event)
{
	int err;

	if (!x86_pmu_initialized())
		return -ENODEV;

	err = x86_reserve_hardware();
	if (err)
		return err;

	atomic_inc(&active_events);
	event->destroy = hw_perf_event_destroy;

	event->hw.idx = -1;
	event->hw.last_cpu = -1;
	event->hw.last_tag = ~0ULL;

	/* mark unused */
	event->hw.extra_reg.idx = EXTRA_REG_NONE;
	event->hw.branch_reg.idx = EXTRA_REG_NONE;

	return x86_pmu.hw_config(event);
}

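/*
 * Clear the enable bit in the event-select MSR of every active counter.
 */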
void x86_pmu_disable_all(void)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	int idx;

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		u64 val;

		if (!test_bit(idx, cpuc->active_mask))
			continue;
		rdmsrl(x86_pmu_config_addr(idx), val);
		if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
			continue;
		val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
		wrmsrl(x86_pmu_config_addr(idx), val);
	}
}

/*
 * A PMI may still land after enabled has been set to 0; it can hit either
 * before or after disable_all.
 *
 * If the PMI hits before disable_all, the PMU is disabled in the NMI handler
 * and will not be re-enabled there, because enabled=0. After handling the
 * NMI, disable_all is called, which does not change the state either. If the
 * PMI hits after disable_all, the PMU is already disabled before entering the
 * NMI handler, and the NMI handler does not change the state either.
 *
 * Either way, the situation is harmless.
 */
static void x86_pmu_disable(struct pmu *pmu)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);

	if (!x86_pmu_initialized())
		return;

	if (!cpuc->enabled)
		return;

	cpuc->n_added = 0;
	cpuc->enabled = 0;
	barrier();

	x86_pmu.disable_all();
}

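/*
 * Set the enable bit for every active counter, re-programming the config
 * saved in the per-counter hw_perf_event.
 */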
void x86_pmu_enable_all(int added)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	int idx;

	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
		struct hw_perf_event *hwc = &cpuc->events[idx]->hw;

		if (!test_bit(idx, cpuc->active_mask))
			continue;

		__x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
	}
}

static struct pmu pmu;

static inline int is_x86_event(struct perf_event *event)
{
	return event->pmu == &pmu;
}

/*
 * Event scheduler state:
 *
 * Assign events iterating over all events and counters, beginning
 * with events with least weights first. Keep the current iterator
 * state in struct sched_state.
 */
struct sched_state {
	int	weight;
	int	event;		/* event index */
	int	counter;	/* counter index */
	int	unassigned;	/* number of events to be assigned left */
	int	nr_gp;		/* number of GP counters used */
	unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
};

/* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */
#define	SCHED_STATES_MAX	2

struct perf_sched {
	int			max_weight;
	int			max_events;
	int			max_gp;
	int			saved_states;
	struct event_constraint	**constraints;
	struct sched_state	state;
	struct sched_state	saved[SCHED_STATES_MAX];
};

/*
 * Initialize the iterator that runs through all events and counters.
 */
static void perf_sched_init(struct perf_sched *sched, struct event_constraint **constraints,
			    int num, int wmin, int wmax, int gpmax)
{
	int idx;

	memset(sched, 0, sizeof(*sched));
	sched->max_events	= num;
	sched->max_weight	= wmax;
	sched->max_gp		= gpmax;
	sched->constraints	= constraints;

	for (idx = 0; idx < num; idx++) {
		if (constraints[idx]->weight == wmin)
			break;
	}

	sched->state.event	= idx;		/* start with min weight */
	sched->state.weight	= wmin;
	sched->state.unassigned	= num;
}

static void perf_sched_save_state(struct perf_sched *sched)
{
	if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX))
		return;

	sched->saved[sched->saved_states] = sched->state;
	sched->saved_states++;
}

static bool perf_sched_restore_state(struct perf_sched *sched)
{
	if (!sched->saved_states)
		return false;

	sched->saved_states--;
	sched->state = sched->saved[sched->saved_states];

	/* continue with next counter: */
	clear_bit(sched->state.counter++, sched->state.used);

	return true;
}

/*
 * Select a counter for the current event to schedule. Return true on
 * success.
 */
static bool __perf_sched_find_counter(struct perf_sched *sched)
{
	struct event_constraint *c;
	int idx;

	if (!sched->state.unassigned)
		return false;

	if (sched->state.event >= sched->max_events)
		return false;

	c = sched->constraints[sched->state.event];
	/* Prefer fixed purpose counters */
	if (c->idxmsk64 & (~0ULL << INTEL_PMC_IDX_FIXED)) {
		idx = INTEL_PMC_IDX_FIXED;
		for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) {
			if (!__test_and_set_bit(idx, sched->state.used))
				goto done;
		}
	}

	/* Grab the first unused counter starting with idx */
	idx = sched->state.counter;
	for_each_set_bit_from(idx, c->idxmsk, INTEL_PMC_IDX_FIXED) {
		if (!__test_and_set_bit(idx, sched->state.used)) {
			if (sched->state.nr_gp++ >= sched->max_gp)
				return false;

			goto done;
		}
	}

	return false;

done:
	sched->state.counter = idx;

	if (c->overlap)
		perf_sched_save_state(sched);

	return true;
}

static bool perf_sched_find_counter(struct perf_sched *sched)
{
	while (!__perf_sched_find_counter(sched)) {
		if (!perf_sched_restore_state(sched))
			return false;
	}

	return true;
}

/*
 * Go through all unassigned events and find the next one to schedule.
 * Take events with the least weight first. Return true on success.
 */
static bool perf_sched_next_event(struct perf_sched *sched)
{
	struct event_constraint *c;

	if (!sched->state.unassigned || !--sched->state.unassigned)
		return false;

	do {
		/* next event */
		sched->state.event++;
		if (sched->state.event >= sched->max_events) {
			/* next weight */
			sched->state.event = 0;
			sched->state.weight++;
			if (sched->state.weight > sched->max_weight)
				return false;
		}
		c = sched->constraints[sched->state.event];
	} while (c->weight != sched->state.weight);

	sched->state.counter = 0;	/* start with first counter */

	return true;
}

/*
 * Assign a counter for each event.
 */
int perf_assign_events(struct event_constraint **constraints, int n,
			int wmin, int wmax, int gpmax, int *assign)
{
	struct perf_sched sched;

	perf_sched_init(&sched, constraints, n, wmin, wmax, gpmax);

	do {
		if (!perf_sched_find_counter(&sched))
			break;	/* failed */
		if (assign)
			assign[sched.state.event] = sched.state.counter;
	} while (perf_sched_next_event(&sched));

	return sched.state.unassigned;
}
EXPORT_SYMBOL_GPL(perf_assign_events);

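/*
 * Assign a hardware counter to each of the n collected events: first try
 * the fast path of reusing the previous assignments, then fall back to
 * the constraint-aware scheduler in perf_assign_events().
 */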
int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
{
	struct event_constraint *c;
	unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
	struct perf_event *e;
	int i, wmin, wmax, unsched = 0;
	struct hw_perf_event *hwc;

	bitmap_zero(used_mask, X86_PMC_IDX_MAX);

	if (x86_pmu.start_scheduling)
		x86_pmu.start_scheduling(cpuc);

	for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
		cpuc->event_constraint[i] = NULL;
		c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]);
		cpuc->event_constraint[i] = c;

		wmin = min(wmin, c->weight);
		wmax = max(wmax, c->weight);
	}

	/*
	 * fastpath, try to reuse previous register
	 */
	for (i = 0; i < n; i++) {
		hwc = &cpuc->event_list[i]->hw;
		c = cpuc->event_constraint[i];

		/* never assigned */
		if (hwc->idx == -1)
			break;

		/* constraint still honored */
		if (!test_bit(hwc->idx, c->idxmsk))
			break;

		/* not already used */
		if (test_bit(hwc->idx, used_mask))
			break;

		__set_bit(hwc->idx, used_mask);
		if (assign)
			assign[i] = hwc->idx;
	}

	/* slow path */
	if (i != n) {
		int gpmax = x86_pmu.num_counters;

		/*
		 * Do not allow scheduling of more than half the available
		 * generic counters.
		 *
		 * This helps avoid counter starvation of the sibling thread
		 * by ensuring at most half the counters cannot be in
		 * exclusive mode. There are no designated counters for the
		 * limits; any N/2 counters can be used. This helps with
		 * events with specific counter constraints.
		 */
		if (is_ht_workaround_enabled() && !cpuc->is_fake &&
		    READ_ONCE(cpuc->excl_cntrs->exclusive_present))
			gpmax /= 2;

		unsched = perf_assign_events(cpuc->event_constraint, n, wmin,
					     wmax, gpmax, assign);
	}

	/*
	 * In case of success (unsched = 0), mark events as committed,
	 * so we do not put_constraint() in case new events are added
	 * and fail to be scheduled
	 *
	 * We invoke the lower level commit callback to lock the resource
	 *
	 * We do not need to do all of this in case we are called to
	 * validate an event group (assign == NULL)
	 */
	if (!unsched && assign) {
		for (i = 0; i < n; i++) {
			e = cpuc->event_list[i];
			e->hw.flags |= PERF_X86_EVENT_COMMITTED;
			if (x86_pmu.commit_scheduling)
				x86_pmu.commit_scheduling(cpuc, i, assign[i]);
		}
	} else {
		for (i = 0; i < n; i++) {
			e = cpuc->event_list[i];
			/*
			 * do not put_constraint() on committed events,
			 * because they are good to go
			 */
			if ((e->hw.flags & PERF_X86_EVENT_COMMITTED))
				continue;

			/*
			 * release events that failed scheduling
			 */
			if (x86_pmu.put_event_constraints)
				x86_pmu.put_event_constraints(cpuc, e);
		}
	}

	if (x86_pmu.stop_scheduling)
		x86_pmu.stop_scheduling(cpuc);

	return unsched ? -EINVAL : 0;
}

/*
 * dogrp: true if we must also collect the group leader's sibling events
 * returns the total number of events, or a negative error code
 */
static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
{
	struct perf_event *event;
	int n, max_count;

	max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed;

	/* current number of events already accepted */
	n = cpuc->n_events;

	if (is_x86_event(leader)) {
		if (n >= max_count)
			return -EINVAL;
		cpuc->event_list[n] = leader;
		n++;
	}
	if (!dogrp)
		return n;

	list_for_each_entry(event, &leader->sibling_list, group_entry) {
		if (!is_x86_event(event) ||
		    event->state <= PERF_EVENT_STATE_OFF)
			continue;

		if (n >= max_count)
			return -EINVAL;

		cpuc->event_list[n] = event;
		n++;
	}
	return n;
}

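/*
 * Propagate the scheduler's counter assignment into the event's hardware
 * state (counter index, last CPU and tag used by the scheduling fast path).
 */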
static inline void x86_assign_hw_event(struct perf_event *event,
				struct cpu_hw_events *cpuc, int i)
{
	struct hw_perf_event *hwc = &event->hw;

	hwc->idx = cpuc->assign[i];
	hwc->last_cpu = smp_processor_id();
	hwc->last_tag = ++cpuc->tags[i];