Commit 30dd568c authored by Markus Metzger's avatar Markus Metzger Committed by Ingo Molnar

x86, perf_counter, bts: Add BTS support to perfcounters

Implement a performance counter with:

    attr.type           = PERF_TYPE_HARDWARE
    attr.config         = PERF_COUNT_HW_BRANCH_INSTRUCTIONS
    attr.sample_period  = 1

Using branch trace store (BTS) on x86 hardware, if available.

The from and to address for each branch can be sampled using:

    PERF_SAMPLE_IP      for the from address
    PERF_SAMPLE_ADDR    for the to address

[ v2: address review feedback, fix bugs ]
Signed-off-by: default avatarMarkus Metzger <markus.t.metzger@intel.com>
Acked-by: default avatarPeter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: default avatarIngo Molnar <mingo@elte.hu>
parent 183f3b08
......@@ -84,6 +84,16 @@ union cpuid10_edx {
#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b
#define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2)
/*
* We model BTS tracing as another fixed-mode PMC.
*
* We choose a value in the middle of the fixed counter range, since lower
* values are used by actual fixed counters and higher values are used
* to indicate other overflow conditions in the PERF_GLOBAL_STATUS msr.
*/
#define X86_PMC_IDX_FIXED_BTS (X86_PMC_IDX_FIXED + 16)
#ifdef CONFIG_PERF_COUNTERS
extern void init_hw_perf_counters(void);
extern void perf_counters_lapic_init(void);
......
......@@ -6,6 +6,7 @@
* Copyright (C) 2009 Jaswinder Singh Rajput
* Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
* Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
* Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
*
* For licencing details see kernel-base/COPYING
*/
......@@ -20,6 +21,7 @@
#include <linux/sched.h>
#include <linux/uaccess.h>
#include <linux/highmem.h>
#include <linux/cpu.h>
#include <asm/apic.h>
#include <asm/stacktrace.h>
......@@ -27,12 +29,52 @@
static u64 perf_counter_mask __read_mostly;
/* The maximal number of PEBS counters: */
#define MAX_PEBS_COUNTERS 4
/* The size of a BTS record in bytes: */
#define BTS_RECORD_SIZE 24
/* The size of a per-cpu BTS buffer in bytes: */
#define BTS_BUFFER_SIZE (BTS_RECORD_SIZE * 1024)
/* The BTS overflow threshold in bytes from the end of the buffer: */
#define BTS_OVFL_TH (BTS_RECORD_SIZE * 64)
/*
* Bits in the debugctlmsr controlling branch tracing.
*/
#define X86_DEBUGCTL_TR (1 << 6)
#define X86_DEBUGCTL_BTS (1 << 7)
#define X86_DEBUGCTL_BTINT (1 << 8)
#define X86_DEBUGCTL_BTS_OFF_OS (1 << 9)
#define X86_DEBUGCTL_BTS_OFF_USR (1 << 10)
/*
* A debug store configuration.
*
* We only support architectures that use 64bit fields.
*/
struct debug_store {
u64 bts_buffer_base;
u64 bts_index;
u64 bts_absolute_maximum;
u64 bts_interrupt_threshold;
u64 pebs_buffer_base;
u64 pebs_index;
u64 pebs_absolute_maximum;
u64 pebs_interrupt_threshold;
u64 pebs_counter_reset[MAX_PEBS_COUNTERS];
};
struct cpu_hw_counters {
struct perf_counter *counters[X86_PMC_IDX_MAX];
unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
unsigned long interrupts;
int enabled;
struct debug_store *ds;
};
/*
......@@ -57,6 +99,8 @@ struct x86_pmu {
u64 counter_mask;
u64 max_period;
u64 intel_ctrl;
void (*enable_bts)(u64 config);
void (*disable_bts)(void);
};
static struct x86_pmu x86_pmu __read_mostly;
......@@ -576,6 +620,9 @@ x86_perf_counter_update(struct perf_counter *counter,
u64 prev_raw_count, new_raw_count;
s64 delta;
if (idx == X86_PMC_IDX_FIXED_BTS)
return 0;
/*
* Careful: an NMI might modify the previous counter value.
*
......@@ -659,10 +706,109 @@ static void release_pmc_hardware(void)
enable_lapic_nmi_watchdog();
}
static inline bool bts_available(void)
{
return x86_pmu.enable_bts != NULL;
}
static inline void init_debug_store_on_cpu(int cpu)
{
struct debug_store *ds = per_cpu(cpu_hw_counters, cpu).ds;
if (!ds)
return;
wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA,
(u32)((u64)(long)ds), (u32)((u64)(long)ds >> 32));
}
static inline void fini_debug_store_on_cpu(int cpu)
{
if (!per_cpu(cpu_hw_counters, cpu).ds)
return;
wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
}
static void release_bts_hardware(void)
{
int cpu;
if (!bts_available())
return;
get_online_cpus();
for_each_online_cpu(cpu)
fini_debug_store_on_cpu(cpu);
for_each_possible_cpu(cpu) {
struct debug_store *ds = per_cpu(cpu_hw_counters, cpu).ds;
if (!ds)
continue;
per_cpu(cpu_hw_counters, cpu).ds = NULL;
kfree((void *)(long)ds->bts_buffer_base);
kfree(ds);
}
put_online_cpus();
}
static int reserve_bts_hardware(void)
{
int cpu, err = 0;
if (!bts_available())
return -EOPNOTSUPP;
get_online_cpus();
for_each_possible_cpu(cpu) {
struct debug_store *ds;
void *buffer;
err = -ENOMEM;
buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL);
if (unlikely(!buffer))
break;
ds = kzalloc(sizeof(*ds), GFP_KERNEL);
if (unlikely(!ds)) {
kfree(buffer);
break;
}
ds->bts_buffer_base = (u64)(long)buffer;
ds->bts_index = ds->bts_buffer_base;
ds->bts_absolute_maximum =
ds->bts_buffer_base + BTS_BUFFER_SIZE;
ds->bts_interrupt_threshold =
ds->bts_absolute_maximum - BTS_OVFL_TH;
per_cpu(cpu_hw_counters, cpu).ds = ds;
err = 0;
}
if (err)
release_bts_hardware();
else {
for_each_online_cpu(cpu)
init_debug_store_on_cpu(cpu);
}
put_online_cpus();
return err;
}
static void hw_perf_counter_destroy(struct perf_counter *counter)
{
if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) {
release_pmc_hardware();
release_bts_hardware();
mutex_unlock(&pmc_reserve_mutex);
}
}
......@@ -705,6 +851,42 @@ set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr)
return 0;
}
static void intel_pmu_enable_bts(u64 config)
{
unsigned long debugctlmsr;
debugctlmsr = get_debugctlmsr();
debugctlmsr |= X86_DEBUGCTL_TR;
debugctlmsr |= X86_DEBUGCTL_BTS;
debugctlmsr |= X86_DEBUGCTL_BTINT;
if (!(config & ARCH_PERFMON_EVENTSEL_OS))
debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS;
if (!(config & ARCH_PERFMON_EVENTSEL_USR))
debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR;
update_debugctlmsr(debugctlmsr);
}
static void intel_pmu_disable_bts(void)
{
struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
unsigned long debugctlmsr;
if (!cpuc->ds)
return;
debugctlmsr = get_debugctlmsr();
debugctlmsr &=
~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT |
X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR);
update_debugctlmsr(debugctlmsr);
}
/*
* Setup the hardware configuration for a given attr_type
*/
......@@ -721,9 +903,13 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
err = 0;
if (!atomic_inc_not_zero(&active_counters)) {
mutex_lock(&pmc_reserve_mutex);
if (atomic_read(&active_counters) == 0 && !reserve_pmc_hardware())
err = -EBUSY;
else
if (atomic_read(&active_counters) == 0) {
if (!reserve_pmc_hardware())
err = -EBUSY;
else
reserve_bts_hardware();
}
if (!err)
atomic_inc(&active_counters);
mutex_unlock(&pmc_reserve_mutex);
}
......@@ -801,7 +987,18 @@ static void p6_pmu_disable_all(void)
static void intel_pmu_disable_all(void)
{
struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
if (!cpuc->enabled)
return;
cpuc->enabled = 0;
barrier();
wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask))
intel_pmu_disable_bts();
}
static void amd_pmu_disable_all(void)
......@@ -859,7 +1056,25 @@ static void p6_pmu_enable_all(void)
static void intel_pmu_enable_all(void)
{
struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
if (cpuc->enabled)
return;
cpuc->enabled = 1;
barrier();
wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
struct perf_counter *counter =
cpuc->counters[X86_PMC_IDX_FIXED_BTS];
if (WARN_ON_ONCE(!counter))
return;
intel_pmu_enable_bts(counter->hw.config);
}
}
static void amd_pmu_enable_all(void)
......@@ -946,6 +1161,11 @@ p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
static inline void
intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
{
if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
intel_pmu_disable_bts();
return;
}
if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
intel_pmu_disable_fixed(hwc, idx);
return;
......@@ -974,6 +1194,9 @@ x86_perf_counter_set_period(struct perf_counter *counter,
s64 period = hwc->sample_period;
int err, ret = 0;
if (idx == X86_PMC_IDX_FIXED_BTS)
return 0;
/*
* If we are way outside a reasoable range then just skip forward:
*/
......@@ -1056,6 +1279,14 @@ static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
{
if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
if (!__get_cpu_var(cpu_hw_counters).enabled)
return;
intel_pmu_enable_bts(hwc->config);
return;
}
if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
intel_pmu_enable_fixed(hwc, idx);
return;
......@@ -1077,11 +1308,16 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
{
unsigned int event;
event = hwc->config & ARCH_PERFMON_EVENT_MASK;
if (unlikely((event ==
x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) &&
(hwc->sample_period == 1)))
return X86_PMC_IDX_FIXED_BTS;
if (!x86_pmu.num_counters_fixed)
return -1;
event = hwc->config & ARCH_PERFMON_EVENT_MASK;
if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))
return X86_PMC_IDX_FIXED_INSTRUCTIONS;
if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES)))
......@@ -1102,7 +1338,25 @@ static int x86_pmu_enable(struct perf_counter *counter)
int idx;
idx = fixed_mode_idx(counter, hwc);
if (idx >= 0) {
if (idx == X86_PMC_IDX_FIXED_BTS) {
/*
* Try to use BTS for branch tracing. If that is not
* available, try to get a generic counter.
*/
if (unlikely(!cpuc->ds))
goto try_generic;
/*
* Try to get the fixed counter, if that is already taken
* then try to get a generic counter:
*/
if (test_and_set_bit(idx, cpuc->used_mask))
goto try_generic;
hwc->config_base = 0;
hwc->counter_base = 0;
hwc->idx = idx;
} else if (idx >= 0) {
/*
* Try to get the fixed counter, if that is already taken
* then try to get a generic counter:
......@@ -1213,6 +1467,45 @@ void perf_counter_print_debug(void)
local_irq_restore(flags);
}
static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc,
struct perf_sample_data *data)
{
struct debug_store *ds = cpuc->ds;
struct bts_record {
u64 from;
u64 to;
u64 flags;
};
struct perf_counter *counter = cpuc->counters[X86_PMC_IDX_FIXED_BTS];
unsigned long orig_ip = data->regs->ip;
u64 at;
if (!counter)
return;
if (!ds)
return;
for (at = ds->bts_buffer_base;
at < ds->bts_index;
at += sizeof(struct bts_record)) {
struct bts_record *rec = (struct bts_record *)(long)at;
data->regs->ip = rec->from;
data->addr = rec->to;
perf_counter_output(counter, 1, data);
}
ds->bts_index = ds->bts_buffer_base;
data->regs->ip = orig_ip;
data->addr = 0;
/* There's new data available. */
counter->pending_kill = POLL_IN;
}
static void x86_pmu_disable(struct perf_counter *counter)
{
struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
......@@ -1237,6 +1530,15 @@ static void x86_pmu_disable(struct perf_counter *counter)
* that we are disabling:
*/
x86_perf_counter_update(counter, hwc, idx);
/* Drain the remaining BTS records. */
if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
struct perf_sample_data data;
struct pt_regs regs;
data.regs = &regs;
intel_pmu_drain_bts_buffer(cpuc, &data);
}
cpuc->counters[idx] = NULL;
clear_bit(idx, cpuc->used_mask);
......@@ -1264,6 +1566,7 @@ static int intel_pmu_save_and_restart(struct perf_counter *counter)
static void intel_pmu_reset(void)
{
struct debug_store *ds = __get_cpu_var(cpu_hw_counters).ds;
unsigned long flags;
int idx;
......@@ -1281,6 +1584,8 @@ static void intel_pmu_reset(void)
for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
}
if (ds)
ds->bts_index = ds->bts_buffer_base;
local_irq_restore(flags);
}
......@@ -1346,6 +1651,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
cpuc = &__get_cpu_var(cpu_hw_counters);
perf_disable();
intel_pmu_drain_bts_buffer(cpuc, &data);
status = intel_pmu_get_status();
if (!status) {
perf_enable();
......@@ -1547,6 +1853,8 @@ static struct x86_pmu intel_pmu = {
* the generic counter period:
*/
.max_period = (1ULL << 31) - 1,
.enable_bts = intel_pmu_enable_bts,
.disable_bts = intel_pmu_disable_bts,
};
static struct x86_pmu amd_pmu = {
......@@ -1936,3 +2244,8 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
return entry;
}
void hw_perf_counter_setup_online(int cpu)
{
init_debug_store_on_cpu(cpu);
}
......@@ -692,6 +692,8 @@ struct perf_sample_data {
extern int perf_counter_overflow(struct perf_counter *counter, int nmi,
struct perf_sample_data *data);
extern void perf_counter_output(struct perf_counter *counter, int nmi,
struct perf_sample_data *data);
/*
* Return 1 for a software counter, 0 for a hardware counter
......
......@@ -88,6 +88,7 @@ void __weak hw_perf_disable(void) { barrier(); }
void __weak hw_perf_enable(void) { barrier(); }
void __weak hw_perf_counter_setup(int cpu) { barrier(); }
void __weak hw_perf_counter_setup_online(int cpu) { barrier(); }
int __weak
hw_perf_group_sched_in(struct perf_counter *group_leader,
......@@ -2630,7 +2631,7 @@ static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p)
return task_pid_nr_ns(p, counter->ns);
}
static void perf_counter_output(struct perf_counter *counter, int nmi,
void perf_counter_output(struct perf_counter *counter, int nmi,
struct perf_sample_data *data)
{
int ret;
......@@ -4566,6 +4567,11 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
perf_counter_init_cpu(cpu);
break;
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
hw_perf_counter_setup_online(cpu);
break;
case CPU_DOWN_PREPARE:
case CPU_DOWN_PREPARE_FROZEN:
perf_counter_exit_cpu(cpu);
......@@ -4590,6 +4596,8 @@ void __init perf_counter_init(void)
{
perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
(void *)(long)smp_processor_id());
perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE,
(void *)(long)smp_processor_id());
register_cpu_notifier(&perf_cpu_nb);
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment