From 1cba7ba939d83257114e7beebbabb2660662fe57 Mon Sep 17 00:00:00 2001
From: Ian Romanick
Date: Fri, 12 Jan 2018 12:00:19 -0800
Subject: [PATCH] report.py: Gather and log some statistics about the helped /
hurt data
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
A previous method that I tried was treating the helped and hurt data
as samples from separate populations, and these were compared using a
T-test. Since we're applying a common change to "both" sample sets, I
don't think this is a valid analysis.
Instead, I think it is more valid to look at the entire change set as a
sample of a single population and compare the mean of that sample to
zero. Only the changed samples are examined because the vast majority
of the sample is unaffected. If the mean of the entire sample was used,
the mean confidence interval would always include zero. It would be
more valid, I believe, to include shaders that were affected but had no
change in instruction or cycle count. I don't know of a way to
determine this using the existing shader-db infrastructure.
These two different methods communicate two different things. The first
tries to determine whether the shaders hurt are affected more or less
than the shaders helped. This doesn't capture any information about the
number of shaders affected. There might be 1,000 shaders helped and 3
hurt, and the conclusion could still be negative. The second method
tries to determine whether the sample set is overall helped or hurt.
This allows the magnitude of hurt (or help) to be overwhelmed by the
number of helped (or hurt) shaders. There could be 1,000 shaders helped
by 1 instruction and 3 shaders hurt by 50 instructions, and the
conclusion would be positive.
Comparing the declared result with the mean and median, I feel like the
second method matches my intuitive interpretation of the data. Here is
a result of the T-test:
total cycles in shared programs: 559379982 -> 559342256 (<.01%)
cycles in affected programs: 10791218 -> 10753492 (-0.35%)
helped: 1952
HURT: 908
helped stats (abs) min: 1 max: 5762 x̄: 37.71 x̃: 16
helped stats (rel) min: <.01% max: 28.57% x̄: 3.54% x̃: 2.09%
HURT stats (abs) min: 1 max: 573 x̄: 39.51 x̃: 10
HURT stats (rel) min: <.01% max: 27.78% x̄: 1.93% x̃: 0.66%
abs t: -0.34, p: 73.70%
rel t: 9.88, p: <.01%
Inconclusive result (cannot disprove both null hypothoses).
And here is the result of the mean confidence interval tests on the
same data:
total cycles in shared programs: 559378112 -> 559340386 (<.01%)
cycles in affected programs: 10791218 -> 10753492 (-0.35%)
helped: 1952
HURT: 908
helped stats (abs) min: 1 max: 5762 x̄: 37.71 x̃: 16
helped stats (rel) min: <.01% max: 28.57% x̄: 3.54% x̃: 2.09%
HURT stats (abs) min: 1 max: 573 x̄: 39.51 x̃: 10
HURT stats (rel) min: <.01% max: 27.78% x̄: 1.93% x̃: 0.66%
95% mean confidence interval for cycles value: -18.27 -8.11
95% mean confidence interval for cycles %-change: -1.98% -1.63%
Cycles are helped.
Since the confidence interval is calculated based on the sample mean and
the sample standard deviation, it can include values outside the sample
minimum and maximum. This can lead to unexpected conclusions. In this
case all of the affected shaders were helped, but the result is
inconclusive.
total instructions in shared programs: 7886959 -> 7886925 (<.01%)
instructions in affected programs: 1340 -> 1306 (-2.54%)
helped: 4
HURT: 0
helped stats (abs) min: 2 max: 15 x̄: 8.50 x̃: 8
helped stats (rel) min: 0.63% max: 4.30% x̄: 2.45% x̃: 2.43%
95% mean confidence interval for instructions value: -20.44 3.44
95% mean confidence interval for instructions %-change: -5.78% 0.89%
Inconclusive result (value mean confidence interval includes 0).
v2: Don't log statistics for spill or fills. Simplify T-test logging.
v3: Use confidence interval instead.
Acked-by: Jason Ekstrand jason@jlekstrand.net
---
report.py | 101 +++++++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 100 insertions(+), 1 deletion(-)
diff --git a/report.py b/report.py
index b306220..6d53052 100755
--- a/report.py
+++ b/report.py
@@ -2,6 +2,9 @@
import re
import argparse
+import statistics
+from scipy import stats
+import numpy as np
def get_results(filename):
@@ -63,6 +66,35 @@ def get_result_string(p, b, a):
def split_list(string):
return string.split(",")
+
+def gather_statistics(changes, before, after, m):
+    stats = (0.0, 0, 0, 0, 0.0, 0.0, 0.0, 0.0)
+
+ num = len(changes)
+ if num > 0:
+ absolute = [abs(before[p][m] - after[p][m]) for p in changes]
+ relative = [0 if before[p][m] == 0 else abs(before[p][m] - after[p][m]) / before[p][m] for p in changes]
+
+ stats = (statistics.mean(absolute),
+ statistics.median(absolute),
+ min(absolute),
+ max(absolute),
+ statistics.mean(relative),
+ statistics.median(relative),
+ min(relative),
+ max(relative))
+
+ return stats
+
+
+def mean_confidence_interval(data, confidence=0.95):
+ a = 1.0 * np.array(data)
+ n = len(a)
+ m, se = np.mean(a), stats.sem(a)
+ h = se * stats.t.ppf((1 + confidence) / 2., n - 1)
+ return m, m - h, m + h
+
+
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--measurements", "-m", type=split_list,
@@ -82,6 +114,9 @@ def main():
affected_after = {}
num_hurt = {}
num_helped = {}
+ helped_statistics = {}
+ hurt_statistics = {}
+ confidence_interval = {}
for m in args.measurements:
total_before[m] = 0
@@ -135,9 +170,21 @@ def main():
if hurt:
print("")
+ helped_statistics[m] = gather_statistics(helped, args.before, args.after, m)
+ hurt_statistics[m] = gather_statistics(hurt, args.before, args.after, m)
+
num_helped[m] = len(helped)
num_hurt[m] = len(hurt)
+ # Statistics for spills and fills is usually meaningless.
+ if m in ["spills", "fills"]:
+ continue
+
+ if num_hurt[m] + num_helped[m] > 3:
+ A = [args.after[p][m] - args.before[p][m] for p in helped + hurt]
+ B = [0 if args.before[p][m] == 0 else (args.after[p][m] - args.before[p][m]) / args.before[p][m] for p in helped + hurt]
+
+ confidence_interval[m] = (mean_confidence_interval(A), mean_confidence_interval(B))
lost = []
gained = []
@@ -172,13 +219,65 @@ def main():
print("total {0} in shared programs: {1}\n"
"{0} in affected programs: {2}\n"
"helped: {3}\n"
- "HURT: {4}\n".format(
+ "HURT: {4}".format(
m,
change(total_before[m], total_after[m]),
change(affected_before[m], affected_after[m]),
num_helped[m],
num_hurt[m]))
+ # Statistics for spills and fills is usually meaningless.
+ if m in ["spills", "fills"]:
+ print()
+ continue
+
+ if num_helped[m] > 2 or (num_helped[m] > 0 and num_hurt[m] > 0):
+ (avg_abs, med_abs, lo_abs, hi_abs, avg_rel, med_rel, lo_rel, hi_rel) = helped_statistics[m]
+
+ print("helped stats (abs) min: {} max: {} x\u0304: {:.2f} x\u0303: {}".format(
+ lo_abs, hi_abs, avg_abs, int(med_abs)))
+ print("helped stats (rel) min: {} max: {} x\u0304: {} x\u0303: {}".format(
+ format_percent(lo_rel),
+ format_percent(hi_rel),
+ format_percent(avg_rel),
+ format_percent(med_rel)))
+
+ if num_hurt[m] > 2 or (num_hurt[m] > 0 and num_helped[m] > 0):
+ (avg_abs, med_abs, lo_abs, hi_abs, avg_rel, med_rel, lo_rel, hi_rel) = hurt_statistics[m]
+
+ print("HURT stats (abs) min: {} max: {} x\u0304: {:.2f} x\u0303: {}".format(
+ lo_abs, hi_abs, avg_abs, int(med_abs)))
+ print("HURT stats (rel) min: {} max: {} x\u0304: {} x\u0303: {}".format(
+ format_percent(lo_rel),
+ format_percent(hi_rel),
+ format_percent(avg_rel),
+ format_percent(med_rel)))
+
+ if m in confidence_interval:
+ print("95% mean confidence interval for {} value: {:.2f} {:.2f}".format(m,
+ confidence_interval[m][0][1],
+ confidence_interval[m][0][2]))
+ print("95% mean confidence interval for {} %-change: {} {}".format(m,
+ format_percent(confidence_interval[m][1][1]),
+ format_percent(confidence_interval[m][1][2])))
+
+ # Be very, very conservative about applying results
+ # based on the confidence intervals. Neither interval
+ # can include zero, and both intervals must be on the
+ # same side of zero.
+ if confidence_interval[m][0][1] < 0 and confidence_interval[m][0][2] > 0:
+ print("Inconclusive result (value mean confidence interval includes 0).");
+ elif confidence_interval[m][1][1] < 0 and confidence_interval[m][1][2] > 0:
+ print("Inconclusive result (%-change mean confidence interval includes 0).");
+ elif (confidence_interval[m][0][1] < 0) != (confidence_interval[m][1][1] < 0):
+ print("Inconclusive result (value mean confidence interval and %-change mean confidence interval disagree).");
+ elif confidence_interval[m][0][1] < 0:
+ print("{} are helped.".format(m.capitalize()))
+ else:
+ print("{} are HURT.".format(m.capitalize()))
+
+ print()
+
if lost or gained or not args.changes_only:
print("LOST: " + str(len(lost)))
--
2.22.0