ref: b2898a9ade42f623cdc6ab526de657fd078660d9
parent: 2b08f89076d1e93339fbbcc10e3298a0eec66bd6
author: Luc Trudeau <[email protected]>
date: Tue May 22 10:16:15 EDT 2018
Bench Class For More Robust Speed Tests To make speed testing more robust, the AbstractBench runs the desired code multiple times and reports the median run time with the mean absolute deviation around the median. To use the AbstractBench, simply add it as a parent to your test class, and implement the run() method (with the code you want to benchmark). Sample output for VP9QuantizeTest [ BENCH ] Bypass calculations 4x4 165.8 ms ( ±1.0 ms ) [ BENCH ] Full calculations 4x4 165.8 ms ( ±0.9 ms ) [ BENCH ] Bypass calculations 8x8 129.7 ms ( ±0.9 ms ) [ BENCH ] Full calculations 8x8 130.3 ms ( ±1.4 ms ) [ BENCH ] Bypass calculations 16x16 110.3 ms ( ±1.4 ms ) [ BENCH ] Full calculations 16x16 110.1 ms ( ±0.9 ms ) Change-Id: I1dd649754cb8c4c621eee2728198ea6a555f38b3
--- /dev/null
+++ b/test/bench.cc
@@ -1,0 +1,38 @@
+/*
+ * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdio.h>
+#include <algorithm>
+
+#include "test/bench.h"
+#include "vpx_ports/vpx_timer.h"
+
+// Runs VPX_BENCH_ROBUST_ITER timed batches of n run() calls each and stores
+// every batch's elapsed time, in microseconds, in times[].
+void AbstractBench::runNTimes(int n) {
+  for (int batch = 0; batch < VPX_BENCH_ROBUST_ITER; ++batch) {
+    vpx_usec_timer stopwatch;
+    vpx_usec_timer_start(&stopwatch);
+    for (int calls_left = n; calls_left > 0; --calls_left) run();
+    vpx_usec_timer_mark(&stopwatch);
+    times[batch] = static_cast<int>(vpx_usec_timer_elapsed(&stopwatch));
+  }
+}
+
+// Prints |title|, the median of times[] and the mean absolute deviation (ms).
+void AbstractBench::printMedian(const char *title) {
+  std::sort(times, times + VPX_BENCH_ROBUST_ITER);
+  const int med = times[VPX_BENCH_ROBUST_ITER >> 1];
+  int sad = 0;  // Sum of |times[t] - med|; abs() would need <stdlib.h>.
+  for (int t = 0; t < VPX_BENCH_ROBUST_ITER; t++)
+    sad += (times[t] < med) ? med - times[t] : times[t] - med;
+  printf("[%10s] %s %.1f ms ( ±%.1f ms )\n", "BENCH ", title, med / 1000.0,
+         sad / (VPX_BENCH_ROBUST_ITER * 1000.0));
+}
--- /dev/null
+++ b/test/bench.h
@@ -1,0 +1,30 @@
+/*
+ * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef TEST_BENCH_H_
+#define TEST_BENCH_H_
+
+// Number of iterations used to compute median run time.
+#define VPX_BENCH_ROBUST_ITER 15
+
+class AbstractBench {
+ public:
+  virtual ~AbstractBench() {}  // Polymorphic base: needs a virtual dtor.
+  void runNTimes(int n);  // Times VPX_BENCH_ROBUST_ITER batches of n run()s.
+  void printMedian(const char *title);  // Prints median and its deviation.
+
+ protected:
+  virtual void run() = 0;  // Implement this with the code to benchmark.
+
+ private:
+  int times[VPX_BENCH_ROBUST_ITER];  // Elapsed usec of each timed batch.
+};
+
+#endif // TEST_BENCH_H_
--- a/test/test.mk
+++ b/test/test.mk
@@ -1,4 +1,6 @@
LIBVPX_TEST_SRCS-yes += acm_random.h
+LIBVPX_TEST_SRCS-yes += bench.h
+LIBVPX_TEST_SRCS-yes += bench.cc
LIBVPX_TEST_SRCS-yes += buffer.h
LIBVPX_TEST_SRCS-yes += clear_system_state.h
LIBVPX_TEST_SRCS-yes += codec_factory.h
--- a/test/vp9_quantize_test.cc
+++ b/test/vp9_quantize_test.cc
@@ -18,6 +18,7 @@
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "test/acm_random.h"
+#include "test/bench.h"
#include "test/buffer.h"
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
@@ -67,10 +68,13 @@
scan, iscan);
}
-class VP9QuantizeBase {
+class VP9QuantizeBase : public AbstractBench {
public:
VP9QuantizeBase(vpx_bit_depth_t bit_depth, int max_size, bool is_fp)
- : bit_depth_(bit_depth), max_size_(max_size), is_fp_(is_fp) {
+ : bit_depth_(bit_depth), max_size_(max_size), is_fp_(is_fp),
+ coeff(Buffer<tran_low_t>(max_size_, max_size_, 0, 16)),
+ qcoeff(Buffer<tran_low_t>(max_size_, max_size_, 0, 32)),
+ dqcoeff(Buffer<tran_low_t>(max_size_, max_size_, 0, 32)) {
max_value_ = (1 << bit_depth_) - 1;
zbin_ptr_ =
reinterpret_cast<int16_t *>(vpx_memalign(16, 8 * sizeof(*zbin_ptr_)));
@@ -86,6 +90,9 @@
vpx_memalign(16, 8 * sizeof(*quant_shift_ptr_)));
dequant_ptr_ = reinterpret_cast<int16_t *>(
vpx_memalign(16, 8 * sizeof(*dequant_ptr_)));
+
+ r_ptr = (is_fp_) ? round_fp_ptr_ : round_ptr_;
+ q_ptr = (is_fp_) ? quant_fp_ptr_ : quant_ptr_;
}
~VP9QuantizeBase() {
@@ -118,6 +125,15 @@
int max_value_;
const int max_size_;
const bool is_fp_;
+ Buffer<tran_low_t> coeff;
+ Buffer<tran_low_t> qcoeff;
+ Buffer<tran_low_t> dqcoeff;
+ int16_t *r_ptr;
+ int16_t *q_ptr;
+ int count;
+ int skip_block;
+ const scan_order *scan;
+ uint16_t eob;
};
class VP9QuantizeTest : public VP9QuantizeBase,
@@ -128,10 +144,17 @@
quantize_op_(GET_PARAM(0)), ref_quantize_op_(GET_PARAM(1)) {}
protected:
+ void run();
const QuantizeFunc quantize_op_;
const QuantizeFunc ref_quantize_op_;
};
+void VP9QuantizeTest::run() {  // Body timed by runNTimes(); reads member state.
+ quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr, q_ptr,
+ quant_shift_ptr_, qcoeff.TopLeftPixel(), dqcoeff.TopLeftPixel(),
+ dequant_ptr_, &eob, scan->scan, scan->iscan);
+}
+
// This quantizer compares the AC coefficients to the quantization step size to
// determine if further multiplication operations are needed.
// Based on vp9_quantize_fp_sse2().
@@ -269,11 +292,8 @@
TEST_P(VP9QuantizeTest, OperationCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
- Buffer<tran_low_t> coeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 16);
ASSERT_TRUE(coeff.Init());
- Buffer<tran_low_t> qcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32);
ASSERT_TRUE(qcoeff.Init());
- Buffer<tran_low_t> dqcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32);
ASSERT_TRUE(dqcoeff.Init());
Buffer<tran_low_t> ref_qcoeff =
Buffer<tran_low_t>(max_size_, max_size_, 0, 32);
@@ -281,7 +301,8 @@
Buffer<tran_low_t> ref_dqcoeff =
Buffer<tran_low_t>(max_size_, max_size_, 0, 32);
ASSERT_TRUE(ref_dqcoeff.Init());
- uint16_t eob, ref_eob;
+ uint16_t ref_eob = 0;
+ eob = 0;
for (int i = 0; i < number_of_iterations; ++i) {
// Test skip block for the first three iterations to catch all the different
@@ -294,23 +315,21 @@
sz = TX_32X32;
}
const TX_TYPE tx_type = static_cast<TX_TYPE>((i >> 2) % 3);
- const scan_order *scan_order = &vp9_scan_orders[sz][tx_type];
- const int count = (4 << sz) * (4 << sz);
+ scan = &vp9_scan_orders[sz][tx_type];
+ count = (4 << sz) * (4 << sz);
coeff.Set(&rnd, -max_value_, max_value_);
GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_,
quant_shift_ptr_, dequant_ptr_, round_fp_ptr_,
quant_fp_ptr_);
- int16_t *r_ptr = (is_fp_) ? round_fp_ptr_ : round_ptr_;
- int16_t *q_ptr = (is_fp_) ? quant_fp_ptr_ : quant_ptr_;
ref_quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr,
q_ptr, quant_shift_ptr_, ref_qcoeff.TopLeftPixel(),
ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob,
- scan_order->scan, scan_order->iscan);
+ scan->scan, scan->iscan);
ASM_REGISTER_STATE_CHECK(quantize_op_(
coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr, q_ptr,
quant_shift_ptr_, qcoeff.TopLeftPixel(), dqcoeff.TopLeftPixel(),
- dequant_ptr_, &eob, scan_order->scan, scan_order->iscan));
+ dequant_ptr_, &eob, scan->scan, scan->iscan));
EXPECT_TRUE(qcoeff.CheckValues(ref_qcoeff));
EXPECT_TRUE(dqcoeff.CheckValues(ref_dqcoeff));
@@ -328,11 +347,8 @@
TEST_P(VP9QuantizeTest, EOBCheck) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
- Buffer<tran_low_t> coeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 16);
ASSERT_TRUE(coeff.Init());
- Buffer<tran_low_t> qcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32);
ASSERT_TRUE(qcoeff.Init());
- Buffer<tran_low_t> dqcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32);
ASSERT_TRUE(dqcoeff.Init());
Buffer<tran_low_t> ref_qcoeff =
Buffer<tran_low_t>(max_size_, max_size_, 0, 32);
@@ -340,10 +356,12 @@
Buffer<tran_low_t> ref_dqcoeff =
Buffer<tran_low_t>(max_size_, max_size_, 0, 32);
ASSERT_TRUE(ref_dqcoeff.Init());
- uint16_t eob, ref_eob;
+ uint16_t ref_eob = 0;
+ eob = 0;
+ const uint32_t max_index = max_size_ * max_size_ - 1;
for (int i = 0; i < number_of_iterations; ++i) {
- const int skip_block = 0;
+ skip_block = 0;
TX_SIZE sz;
if (max_size_ == 16) {
sz = static_cast<TX_SIZE>(i % 3); // TX_4X4, TX_8X8 TX_16X16
@@ -351,28 +369,26 @@
sz = TX_32X32;
}
const TX_TYPE tx_type = static_cast<TX_TYPE>((i >> 2) % 3);
- const scan_order *scan_order = &vp9_scan_orders[sz][tx_type];
- int count = (4 << sz) * (4 << sz);
+ scan = &vp9_scan_orders[sz][tx_type];
+ count = (4 << sz) * (4 << sz);
// Two random entries
coeff.Set(0);
- coeff.TopLeftPixel()[rnd(count)] =
+ coeff.TopLeftPixel()[rnd.RandRange(count) & max_index] =
static_cast<int>(rnd.RandRange(max_value_ * 2)) - max_value_;
- coeff.TopLeftPixel()[rnd(count)] =
+ coeff.TopLeftPixel()[rnd.RandRange(count) & max_index] =
static_cast<int>(rnd.RandRange(max_value_ * 2)) - max_value_;
GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_,
quant_shift_ptr_, dequant_ptr_, round_fp_ptr_,
quant_fp_ptr_);
- int16_t *r_ptr = (is_fp_) ? round_fp_ptr_ : round_ptr_;
- int16_t *q_ptr = (is_fp_) ? quant_fp_ptr_ : quant_ptr_;
ref_quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr,
q_ptr, quant_shift_ptr_, ref_qcoeff.TopLeftPixel(),
ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob,
- scan_order->scan, scan_order->iscan);
+ scan->scan, scan->iscan);
ASM_REGISTER_STATE_CHECK(quantize_op_(
coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr, q_ptr,
quant_shift_ptr_, qcoeff.TopLeftPixel(), dqcoeff.TopLeftPixel(),
- dequant_ptr_, &eob, scan_order->scan, scan_order->iscan));
+ dequant_ptr_, &eob, scan->scan, scan->iscan));
EXPECT_TRUE(qcoeff.CheckValues(ref_qcoeff));
EXPECT_TRUE(dqcoeff.CheckValues(ref_dqcoeff));
@@ -390,13 +406,9 @@
TEST_P(VP9QuantizeTest, DISABLED_Speed) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
- Buffer<tran_low_t> coeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 16);
ASSERT_TRUE(coeff.Init());
- Buffer<tran_low_t> qcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32);
ASSERT_TRUE(qcoeff.Init());
- Buffer<tran_low_t> dqcoeff = Buffer<tran_low_t>(max_size_, max_size_, 0, 32);
ASSERT_TRUE(dqcoeff.Init());
- uint16_t eob;
TX_SIZE starting_sz, ending_sz;
if (max_size_ == 16) {
@@ -410,18 +422,16 @@
for (TX_SIZE sz = starting_sz; sz <= ending_sz; ++sz) {
// zbin > coeff, zbin < coeff.
for (int i = 0; i < 2; ++i) {
- const int skip_block = 0;
+ skip_block = 0;
// TX_TYPE defines the scan order. That is not relevant to the speed test.
// Pick the first one.
const TX_TYPE tx_type = DCT_DCT;
- const scan_order *scan_order = &vp9_scan_orders[sz][tx_type];
- const int count = (4 << sz) * (4 << sz);
+ count = (4 << sz) * (4 << sz);
+ scan = &vp9_scan_orders[sz][tx_type];
GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_,
quant_shift_ptr_, dequant_ptr_, round_fp_ptr_,
quant_fp_ptr_);
- int16_t *r_ptr = (is_fp_) ? round_fp_ptr_ : round_ptr_;
- int16_t *q_ptr = (is_fp_) ? quant_fp_ptr_ : quant_ptr_;
if (i == 0) {
// When |coeff values| are less than zbin the results are 0.
@@ -438,22 +448,15 @@
coeff.Set(&rnd, -500, 500);
}
- vpx_usec_timer timer;
- vpx_usec_timer_start(&timer);
- for (int j = 0; j < 100000000 / count; ++j) {
- quantize_op_(coeff.TopLeftPixel(), count, skip_block, zbin_ptr_, r_ptr,
- q_ptr, quant_shift_ptr_, qcoeff.TopLeftPixel(),
- dqcoeff.TopLeftPixel(), dequant_ptr_, &eob,
- scan_order->scan, scan_order->iscan);
- }
- vpx_usec_timer_mark(&timer);
- const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
- if (i == 0) printf("Bypass calculations.\n");
- if (i == 1) printf("Full calculations.\n");
- printf("Quantize %dx%d time: %5d ms\n", 4 << sz, 4 << sz,
- elapsed_time / 1000);
+ runNTimes(10000000 / count);
+ const char *type =
+ (i == 0) ? "Bypass calculations " : "Full calculations ";
+ char block_size[16];
+ snprintf(block_size, sizeof(block_size), "%dx%d", 4 << sz, 4 << sz);
+ char title[100];
+ snprintf(title, sizeof(title), "%25s %8s ", type, block_size);
+ printMedian(title);
}
- printf("\n");
}
}