-
Notifications
You must be signed in to change notification settings - Fork 818
ACL: INT8 3x3 slower than F32 on small-channel conv (Neoverse-V2, SVE2) #1225
Description
Hi, @milpuz01
I've run into a performance problem: INT8 convolution is slower than F32 for a small-channel 3x3 layer.
Environment
- HW: ARM Neoverse-V2 (flags: sve, sve2, i8mm, bf16, dotprod)
- ACL build (scons):
scons -j8 Werror=0 debug=0 neon=1 opencl=0 embed_kernels=1 \
  benchmark_tests=0 validation_tests=0 asserts=0 \
  arch=arm64-v8.2-a-sve2
(compiles with -march=armv8.2-a+sve2+fp16+dotprod; SVE2 enabled)
Minimal reproducer (standalone)
Single NEConvolutionLayer benchmark, parameters via CLI.
Build (save code below as repro_conv.cpp):
g++ -std=c++14 -O3 -march=armv8.2-a+sve2+fp16+dotprod \
  -I<ACL_ROOT> -I<ACL_ROOT>/include \
  repro_conv.cpp -L<ACL_ROOT>/build -larm_compute -pthread \
  -Wl,-rpath,<ACL_ROOT>/build -o conv_repro
Runs (NCHW, N=1, threads=4):
- Problem case: 3x3, H=W=75, C=32->32, stride=1, pad=1
./conv_repro --prec=i8 --h=75 --w=75 --in_c=32 --out_c=32 --k=3 --stride=1 --pad=1 --iters=50 --warmup=5 --threads=4
avg_ms ≈ 0.471
./conv_repro --prec=f32 --h=75 --w=75 --in_c=32 --out_c=32 --k=3 --stride=1 --pad=1 --iters=50 --warmup=5 --threads=4
avg_ms ≈ 0.276
INT8 ~1.7x slower than F32
- Reference where INT8 is faster (first 7x7 layer)
./conv_repro --prec=i8 --iters=50 --warmup=5 --threads=4 # avg_ms ≈ 1.192 (H=W=300, C:3->32, k=7, s=2, p=3)
./conv_repro --prec=f32 --iters=50 --warmup=5 --threads=4 # avg_ms ≈ 1.882
Source (repro_conv.cpp)
#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/Scheduler.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"

#include <chrono>
#include <cstdint>
#include <exception>
#include <iostream>
#include <limits>
#include <random>
#include <stdexcept>
#include <string>

using namespace arm_compute;

/** Benchmark configuration.
 *
 * Defaults: 300x300 input, 3 -> 32 channels, 7x7 kernel, stride 2, pad 3,
 * INT8 precision, 50 timed iterations after 5 warm-up runs, 4 threads.
 */
struct Options
{
    std::string prec = "i8"; // i8 | f32
    unsigned    iters = 50, warmup = 5, threads = 4;
    unsigned    h = 300, w = 300, in_c = 3, out_c = 32;
    unsigned    k = 7, stride = 2, pad = 3;
};

namespace
{
/** Convert the value of a numeric flag to `unsigned`.
 *
 * @param flag Flag name, used only in error messages.
 * @param text Text that followed the '=' on the command line.
 * @return The parsed value.
 * @throws std::invalid_argument if @p text is empty or contains a non-digit.
 * @throws std::out_of_range     if the value does not fit in `unsigned`.
 */
unsigned parse_unsigned(const std::string &flag, const std::string &text)
{
    // std::stoul accepts "-1" (wrapping it to a huge value) and stops at the
    // first non-digit, so require a pure digit string before converting.
    if (text.empty() || text.find_first_not_of("0123456789") != std::string::npos)
    {
        throw std::invalid_argument(flag + " expects a non-negative integer, got '" + text + "'");
    }

    unsigned long value = 0;
    try
    {
        value = std::stoul(text);
    }
    catch (const std::out_of_range &)
    {
        throw std::out_of_range(flag + " value is too large: " + text);
    }
    if (value > std::numeric_limits<unsigned>::max())
    {
        throw std::out_of_range(flag + " value is too large: " + text);
    }
    return static_cast<unsigned>(value);
}

/** Output extent of a convolution along one axis: floor((in + 2*pad - kernel) / stride) + 1.
 *
 * @throws std::invalid_argument if @p stride or @p kernel is zero, or the
 *         kernel is larger than the padded input.
 * @throws std::out_of_range     if the result does not fit in `unsigned`.
 */
unsigned conv_out_dim(unsigned in, unsigned kernel, unsigned stride, unsigned pad)
{
    if (stride == 0)
    {
        throw std::invalid_argument("--stride must be at least 1");
    }
    if (kernel == 0)
    {
        throw std::invalid_argument("--k must be at least 1");
    }

    // Widen before adding so that neither the sum nor the subtraction below can wrap.
    const std::uint64_t padded = static_cast<std::uint64_t>(in) + 2 * static_cast<std::uint64_t>(pad);
    if (padded < kernel)
    {
        throw std::invalid_argument("kernel size exceeds the padded input size");
    }

    const std::uint64_t out = (padded - kernel) / stride + 1;
    if (out > std::numeric_limits<unsigned>::max())
    {
        throw std::out_of_range("output size does not fit in unsigned");
    }
    return static_cast<unsigned>(out);
}
} // namespace

/** Build an Options from `--name=value` command-line arguments.
 *
 * Recognised names: prec, iters, warmup, threads, h, w, in_c, out_c, k,
 * stride, pad. Arguments that match none of them are reported on stderr and
 * otherwise ignored.
 *
 * @throws std::invalid_argument / std::out_of_range for a malformed numeric value.
 */
Options parse(int argc, char **argv)
{
    struct NumericFlag
    {
        const char *name;
        unsigned Options::*field;
    };
    static const NumericFlag numeric_flags[] = {
        {"--iters", &Options::iters},   {"--warmup", &Options::warmup}, {"--threads", &Options::threads},
        {"--h", &Options::h},           {"--w", &Options::w},           {"--in_c", &Options::in_c},
        {"--out_c", &Options::out_c},   {"--k", &Options::k},           {"--stride", &Options::stride},
        {"--pad", &Options::pad},
    };

    Options o;
    for (int i = 1; i < argc; ++i)
    {
        const std::string arg = argv[i];
        const auto        eq  = arg.find('=');
        bool              recognised = false;

        if (eq != std::string::npos)
        {
            const std::string key   = arg.substr(0, eq);
            const std::string value = arg.substr(eq + 1);

            if (key == "--prec")
            {
                o.prec     = value;
                recognised = true;
            }
            else
            {
                for (const auto &flag : numeric_flags)
                {
                    if (key == flag.name)
                    {
                        o.*(flag.field) = parse_unsigned(key, value);
                        recognised      = true;
                        break;
                    }
                }
            }
        }

        if (!recognised)
        {
            // A mistyped flag would otherwise silently benchmark the default shape.
            std::cerr << "warning: ignoring unrecognised argument '" << arg << "'\n";
        }
    }
    return o;
}

/** Overwrite every element of an allocated tensor with pseudo-random data.
 *
 * The generator is seeded with 0 on every call. QASYMM8 / QASYMM8_SIGNED
 * tensors receive one byte per element, drawn from [-128, 127] and stored as
 * int8_t; every other data type is written as a float drawn from [-1, 1).
 */
void fill_random(Tensor &t)
{
    std::mt19937                          gen(0);
    std::uniform_int_distribution<int>    dist_i(-128, 127);
    std::uniform_real_distribution<float> dist_f(-1.f, 1.f);

    const auto dt           = t.info()->data_type();
    const bool is_quantized = (dt == DataType::QASYMM8 || dt == DataType::QASYMM8_SIGNED);

    Window win;
    win.use_tensor_dimensions(t.info()->tensor_shape());
    Iterator it(&t, win);
    execute_window_loop(
        win,
        [&](const Coordinates &)
        {
            if (is_quantized)
            {
                *reinterpret_cast<int8_t *>(it.ptr()) = static_cast<int8_t>(dist_i(gen));
            }
            else
            {
                *reinterpret_cast<float *>(it.ptr()) = dist_f(gen);
            }
        },
        it);
}

/** Configure one NEConvolutionLayer from the command line, time it and print the average run time.
 *
 * @return 0 on success, 1 if the arguments are invalid or an exception is thrown.
 */
int main(int argc, char **argv)
{
    try
    {
        const Options opt = parse(argc, argv);

        // Anything other than "f32" used to fall through to INT8 while the
        // result line still echoed the mistyped precision.
        if (opt.prec != "i8" && opt.prec != "f32")
        {
            throw std::invalid_argument("--prec must be 'i8' or 'f32', got '" + opt.prec + "'");
        }
        if (opt.iters == 0)
        {
            throw std::invalid_argument("--iters must be at least 1");
        }

        const unsigned out_w = conv_out_dim(opt.w, opt.k, opt.stride, opt.pad);
        const unsigned out_h = conv_out_dim(opt.h, opt.k, opt.stride, opt.pad);

        Scheduler::get().set_num_threads(opt.threads);

        const TensorShape   src_shape(opt.w, opt.h, opt.in_c, 1U);
        const TensorShape   wei_shape(opt.k, opt.k, opt.in_c, opt.out_c);
        const TensorShape   dst_shape(out_w, out_h, opt.out_c, 1U);
        const PadStrideInfo conv_info(opt.stride, opt.stride, opt.pad, opt.pad);

        const DataType         dt = (opt.prec == "f32") ? DataType::F32 : DataType::QASYMM8_SIGNED;
        const QuantizationInfo qi(1.f, 0);

        Tensor src, wei, dst;
        src.allocator()->init(TensorInfo(src_shape, 1, dt, qi));
        wei.allocator()->init(TensorInfo(wei_shape, 1, dt, qi));
        dst.allocator()->init(TensorInfo(dst_shape, 1, dt, qi));

        NEConvolutionLayer conv;
        conv.configure(&src, &wei, nullptr, &dst, conv_info, WeightsInfo(), Size2D(1U, 1U), ActivationLayerInfo(),
                       false, 1);

        src.allocator()->allocate();
        wei.allocator()->allocate();
        dst.allocator()->allocate();

        fill_random(src);
        fill_random(wei);

        // Warm-up runs are excluded from the timing.
        for (unsigned i = 0; i < opt.warmup; ++i)
        {
            conv.run();
        }

        // steady_clock is monotonic, so a wall-clock adjustment cannot skew the measurement.
        const auto t0 = std::chrono::steady_clock::now();
        for (unsigned i = 0; i < opt.iters; ++i)
        {
            conv.run();
        }
        const auto t1 = std::chrono::steady_clock::now();

        const double ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
        std::cout << "prec=" << opt.prec << " h=" << opt.h << " w=" << opt.w << " in_c=" << opt.in_c
                  << " out_c=" << opt.out_c << " k=" << opt.k << " s=" << opt.stride << " p=" << opt.pad
                  << " threads=" << opt.threads << " iters=" << opt.iters << " avg_ms=" << ms / opt.iters << '\n';
        return 0;
    }
    catch (const std::exception &e)
    {
        std::cerr << "error: " << e.what() << '\n';
        return 1;
    }
}
Observation
- On Neoverse-V2 with an SVE2 build, INT8 3×3 (75×75, C=32) is still ~1.7× slower than F32. The large 7×7 layer remains faster in INT8. Please investigate INT8 kernel selection/perf for this pattern.