Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

ACL: INT8 3x3 slower than F32 on small-channel conv (Neoverse-V2, SVE2) #1225

Open
Assignees

Description

Hi, @milpuz01

I'm seeing a performance degradation with INT8 convolution.

Environment

  • HW: ARM Neoverse-V2 (flags: sve, sve2, i8mm, bf16, dotprod)
  • ACL build (scons):
    scons -j8 Werror=0 debug=0 neon=1 opencl=0 embed_kernels=1
    benchmark_tests=0 validation_tests=0 asserts=0
    arch=arm64-v8.2-a-sve2
    (compiles with -march=armv8.2-a+sve2+fp16+dotprod; SVE2 enabled)

Minimal reproducer (standalone)

Single NEConvolutionLayer benchmark, parameters via CLI.

Build (save code below as repro_conv.cpp):

g++ -std=c++14 -O3 -march=armv8.2-a+sve2+fp16+dotprod \
  -I<ACL_ROOT> -I<ACL_ROOT>/include \
  repro_conv.cpp -L<ACL_ROOT>/build -larm_compute -pthread \
  -Wl,-rpath,<ACL_ROOT>/build -o conv_repro

Runs (NCHW, N=1, threads=4):

  1. Problem case: 3x3, H=W=75, C=32->32, stride=1, pad=1

./conv_repro --prec=i8 --h=75 --w=75 --in_c=32 --out_c=32 --k=3 --stride=1 --pad=1 --iters=50 --warmup=5 --threads=4

avg_ms ≈ 0.471

./conv_repro --prec=f32 --h=75 --w=75 --in_c=32 --out_c=32 --k=3 --stride=1 --pad=1 --iters=50 --warmup=5 --threads=4

avg_ms ≈ 0.276

INT8 ~1.7x slower than F32

  2. Reference where INT8 is faster (first 7x7 layer)

./conv_repro --prec=i8 --iters=50 --warmup=5 --threads=4 # avg_ms ≈ 1.192 (H=W=300, C:3->32, k=7, s=2, p=3)
./conv_repro --prec=f32 --iters=50 --warmup=5 --threads=4 # avg_ms ≈ 1.882

Source (repro_conv.cpp)

#include "arm_compute/runtime/NEON/NEFunctions.h"
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/Scheduler.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
#include <chrono>
#include <iostream>
#include <random>
#include <stdexcept>
#include <string>
using namespace arm_compute;
// Benchmark settings, each overridable from the command line as --name=value.
// The defaults describe a 7x7, stride-2 convolution on a 300x300x3 input.
struct Options {
    // Precision selector: "f32" picks float; main treats any other value as int8.
    std::string prec{"i8"};

    // Timed runs, untimed warm-up runs, and scheduler thread count.
    unsigned iters{50U};
    unsigned warmup{5U};
    unsigned threads{4U};

    // Input height/width and input/output channel counts.
    unsigned h{300U};
    unsigned w{300U};
    unsigned in_c{3U};
    unsigned out_c{32U};

    // Square kernel size, stride, and padding (used for both axes).
    unsigned k{7U};
    unsigned stride{2U};
    unsigned pad{3U};
};
// Builds an Options from "--name=value" command-line arguments.
//
// argv[0] is skipped. "--prec=" stores the text after '=' verbatim; every
// other recognised flag must carry a plain non-negative decimal integer that
// fits in `unsigned`. Arguments that match no known flag are reported on
// stderr and otherwise ignored, so a typo cannot silently fall back to the
// defaults.
//
// Throws std::invalid_argument for a missing, signed, or non-numeric value
// (including trailing characters such as "12abc"), and std::out_of_range for
// a value that does not fit in `unsigned`.
Options parse(int argc, char** argv) {
    // Strict text -> unsigned conversion. std::stoul on its own would accept
    // leading whitespace, a '+'/'-' sign (negating into a huge unsigned
    // value) and trailing garbage, so those cases are rejected explicitly.
    auto to_unsigned = [](const std::string& arg, const std::string& text) -> unsigned {
        if (text.empty() || text[0] < '0' || text[0] > '9')
            throw std::invalid_argument("expected a non-negative integer in '" + arg + "'");
        std::size_t used = 0;
        const unsigned long wide = std::stoul(text, &used); // may throw std::out_of_range
        if (used != text.size())
            throw std::invalid_argument("unexpected trailing characters in '" + arg + "'");
        const unsigned narrow = static_cast<unsigned>(wide);
        if (narrow != wide) // unsigned long may be wider than unsigned
            throw std::out_of_range("value too large in '" + arg + "'");
        return narrow;
    };

    // Numeric flags and the Options member each one fills in.
    struct NumericFlag {
        std::string prefix;
        unsigned Options::*field;
    };
    static const NumericFlag numeric_flags[] = {
        {"--iters=", &Options::iters},
        {"--warmup=", &Options::warmup},
        {"--threads=", &Options::threads},
        {"--h=", &Options::h},
        {"--w=", &Options::w},
        {"--in_c=", &Options::in_c},
        {"--out_c=", &Options::out_c},
        {"--k=", &Options::k},
        {"--stride=", &Options::stride},
        {"--pad=", &Options::pad},
    };
    static const std::string prec_prefix = "--prec=";

    Options o;
    for (int i = 1; i < argc; ++i) {
        const std::string a = argv[i];
        if (a.compare(0, prec_prefix.size(), prec_prefix) == 0) {
            o.prec = a.substr(prec_prefix.size());
            continue;
        }
        bool matched = false;
        for (const NumericFlag& flag : numeric_flags) {
            if (a.compare(0, flag.prefix.size(), flag.prefix) == 0) {
                o.*(flag.field) = to_unsigned(a, a.substr(flag.prefix.size()));
                matched = true;
                break;
            }
        }
        if (!matched)
            std::cerr << "warning: ignoring unrecognized argument '" << a << "'\n";
    }
    return o;
}
// Overwrites every element of `t` with pseudo-random data from a generator
// seeded with 0, so repeated calls produce the same sequence.
//
// QASYMM8 / QASYMM8_SIGNED tensors get one byte per element drawn from
// [-128, 127]; any other data type is written as a float drawn from [-1, 1).
// NOTE(review): the float path assumes 4-byte elements — only F32 reaches it
// in this program; confirm before reusing with other types.
void fill_random(Tensor& t) {
    std::mt19937 rng(0);

    const auto type = t.info()->data_type();
    const bool is_quantized = (type == DataType::QASYMM8) || (type == DataType::QASYMM8_SIGNED);

    Window window;
    window.use_tensor_dimensions(t.info()->tensor_shape());
    Iterator cursor(&t, window);

    // The data type is fixed for the whole tensor, so choose the writer once
    // instead of re-testing it for every element.
    if (is_quantized) {
        std::uniform_int_distribution<int> byte_values(-128, 127);
        execute_window_loop(window, [&](const Coordinates&) {
            *reinterpret_cast<int8_t*>(cursor.ptr()) = static_cast<int8_t>(byte_values(rng));
        }, cursor);
    } else {
        std::uniform_real_distribution<float> unit_values(-1.f, 1.f);
        execute_window_loop(window, [&](const Coordinates&) {
            *reinterpret_cast<float*>(cursor.ptr()) = unit_values(rng);
        }, cursor);
    }
}
int main(int argc, char** argv) {
 Options opt = parse(argc, argv);
 Scheduler::get().set_num_threads(opt.threads);
 TensorShape src_shape(opt.w, opt.h, opt.in_c, 1U);
 TensorShape wei_shape(opt.k, opt.k, opt.in_c, opt.out_c);
 unsigned out_w = (opt.w + 2 * opt.pad - opt.k) / opt.stride + 1;
 unsigned out_h = (opt.h + 2 * opt.pad - opt.k) / opt.stride + 1;
 TensorShape dst_shape(out_w, out_h, opt.out_c, 1U);
 PadStrideInfo conv_info(opt.stride, opt.stride, opt.pad, opt.pad);
 DataType dt = (opt.prec == "f32") ? DataType::F32 : DataType::QASYMM8_SIGNED;
 QuantizationInfo qi(1.f, 0);
 Tensor src, wei, dst;
 src.allocator()->init(TensorInfo(src_shape, 1, dt, qi));
 wei.allocator()->init(TensorInfo(wei_shape, 1, dt, qi));
 dst.allocator()->init(TensorInfo(dst_shape, 1, dt, qi));
 NEConvolutionLayer conv;
 conv.configure(&src, &wei, nullptr, &dst,
 conv_info, WeightsInfo(), Size2D(1U, 1U),
 ActivationLayerInfo(), false, 1);
 src.allocator()->allocate();
 wei.allocator()->allocate();
 dst.allocator()->allocate();
 fill_random(src);
 fill_random(wei);
 for (unsigned i = 0; i < opt.warmup; ++i) conv.run();
 auto t0 = std::chrono::high_resolution_clock::now();
 for (unsigned i = 0; i < opt.iters; ++i) conv.run();
 auto t1 = std::chrono::high_resolution_clock::now();
 double ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
 std::cout << "prec=" << opt.prec
 << " h=" << opt.h << " w=" << opt.w
 << " in_c=" << opt.in_c << " out_c=" << opt.out_c
 << " k=" << opt.k << " s=" << opt.stride << " p=" << opt.pad
 << " threads=" << opt.threads
 << " iters=" << opt.iters
 << " avg_ms=" << ms / opt.iters
 << std::endl;
 return 0;
}

Observation

  • On Neoverse-V2 with an SVE2 build, INT8 3×3 (75×75, C=32) is still ~1.7× slower than F32. The large 7×7 layer remains faster in INT8. Please investigate INT8 kernel selection/perf for this pattern.

Metadata

Metadata

Type

No type
No fields configured for issues without a type.

Projects

No projects

Milestone

No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions

      AltStyle によって変換されたページ (->オリジナル) /