
I use the following command to configure a cross-compiled build of llama.cpp for Android:

cmake \
 -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
 -DANDROID_ABI=arm64-v8a \
 -DANDROID_PLATFORM=android-28 \
 -DCMAKE_C_FLAGS="-march=armv8.6-a+fp16fml+i8mm" \
 -DCMAKE_CXX_FLAGS="-march=armv8.6-a+fp16fml+i8mm" \
 -DGGML_OPENMP=ON \
 -DGGML_LLAMAFILE=OFF \
 -B build-android
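
Note that this command only configures the build tree; the compilation itself uses the standard CMake build step:

cmake --build build-android -j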

This is the command I enter in the shell on Android (REDMI K80 Pro):

cd /data/local/tmp/llama.cpp/build-android && chmod +x llama-bench && export LD_LIBRARY_PATH=:/data/local/tmp/llama.cpp/build-android && taskset 0f ./llama-bench -m ../qwen2.5-0.5b-instruct-fp16.gguf -p 0 -n 1 -b 1 -t 4 >op.txt

I used taskset 0f to limit execution to four CPU cores (the affinity mask 0f pins the process to cores 0–3). In the model file qwen2.5-0.5b-instruct-fp16.gguf, the weights are stored in FP16 format and activations are computed in FP32.
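A side note on core numbering: on big.LITTLE SoCs, mask 0f selects cores 0–3, and whether those are efficiency or performance cores depends on the kernel's CPU numbering. The per-core maximum frequencies can be checked via sysfs, for example:

for c in 0 1 2 3 4 5 6 7; do
  echo "cpu$c: $(cat /sys/devices/system/cpu/cpu$c/cpufreq/cpuinfo_max_freq)"
done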

I modified the ggml_graph_compute_thread function in ggml/src/ggml-cpu/ggml-cpu.c to log each node's operator name, shape, and execution time. Here is the code:

// Print one tensor's op symbol, description, type, and shape.
static void print_op_(struct ggml_tensor * tensor, const char * prefix) {
    printf("\n");
    printf("%sop symbol: %s\n", prefix, ggml_op_symbol(tensor->op));
    printf("%sop desc : %s\n", prefix, ggml_op_desc(tensor));
    printf("%sop type : %s\n", prefix, ggml_type_name(tensor->type));
    printf("%sop ne[0,1,2,3] : %lld %lld %lld %lld\n", prefix,
           (long long) tensor->ne[0], (long long) tensor->ne[1],
           (long long) tensor->ne[2], (long long) tensor->ne[3]);
}

// Print a node, its execution time, and each of its source tensors
// (sources are indented by one space so the analysis script can skip them).
static void print_op(struct ggml_tensor * tensor, const int64_t us) {
    printf("\n");
    print_op_(tensor, "");
    printf("op time: %f ms\n", us / 1000.0);
    int i = 0;
    struct ggml_tensor * p = tensor->src[i++];
    printf("src:");
    while (p) {
        print_op_(p, " ");
        p = tensor->src[i++];
    }
}
// static ggml_mutex_t mymutex;
static thread_ret_t ggml_graph_compute_thread(void * data) {
    struct ggml_compute_state * state = (struct ggml_compute_state *) data;
    struct ggml_threadpool * tp = state->threadpool;

    const struct ggml_cgraph * cgraph = tp->cgraph;
    const struct ggml_cplan * cplan = tp->cplan;

    set_numa_thread_affinity(state->ith);

    struct ggml_compute_params params = {
        /*.ith       =*/ state->ith,
        /*.nth       =*/ atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed),
        /*.wsize     =*/ cplan->work_size,
        /*.wdata     =*/ cplan->work_data,
        /*.threadpool=*/ tp,
    };

    for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) {
        ggml_barrier(state->threadpool);
        int64_t start_time = ggml_time_us(); // added: stamp after all threads sync

        struct ggml_tensor * node = cgraph->nodes[node_n];
        ggml_compute_forward(&params, node);

        if (state->ith == 0 && cplan->abort_callback &&
            cplan->abort_callback(cplan->abort_callback_data)) {
            atomic_store_explicit(&tp->abort, node_n + 1, memory_order_relaxed);
            tp->ec = GGML_STATUS_ABORTED;
        }

        ggml_barrier(state->threadpool);
        int64_t end_time = ggml_time_us(); // added: stamp after the node completes

        if (state->ith == 0) { // only thread 0 logs, so each node prints once
            print_op(node, end_time - start_time);
        }
    }

    ggml_barrier(state->threadpool);
    return 0;
}
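
With this instrumentation, each graph node emits a block like the following in the log (field values here are illustrative, not copied from a real run):

op symbol: X*Y
op desc : MUL_MAT
op type : f32
op ne[0,1,2,3] : 151936 1 1 1
op time: 4.616333 ms
src:
 op symbol: none
 op desc : NONE
 op type : f16
 op ne[0,1,2,3] : 896 151936 1 1

 op symbol: ...
 op desc : ...
 op type : f32
 op ne[0,1,2,3] : 896 1 1 1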

I wrote a script (ana_log.py, included below) to analyze the output. It reports:

X*Y (896 1 1 1) * (896 151936 1 1): 4.616333 ms

This means that multiplying an FP32 activation tensor of shape (896 1 1 1) by the FP16 weight matrix of shape (896 151936 1 1) takes an average of 4.616333 ms.
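
For scale: the FP16 weight matrix alone is 896 × 151936 × 2 bytes ≈ 272 MB, and a single-token matrix-vector product has to stream all of it once, so 4.6 ms corresponds to roughly 272 MB / 4.6 ms ≈ 59 GB/s of effective read bandwidth. This suggests the operation is memory-bandwidth bound rather than compute bound.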

To reproduce this operation in isolation, I built a minimal test graph using GGML: tests/mytest.cpp

#include <ggml-cpu.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>
#include <sys/resource.h>
#include "ggml.h"
#include "time_helper.hpp"

void test_f16_matmul() {
    ggml_backend_load_all();

    struct ggml_init_params params = {
        .mem_size   = 1280 * 1024 * 1024,
        .mem_buffer = NULL,
        .no_alloc   = false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // Shapes chosen to match the llama.cpp case: A is the (896 x 151936)
    // FP16 weight matrix, D is the FP32 activation vector.
    const int64_t neC[2] = {896, 896};
    const int64_t neD[2] = {896, 1};
    const int64_t neA[2] = {896, 151936};

    struct ggml_tensor * C = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, neC[0], neC[1]);
    struct ggml_tensor * D = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, neD[0], neD[1]);
    struct ggml_tensor * A = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, neA[0], neA[1]);

    // Fill tensors with random values in [-1, 1]
    for (int i = 0; i < ggml_nelements(C); ++i) {
        ggml_set_f32_1d(C, i, ((float) rand() / RAND_MAX) * 2.0f - 1.0f);
    }
    for (int i = 0; i < ggml_nelements(D); ++i) {
        ggml_set_f32_1d(D, i, ((float) rand() / RAND_MAX) * 2.0f - 1.0f);
    }
    for (int i = 0; i < ggml_nelements(A); ++i) {
        ggml_set_f32_1d(A, i, ((float) rand() / RAND_MAX) * 2.0f - 1.0f);
    }

    // B = C * D feeds an activation into the measured matmul E = A * B.
    struct ggml_tensor * B = ggml_mul_mat(ctx, C, D);
    struct ggml_tensor * E = ggml_mul_mat(ctx, A, B);

    // Build and compute the graph with 4 threads
    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, E);
    ggml_graph_compute_with_ctx(ctx, gf, 4);

    ggml_free(ctx);
}

int main() {
    for (int i = 0; i < 6; i++) {
        test_f16_matmul();
        sleep(1);
    }
    return 0;
}
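
As a cross-check that does not depend on the instrumented ggml-cpu.c, the whole graph can also be timed around the compute call; a minimal sketch (ggml_time_us is initialized by ggml_init):

// Minimal end-to-end timing of the graph, independent of the
// per-node instrumentation inside ggml-cpu.c:
int64_t t0 = ggml_time_us();
ggml_graph_compute_with_ctx(ctx, gf, 4);
int64_t t1 = ggml_time_us();
printf("graph compute: %.3f ms\n", (t1 - t0) / 1000.0);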

tests/CMakeLists.txt

set(TEST_TARGET mytest)
add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp)
target_include_directories(${TEST_TARGET} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/include")
target_link_libraries(${TEST_TARGET} PRIVATE ggml)
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw")

Build Command

cmake \
 -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
 -DANDROID_ABI=arm64-v8a \
 -DANDROID_PLATFORM=android-28 \
 -DCMAKE_C_FLAGS="-march=armv8.6-a+fp16fml+i8mm" \
 -DCMAKE_CXX_FLAGS="-march=armv8.6-a+fp16fml+i8mm" \
 -DGGML_OPENMP=ON \
 -DGGML_LLAMAFILE=OFF \
 -B build-ggml

Run Command on Device via ADB

cd /data/local/tmp/ggml/build-ggml && chmod +x mytest && export LD_LIBRARY_PATH=:/data/local/tmp/ggml/build-ggml && taskset 0f ./mytest >ggml_op.txt

The result from running this standalone test shows:

X*Y (896 1 1 1) * (896 151936 1 1): 9.072333 ms

Multiple runs consistently show around 8–9 ms, whereas the same operation inside llama.cpp takes only ~4.6 ms. I verified through debugging that both versions execute the same underlying code path.
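
One variable I am not sure is equalized between the two setups is warm-up: llama-bench keeps the weights resident and hammers the CPU with repeated runs, whereas mytest computes the graph only once per call and sleeps in between, so the cores may never reach their sustained clock frequency and the caches stay cold. A sketch of an experiment that separates this out by timing only repeated computations of an already-warm graph (assuming the same graph can simply be recomputed in place):

// Hypothetical warm-up experiment: compute the graph once untimed so the
// weight pages are hot and the CPU has ramped up, then time the repeats.
ggml_graph_compute_with_ctx(ctx, gf, 4);         // warm-up pass
const int n_rep = 10;
int64_t t0 = ggml_time_us();
for (int it = 0; it < n_rep; ++it) {
    ggml_graph_compute_with_ctx(ctx, gf, 4);     // timed passes
}
int64_t t1 = ggml_time_us();
printf("avg graph compute: %.3f ms\n", (t1 - t0) / 1000.0 / n_rep);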

Why does the mul_mat operation in llama.cpp run significantly faster than in my standalone test?

This issue has been puzzling me for two weeks now. I would greatly appreciate any insight you can offer!

Thank you for taking the time to read through this.

Python Script for Log Analysis (ana_log.py)

import sys
from collections import defaultdict

def main(logfile, avg=True):
    total_times = defaultdict(float)
    current_symbol = None
    with open(logfile, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    line_cnt = len(lines)
    line_index = 0
    matmul_dict = defaultdict(list)
    while line_index < line_cnt:
        line = lines[line_index]
        stripped_line = line.lstrip()
        # Source-tensor blocks are printed with a one-space indent; only
        # top-level (unindented) node lines update the running state.
        is_indented = (len(line) - len(stripped_line)) > 0
        if stripped_line.startswith('op symbol:') and not is_indented:
            parts = stripped_line.split(':', 1)
            current_symbol = parts[1].strip()
        elif stripped_line.startswith('op time:') and not is_indented:
            time_part = stripped_line.split(':', 1)[1].strip()
            time_str = time_part.replace(' ms', '').strip()
            time_val = float(time_str)
            if current_symbol:
                total_times[current_symbol] += time_val
        elif current_symbol == "X*Y" and stripped_line.startswith('src'):
            # 'op time:' precedes 'src:' in each block, so time_val still
            # holds this node's time. The shape ('op ne[...]') line of the
            # first source sits 4 lines below 'src:', and the second
            # source's shape line sits 5 lines further down.
            line_index += 4
            line = lines[line_index].strip()
            src1_shape = line.split(':')[1].strip()
            line_index += 5
            line = lines[line_index].strip()
            src2_shape = line.split(':')[1].strip()
            matmul_dict[(src1_shape, src2_shape)].append(time_val)
        line_index += 1
    # The (896 x 151936) matmul is assumed to occur once per inference
    # pass, so its sample count is used as the number of passes.
    infer_times = len(matmul_dict[("896 151936 1 1", "896 1 1 1")])
    for symbol, total_time in sorted(total_times.items()):
        print(f"{symbol}: {total_time / infer_times:.6f} ms")
    for shapes, times in matmul_dict.items():
        src1_shape, src2_shape = shapes
        used_time = sum(times) / len(times) if avg else min(times)
        print(f"X*Y ({src2_shape}) * ({src1_shape}): {used_time:.6f} ms, once infer call times: {len(times)/infer_times}")

if __name__ == '__main__':
    if len(sys.argv) < 2 or len(sys.argv) > 3:
        print("Usage: python ana_log.py <log_file> [avg]")
        sys.exit(1)
    avg = len(sys.argv) == 3 and sys.argv[2].lower() == 'avg'
    print("avg" if avg else "min")
    main(sys.argv[1], avg)

Usage

python ana_log.py op.txt avg
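
The script first prints whether it averages or takes the minimum, then per-operator totals divided by the number of inference passes, then one line per distinct matmul shape pair. Illustrative output:

avg
X*Y: 12.345678 ms
...
X*Y (896 1 1 1) * (896 151936 1 1): 4.616333 ms, once infer call times: 1.0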

I used GGML to build a computation graph to measure the execution time of mul_mat, but it is consistently much slower than the mul_mat operation during inference in llama.cpp, especially when the activation tensor has shape (896 1 1 1) in FP32 and the weight tensor has shape (896 151936 1 1) in FP16.

I explained my attempts in detail above. If you can help me with this issue, I would be extremely grateful!
