
I use the following command to configure a cross-compiled build of llama.cpp for Android:

cmake \
 -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
 -DANDROID_ABI=arm64-v8a \
 -DANDROID_PLATFORM=android-28 \
 -DCMAKE_C_FLAGS="-march=armv8.6-a+fp16fml+i8mm" \
 -DCMAKE_CXX_FLAGS="-march=armv8.6-a+fp16fml+i8mm" \
 -DGGML_OPENMP=ON \
 -DGGML_LLAMAFILE=OFF \
 -B build-android
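
Note that this command only configures the build tree; the compilation itself uses the standard CMake build step:

cmake --build build-android -j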

This is the command I enter in the shell on Android (REDMI K80 Pro):

cd /data/local/tmp/llama.cpp/build-android && chmod +x llama-bench && export LD_LIBRARY_PATH=:/data/local/tmp/llama.cpp/build-android && taskset 0f ./llama-bench -m ../qwen2.5-0.5b-instruct-fp16.gguf -p 0 -n 1 -b 1 -t 4 >op.txt

I used taskset 0f to limit execution to four CPU cores (the affinity mask 0f pins the process to cores 0–3). In the model file qwen2.5-0.5b-instruct-fp16.gguf, the weights are stored in FP16 format and activations are computed in FP32.
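A side note on core numbering: on big.LITTLE SoCs, mask 0f selects cores 0–3, and whether those are efficiency or performance cores depends on the kernel's CPU numbering. The per-core maximum frequencies can be checked via sysfs, for example:

for c in 0 1 2 3 4 5 6 7; do
  echo "cpu$c: $(cat /sys/devices/system/cpu/cpu$c/cpufreq/cpuinfo_max_freq)"
done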

I modified the ggml_graph_compute_thread function in ggml/src/ggml-cpu/ggml-cpu.c to log each node's operator name, shape, and execution time. Here is the code:

// Print one tensor's op symbol, description, type, and shape.
static void print_op_(struct ggml_tensor * tensor, const char * prefix) {
    printf("\n");
    printf("%sop symbol: %s\n", prefix, ggml_op_symbol(tensor->op));
    printf("%sop desc : %s\n", prefix, ggml_op_desc(tensor));
    printf("%sop type : %s\n", prefix, ggml_type_name(tensor->type));
    printf("%sop ne[0,1,2,3] : %lld %lld %lld %lld\n", prefix,
           (long long) tensor->ne[0], (long long) tensor->ne[1],
           (long long) tensor->ne[2], (long long) tensor->ne[3]);
}

// Print a node, its execution time, and each of its source tensors
// (sources are indented by one space so the analysis script can skip them).
static void print_op(struct ggml_tensor * tensor, const int64_t us) {
    printf("\n");
    print_op_(tensor, "");
    printf("op time: %f ms\n", us / 1000.0);
    int i = 0;
    struct ggml_tensor * p = tensor->src[i++];
    printf("src:");
    while (p) {
        print_op_(p, " ");
        p = tensor->src[i++];
    }
}
// static ggml_mutex_t mymutex;
static thread_ret_t ggml_graph_compute_thread(void * data) {
    struct ggml_compute_state * state = (struct ggml_compute_state *) data;
    struct ggml_threadpool * tp = state->threadpool;

    const struct ggml_cgraph * cgraph = tp->cgraph;
    const struct ggml_cplan * cplan = tp->cplan;

    set_numa_thread_affinity(state->ith);

    struct ggml_compute_params params = {
        /*.ith       =*/ state->ith,
        /*.nth       =*/ atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed),
        /*.wsize     =*/ cplan->work_size,
        /*.wdata     =*/ cplan->work_data,
        /*.threadpool=*/ tp,
    };

    for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) {
        ggml_barrier(state->threadpool);
        int64_t start_time = ggml_time_us(); // added: stamp after all threads sync

        struct ggml_tensor * node = cgraph->nodes[node_n];
        ggml_compute_forward(&params, node);

        if (state->ith == 0 && cplan->abort_callback &&
            cplan->abort_callback(cplan->abort_callback_data)) {
            atomic_store_explicit(&tp->abort, node_n + 1, memory_order_relaxed);
            tp->ec = GGML_STATUS_ABORTED;
        }

        ggml_barrier(state->threadpool);
        int64_t end_time = ggml_time_us(); // added: stamp after the node completes

        if (state->ith == 0) { // only thread 0 logs, so each node prints once
            print_op(node, end_time - start_time);
        }
    }

    ggml_barrier(state->threadpool);
    return 0;
}
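
With this instrumentation, each graph node emits a block like the following in the log (field values here are illustrative, not copied from a real run):

op symbol: X*Y
op desc : MUL_MAT
op type : f32
op ne[0,1,2,3] : 151936 1 1 1
op time: 4.616333 ms
src:
 op symbol: none
 op desc : NONE
 op type : f16
 op ne[0,1,2,3] : 896 151936 1 1

 op symbol: ...
 op desc : ...
 op type : f32
 op ne[0,1,2,3] : 896 1 1 1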

I wrote a script (ana_log.py, included below) to analyze the output. It reports:

X*Y (896 1 1 1) * (896 151936 1 1): 4.616333 ms

This means that multiplying an FP32 activation tensor of shape (896 1 1 1) by the FP16 weight matrix of shape (896 151936 1 1) takes an average of 4.616333 ms.
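
For scale: the FP16 weight matrix alone is 896 × 151936 × 2 bytes ≈ 272 MB, and a single-token matrix-vector product has to stream all of it once, so 4.6 ms corresponds to roughly 272 MB / 4.6 ms ≈ 59 GB/s of effective read bandwidth. This suggests the operation is memory-bandwidth bound rather than compute bound.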

To reproduce this operation in isolation, I built a minimal test graph using GGML: tests/mytest.cpp

#include <ggml-cpu.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>
#include <sys/resource.h>
#include "ggml.h"
#include "time_helper.hpp"

void test_f16_matmul() {
    ggml_backend_load_all();

    struct ggml_init_params params = {
        .mem_size   = 1280 * 1024 * 1024,
        .mem_buffer = NULL,
        .no_alloc   = false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // Shapes chosen to match the llama.cpp case: A is the (896 x 151936)
    // FP16 weight matrix, D is the FP32 activation vector.
    const int64_t neC[2] = {896, 896};
    const int64_t neD[2] = {896, 1};
    const int64_t neA[2] = {896, 151936};

    struct ggml_tensor * C = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, neC[0], neC[1]);
    struct ggml_tensor * D = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, neD[0], neD[1]);
    struct ggml_tensor * A = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, neA[0], neA[1]);

    // Fill tensors with random values in [-1, 1]
    for (int i = 0; i < ggml_nelements(C); ++i) {
        ggml_set_f32_1d(C, i, ((float) rand() / RAND_MAX) * 2.0f - 1.0f);
    }
    for (int i = 0; i < ggml_nelements(D); ++i) {
        ggml_set_f32_1d(D, i, ((float) rand() / RAND_MAX) * 2.0f - 1.0f);
    }
    for (int i = 0; i < ggml_nelements(A); ++i) {
        ggml_set_f32_1d(A, i, ((float) rand() / RAND_MAX) * 2.0f - 1.0f);
    }

    // B = C * D feeds an activation into the measured matmul E = A * B.
    struct ggml_tensor * B = ggml_mul_mat(ctx, C, D);
    struct ggml_tensor * E = ggml_mul_mat(ctx, A, B);

    // Build and compute the graph with 4 threads
    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, E);
    ggml_graph_compute_with_ctx(ctx, gf, 4);

    ggml_free(ctx);
}

int main() {
    for (int i = 0; i < 6; i++) {
        test_f16_matmul();
        sleep(1);
    }
    return 0;
}
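
As a cross-check that does not depend on the instrumented ggml-cpu.c, the whole graph can also be timed around the compute call; a minimal sketch (ggml_time_us is initialized by ggml_init):

// Minimal end-to-end timing of the graph, independent of the
// per-node instrumentation inside ggml-cpu.c:
int64_t t0 = ggml_time_us();
ggml_graph_compute_with_ctx(ctx, gf, 4);
int64_t t1 = ggml_time_us();
printf("graph compute: %.3f ms\n", (t1 - t0) / 1000.0);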

tests/CMakeLists.txt

set(TEST_TARGET mytest)
add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp)
target_include_directories(${TEST_TARGET} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/include")
target_link_libraries(${TEST_TARGET} PRIVATE ggml)
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw")

Build Command

cmake \
 -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
 -DANDROID_ABI=arm64-v8a \
 -DANDROID_PLATFORM=android-28 \
 -DCMAKE_C_FLAGS="-march=armv8.6-a+fp16fml+i8mm" \
 -DCMAKE_CXX_FLAGS="-march=armv8.6-a+fp16fml+i8mm" \
 -DGGML_OPENMP=ON \
 -DGGML_LLAMAFILE=OFF \
 -B build-ggml

Run Command on Device via ADB

cd /data/local/tmp/ggml/build-ggml && chmod +x mytest && export LD_LIBRARY_PATH=:/data/local/tmp/ggml/build-ggml && taskset 0f ./mytest >ggml_op.txt

The result from running this standalone test shows:

X*Y (896 1 1 1) * (896 151936 1 1): 9.072333 ms

Multiple runs consistently show around 8–9 ms, whereas the same operation inside llama.cpp takes only ~4.6 ms. I verified through debugging that both versions execute the same underlying code path.
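
One variable I am not sure is equalized between the two setups is warm-up: llama-bench keeps the weights resident and hammers the CPU with repeated runs, whereas mytest computes the graph only once per call and sleeps in between, so the cores may never reach their sustained clock frequency and the caches stay cold. A sketch of an experiment that separates this out by timing only repeated computations of an already-warm graph (assuming the same graph can simply be recomputed in place):

// Hypothetical warm-up experiment: compute the graph once untimed so the
// weight pages are hot and the CPU has ramped up, then time the repeats.
ggml_graph_compute_with_ctx(ctx, gf, 4);         // warm-up pass
const int n_rep = 10;
int64_t t0 = ggml_time_us();
for (int it = 0; it < n_rep; ++it) {
    ggml_graph_compute_with_ctx(ctx, gf, 4);     // timed passes
}
int64_t t1 = ggml_time_us();
printf("avg graph compute: %.3f ms\n", (t1 - t0) / 1000.0 / n_rep);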

Why does the mul_mat operation in llama.cpp run significantly faster than in my standalone test?

This issue has been puzzling me for two weeks now. I would greatly appreciate any insight you can offer!

Thank you for taking the time to read through this.

Python Script for Log Analysis (ana_log.py)

import sys
from collections import defaultdict

def main(logfile, avg=True):
    total_times = defaultdict(float)
    current_symbol = None
    with open(logfile, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    line_cnt = len(lines)
    line_index = 0
    matmul_dict = defaultdict(list)
    while line_index < line_cnt:
        line = lines[line_index]
        stripped_line = line.lstrip()
        # Source-tensor blocks are printed with a one-space indent; only
        # top-level (unindented) node lines update the running state.
        is_indented = (len(line) - len(stripped_line)) > 0
        if stripped_line.startswith('op symbol:') and not is_indented:
            parts = stripped_line.split(':', 1)
            current_symbol = parts[1].strip()
        elif stripped_line.startswith('op time:') and not is_indented:
            time_part = stripped_line.split(':', 1)[1].strip()
            time_str = time_part.replace(' ms', '').strip()
            time_val = float(time_str)
            if current_symbol:
                total_times[current_symbol] += time_val
        elif current_symbol == "X*Y" and stripped_line.startswith('src'):
            # 'op time:' precedes 'src:' in each block, so time_val still
            # holds this node's time. The shape ('op ne[...]') line of the
            # first source sits 4 lines below 'src:', and the second
            # source's shape line sits 5 lines further down.
            line_index += 4
            line = lines[line_index].strip()
            src1_shape = line.split(':')[1].strip()
            line_index += 5
            line = lines[line_index].strip()
            src2_shape = line.split(':')[1].strip()
            matmul_dict[(src1_shape, src2_shape)].append(time_val)
        line_index += 1
    # The (896 x 151936) matmul is assumed to occur once per inference
    # pass, so its sample count is used as the number of passes.
    infer_times = len(matmul_dict[("896 151936 1 1", "896 1 1 1")])
    for symbol, total_time in sorted(total_times.items()):
        print(f"{symbol}: {total_time / infer_times:.6f} ms")
    for shapes, times in matmul_dict.items():
        src1_shape, src2_shape = shapes
        used_time = sum(times) / len(times) if avg else min(times)
        print(f"X*Y ({src2_shape}) * ({src1_shape}): {used_time:.6f} ms, once infer call times: {len(times)/infer_times}")

if __name__ == '__main__':
    if len(sys.argv) < 2 or len(sys.argv) > 3:
        print("Usage: python ana_log.py <log_file> [avg]")
        sys.exit(1)
    avg = len(sys.argv) == 3 and sys.argv[2].lower() == 'avg'
    print("avg" if avg else "min")
    main(sys.argv[1], avg)

Usage

python ana_log.py op.txt avg
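
The script first prints whether it averages or takes the minimum, then per-operator totals divided by the number of inference passes, then one line per distinct matmul shape pair. Illustrative output:

avg
X*Y: 12.345678 ms
...
X*Y (896 1 1 1) * (896 151936 1 1): 4.616333 ms, once infer call times: 1.0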

I used GGML to build a computation graph to measure the execution time of mul_mat, but it is consistently much slower than the mul_mat operation during inference in llama.cpp, especially when the activation tensor has shape (896 1 1 1) in FP32 and the weight tensor has shape (896 151936 1 1) in FP16.

I explained my attempts in detail above. If you can help me with this issue, I would be extremely grateful!
