I configure and build llama.cpp for Android with the following CMake command:
cmake \
-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
-DANDROID_ABI=arm64-v8a \
-DANDROID_PLATFORM=android-28 \
-DCMAKE_C_FLAGS="-march=armv8.6-a+fp16fml+i8mm" \
-DCMAKE_CXX_FLAGS="-march=armv8.6-a+fp16fml+i8mm" \
-DGGML_OPENMP=ON \
-DGGML_LLAMAFILE=OFF \
-B build-android
This is the command I enter in the shell on Android (REDMI K80 Pro):
cd /data/local/tmp/llama.cpp/build-android && chmod +x llama-bench && export LD_LIBRARY_PATH=:/data/local/tmp/llama.cpp/build-android && taskset 0f ./llama-bench -m ../qwen2.5-0.5b-instruct-fp16.gguf -p 0 -n 1 -b 1 -t 4 >op.txt
I used taskset 0f to restrict the process to 4 CPU cores (mask 0x0f, i.e. CPUs 0-3). The model file qwen2.5-0.5b-instruct-fp16.gguf stores the weights in FP16, while activations are computed in FP32.
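On many big.LITTLE SoCs the low-numbered cores are the efficiency cores, so it may be worth confirming at runtime which cores the process is actually allowed to run on. Below is a minimal sketch of such a check (my own standalone helper, not part of llama.cpp), using sched_getaffinity from <sched.h>:

// Standalone helper (not part of llama.cpp): print the CPUs the current
// process is allowed to run on, to confirm what `taskset 0f` actually did.
#ifndef _GNU_SOURCE
#define _GNU_SOURCE 1
#endif
#include <sched.h>
#include <stdio.h>

int main(void) {
    cpu_set_t set;
    CPU_ZERO(&set);
    if (sched_getaffinity(0, sizeof(set), &set) != 0) {
        perror("sched_getaffinity");
        return 1;
    }
    printf("allowed CPUs:");
    for (int cpu = 0; cpu < CPU_SETSIZE; ++cpu) {
        if (CPU_ISSET(cpu, &set)) {
            printf(" %d", cpu);
        }
    }
    printf("\n");
    return 0;
}

Running this under the same taskset 0f prefix should print "allowed CPUs: 0 1 2 3" if the mask was applied as intended.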
I modified the ggml_graph_compute_thread function in ggml/src/ggml-cpu/ggml-cpu.c to log information such as the operator name, shape, and execution time. Here is the code:
// Print one tensor's op symbol, description, type, and shape.
static void print_op_(struct ggml_tensor * tensor, const char * prefix) {
    printf("\n");
    printf("%sop symbol: %s\n", prefix, ggml_op_symbol(tensor->op));
    printf("%sop desc : %s\n", prefix, ggml_op_desc(tensor));
    printf("%sop type : %s\n", prefix, ggml_type_name(tensor->type));
    printf("%sop ne[0,1,2,3] : %lld %lld %lld %lld\n", prefix,
           (long long) tensor->ne[0], (long long) tensor->ne[1],
           (long long) tensor->ne[2], (long long) tensor->ne[3]);
}

// Print a node, its execution time in ms, and the same info for each source tensor.
static void print_op(struct ggml_tensor * tensor, const int64_t us) {
    printf("\n");
    print_op_(tensor, "");
    printf("op time: %f ms\n", us / 1000.0);
    int i = 0;
    struct ggml_tensor * p = tensor->src[i++];
    printf("src:");
    while (p) {
        print_op_(p, " ");
        p = tensor->src[i++];
    }
}
// static ggml_mutex_t mymutex;
static thread_ret_t ggml_graph_compute_thread(void * data) {
    struct ggml_compute_state * state = (struct ggml_compute_state *) data;
    struct ggml_threadpool * tp = state->threadpool;

    const struct ggml_cgraph * cgraph = tp->cgraph;
    const struct ggml_cplan * cplan = tp->cplan;

    set_numa_thread_affinity(state->ith);

    struct ggml_compute_params params = {
        /*.ith       =*/ state->ith,
        /*.nth       =*/ atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed),
        /*.wsize     =*/ cplan->work_size,
        /*.wdata     =*/ cplan->work_data,
        /*.threadpool=*/ tp,
    };

    for (int node_n = 0; node_n < cgraph->n_nodes && atomic_load_explicit(&tp->abort, memory_order_relaxed) != node_n; node_n++) {
        ggml_barrier(state->threadpool);
        // start the clock after all threads have reached this node
        int64_t start_time = ggml_time_us();

        struct ggml_tensor * node = cgraph->nodes[node_n];

        ggml_compute_forward(&params, node);

        if (state->ith == 0 && cplan->abort_callback &&
            cplan->abort_callback(cplan->abort_callback_data)) {
            atomic_store_explicit(&tp->abort, node_n + 1, memory_order_relaxed);
            tp->ec = GGML_STATUS_ABORTED;
        }

        ggml_barrier(state->threadpool);
        // stop the clock after the barrier, so the logged value covers the whole
        // node across all threads; only thread 0 prints
        int64_t end_time = ggml_time_us();
        if (state->ith == 0) {
            print_op(node, end_time - start_time);
        }
    }

    ggml_barrier(state->threadpool);
    return 0;
}
I wrote a script (ana_log.py, included at the end of this post) to aggregate the log; for the matmul of interest it reports:
X*Y (896 1 1 1) * (896 151936 1 1): 4.616333 ms
This means that multiplying an activation tensor of shape (896 1 1 1) (FP32) with a weight matrix of shape (896 151936 1 1) (FP16) takes an average of 4.616333 ms.
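As a rough sanity check (simple arithmetic on the numbers above, not a new measurement): the FP16 weight matrix alone is 896 x 151936 x 2 bytes, about 260 MiB, so streaming it once in 4.616 ms corresponds to roughly 59 GB/s. This operation is therefore most likely dominated by memory bandwidth rather than arithmetic. The small snippet below just prints that back-of-the-envelope estimate:

#include <stdio.h>

// Back-of-the-envelope check (my own arithmetic): effective bandwidth needed
// to read the FP16 weight matrix once in the measured 4.616333 ms.
int main(void) {
    const double weight_bytes = 896.0 * 151936.0 * 2.0; // ~272e6 bytes
    const double time_s = 4.616333e-3;
    printf("weight matrix: %.1f MiB\n", weight_bytes / (1024.0 * 1024.0));
    printf("implied read bandwidth: %.1f GB/s\n", weight_bytes / time_s / 1e9);
    return 0;
}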
To reproduce this operation, I built a minimal test graph using GGML: tests/mytest.cpp
#include <ggml-cpu.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <unistd.h>
#include <sys/resource.h>
#include "ggml.h"
#include "time_helper.hpp"
void test_f16_matmul() {
    ggml_backend_load_all();

    struct ggml_init_params params = {
        .mem_size   = 1280 * 1024 * 1024,
        .mem_buffer = NULL,
        .no_alloc   = false,
    };
    struct ggml_context * ctx = ggml_init(params);

    const int64_t neC[2] = {896, 896};
    const int64_t neD[2] = {896, 1};
    const int64_t neA[2] = {896, 151936};

    struct ggml_tensor * C = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, neC[0], neC[1]);
    struct ggml_tensor * D = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, neD[0], neD[1]);
    struct ggml_tensor * A = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, neA[0], neA[1]);

    // Fill tensors with random values
    for (int i = 0; i < ggml_nelements(C); ++i) {
        ggml_set_f32_1d(C, i, ((float)rand() / RAND_MAX) * 2.0f - 1.0f);
    }
    for (int i = 0; i < ggml_nelements(D); ++i) {
        ggml_set_f32_1d(D, i, ((float)rand() / RAND_MAX) * 2.0f - 1.0f);
    }
    for (int i = 0; i < ggml_nelements(A); ++i) {
        ggml_set_f32_1d(A, i, ((float)rand() / RAND_MAX) * 2.0f - 1.0f);
    }

    // B = C * D -> (896 x 1) FP32 activation
    struct ggml_tensor * B = ggml_mul_mat(ctx, C, D);
    // E = A * B -> the (896 x 151936) FP16 weight times the (896 x 1) FP32 activation
    struct ggml_tensor * E = ggml_mul_mat(ctx, A, B);

    // Build and compute the graph
    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, E);
    ggml_graph_compute_with_ctx(ctx, gf, 4);

    ggml_free(ctx);
}

int main() {
    for (int i = 0; i < 6; i++) {
        test_f16_matmul();
        sleep(1);
    }
    return 0;
}
tests/CMakeLists.txt
set(TEST_TARGET mytest)
add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp)
target_include_directories(${TEST_TARGET} PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/include")
target_link_libraries(${TEST_TARGET} PRIVATE ggml)
add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}>)
set_property(TEST ${TEST_TARGET} PROPERTY ENVIRONMENT "LLVM_PROFILE_FILE=${TEST_TARGET}.profraw")
Build Command
cmake \
-DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
-DANDROID_ABI=arm64-v8a \
-DANDROID_PLATFORM=android-28 \
-DCMAKE_C_FLAGS="-march=armv8.6-a+fp16fml+i8mm" \
-DCMAKE_CXX_FLAGS="-march=armv8.6-a+fp16fml+i8mm" \
-DGGML_OPENMP=ON \
-DGGML_LLAMAFILE=OFF \
-B build-ggml
Run Command on Device via ADB
cd /data/local/tmp/ggml/build-ggml && chmod +x mytest && export LD_LIBRARY_PATH=:/data/local/tmp/ggml/build-ggml && taskset 0f ./mytest >ggml_op.txt
The result from running this standalone test shows:
X*Y (896 1 1 1) * (896 151936 1 1): 9.072333 ms
Multiple runs consistently show around 8-9 ms, whereas the same operation inside llama.cpp takes only ~4.6 ms. I verified through debugging that both versions execute the same underlying code path.
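One difference between the two setups that I can think of is warm-up: each call to test_f16_matmul re-initializes the context, refills the ~260 MiB weight tensor, and computes the graph exactly once, whereas llama-bench presumably executes the decode graph repeatedly within one process, so the weights are already hot and the worker threads are already spun up. To check whether that alone explains the gap, the graph could be recomputed several times inside one test_f16_matmul call and each pass timed separately. A minimal sketch, reusing ctx and gf from mytest.cpp above (the iteration count 16 is arbitrary, and as far as I can tell ggml_graph_compute_with_ctx takes a small scratch buffer from ctx on every call, so the pass count should stay modest):

// Sketch: drop-in replacement for the single ggml_graph_compute_with_ctx call
// in test_f16_matmul, timing each pass over the same graph.
ggml_time_init();                                // initialize ggml's clock once
for (int iter = 0; iter < 16; ++iter) {
    const int64_t t0 = ggml_time_us();
    ggml_graph_compute_with_ctx(ctx, gf, 4);     // same graph, same 4 threads
    const int64_t t1 = ggml_time_us();
    printf("pass %2d: %.3f ms\n", iter, (t1 - t0) / 1000.0);
}

If the later passes converge to ~4.6 ms, the gap is just warm-up; if they stay around 9 ms, something else must differ (for example, which cores the threads end up on).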
Why does the mul_mat operation in llama.cpp run significantly faster than in my standalone test?
This issue has been puzzling me for two weeks now. I would greatly appreciate any insight you can offer!
Thank you for taking the time to read through this.
Python Script for Log Analysis (ana_log.py)
import sys
from collections import defaultdict


def main(logfile, avg=True):
    total_times = defaultdict(float)
    current_symbol = None

    with open(logfile, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    line_cnt = len(lines)
    line_index = 0
    matmul_dict = defaultdict(list)

    while line_index < line_cnt:
        line = lines[line_index]
        stripped_line = line.lstrip()
        is_indented = (len(line) - len(stripped_line)) > 0

        if stripped_line.startswith('op symbol:') and not is_indented:
            parts = stripped_line.split(':', 1)
            current_symbol = parts[1].strip()
        elif stripped_line.startswith('op time:') and not is_indented:
            time_part = stripped_line.split(':', 1)[1].strip()
            time_str = time_part.replace(' ms', '').strip()
            time_val = float(time_str)
            if current_symbol:
                total_times[current_symbol] += time_val
        elif current_symbol == "X*Y" and stripped_line.startswith('src'):
            # the first source tensor's shape line appears 4 lines below 'src:',
            # the second one 5 lines after that (see print_op/print_op_ above)
            line_index += 4
            line = lines[line_index].strip()
            src1_shape = line.split(':')[1].strip()
            line_index += 5
            line = lines[line_index].strip()
            src2_shape = line.split(':')[1].strip()
            matmul_dict[(src1_shape, src2_shape)].append(time_val)

        line_index += 1

    # number of inference (token-generation) steps = occurrences of the target matmul
    infer_times = len(matmul_dict[("896 151936 1 1", "896 1 1 1")])

    for symbol, total_time in sorted(total_times.items()):
        print(f"{symbol}: {total_time / infer_times:.6f} ms")

    for shapes, times in matmul_dict.items():
        src1_shape, src2_shape = shapes
        used_time = sum(times) / len(times) if avg else min(times)
        print(f"X*Y ({src2_shape}) * ({src1_shape}): {used_time:.6f} ms, calls per inference: {len(times)/infer_times}")


if __name__ == '__main__':
    if len(sys.argv) < 2 or len(sys.argv) > 3:
        print("Usage: python ana_log.py <log_file> [avg]")
        sys.exit(1)
    avg = True if len(sys.argv) == 3 and sys.argv[2].lower() == 'avg' else False
    print("avg" if avg else "min")
    main(sys.argv[1], avg)
Usage
python ana_log.py op.txt avg
In summary: I built a standalone GGML computation graph to measure the execution time of mul_mat, but it is consistently much slower than the same mul_mat during llama.cpp inference, in particular for the FP32 activation of shape (896 1 1 1) multiplied by the FP16 weight of shape (896 151936 1 1).
I explained my attempts in detail above. If you can help me with this issue, I would be extremely grateful!