How do I add a new backend to llama.cpp? Is there any documentation?
-
If by backend you mean a hardware accelerator or GPU, unfortunately I did not find any documentation on it either, but I did manage to get a backend registered together with its operations.
Backends live in the ggml/src/ggml-YOUR-BACKEND-NAME directory. You will have to provide your own CMakeLists.txt to compile your backend (a minimal sketch follows below), and your source has to contain the following functions for it to register properly.
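As a rough starting point, a minimal CMakeLists.txt might look like the sketch below. This assumes a recent ggml tree that provides the ggml_add_backend_library() helper; the target name and source file are placeholders you would replace.

```cmake
# ggml/src/ggml-YOUR-BACKEND-NAME/CMakeLists.txt
# Minimal sketch: adjust the target name and sources, and
# find_package()/link whatever accelerator SDK you depend on.
ggml_add_backend_library(ggml-YOUR-BACKEND-NAME
                         ggml-YOUR-BACKEND-NAME.cpp)

# Link your accelerator's runtime library here, e.g.:
# target_link_libraries(ggml-YOUR-BACKEND-NAME PRIVATE your_accel_runtime)
```

You will also likely need to hook the new directory into ggml/src/CMakeLists.txt (there is a ggml_add_backend() dispatch there) so that a GGML_YOUR_BACKEND_NAME option enables it.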
- Backend Registration
Inform GGML that you have a backend, and provide the necessary interface for GGML to interact with it. (The original snippet returned &ggml_backend_zdnn_reg, a leftover from my zDNN code; the placeholder name is used consistently here.)

```cpp
ggml_backend_reg_t ggml_backend_YOUR_BACKEND_NAME_reg(void) {
    static struct ggml_backend_reg ggml_backend_YOUR_BACKEND_NAME_reg = {
        /* .api_version = */ GGML_YOUR_BACKEND_NAME_BACKEND_VERSION,
        /* .interface   = */ ggml_backend_YOUR_BACKEND_NAME_reg_i,
        /* .context     = */ NULL,
    };

    return &ggml_backend_YOUR_BACKEND_NAME_reg;
}

GGML_BACKEND_DL_IMPL(ggml_backend_YOUR_BACKEND_NAME_reg)
```
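If I am reading ggml-backend-impl.h correctly, GGML_BACKEND_DL_IMPL is what defines the exported ggml_backend_init entry point, so the same backend can also be built as a dynamically loadable module.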
- Backend Registration Interface
This is the interface GGML uses to enumerate your backend's devices:

```cpp
static const struct ggml_backend_reg_i ggml_backend_YOUR_BACKEND_NAME_reg_i = {
    /* .get_name         = */ ggml_backend_YOUR_BACKEND_NAME_reg_get_name,
    /* .get_device_count = */ ggml_backend_YOUR_BACKEND_NAME_reg_get_device_count,
    /* .get_device       = */ ggml_backend_YOUR_BACKEND_NAME_reg_get_device,
    /* .get_proc_address = */ ggml_backend_YOUR_BACKEND_NAME_reg_get_proc_address,
};
```
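The first two members are not shown in this answer; for a single-device backend they can be trivial. A sketch (the bodies are my own, only the names match the table above):

```cpp
static const char * ggml_backend_YOUR_BACKEND_NAME_reg_get_name(ggml_backend_reg_t reg) {
    return "YOUR_BACKEND_NAME";

    GGML_UNUSED(reg);
}

static size_t ggml_backend_YOUR_BACKEND_NAME_reg_get_device_count(ggml_backend_reg_t reg) {
    return 1;  // single-device backend

    GGML_UNUSED(reg);
}
```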
- Backend Device and Proc Address

```cpp
static ggml_backend_dev_t ggml_backend_YOUR_BACKEND_NAME_reg_get_device(ggml_backend_reg_t reg, size_t index) {
    GGML_ASSERT(index == 0);

    static ggml_backend_device ggml_backend_YOUR_BACKEND_NAME_device = {
        /* .interface = */ ggml_backend_YOUR_BACKEND_NAME_device_i,
        /* .register  = */ reg,
        /* .context   = */ nullptr,
    };

    return &ggml_backend_YOUR_BACKEND_NAME_device;
}

static void * ggml_backend_YOUR_BACKEND_NAME_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
    return nullptr;

    GGML_UNUSED(reg);
    GGML_UNUSED(name);
}
```
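Returning nullptr from get_proc_address should be fine to start with: as far as I can tell it only exposes optional, backend-specific functions by name (the CPU backend publishes things like its set-n-threads function this way), so a new backend can begin without any.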
- Actual Device Interface

```cpp
static const struct ggml_backend_device_i ggml_backend_YOUR_BACKEND_NAME_device_i = {
    /* .get_name             = */ ggml_backend_YOUR_BACKEND_NAME_device_get_name,
    /* .get_description      = */ ggml_backend_YOUR_BACKEND_NAME_device_get_desc,
    /* .get_memory           = */ ggml_backend_YOUR_BACKEND_NAME_device_get_memory,
    /* .get_type             = */ ggml_backend_YOUR_BACKEND_NAME_device_get_type,
    /* .get_props            = */ ggml_backend_YOUR_BACKEND_NAME_device_get_props,
    /* .init_backend         = */ ggml_backend_YOUR_BACKEND_NAME_device_init_backend,
    /* .get_buffer_type      = */ ggml_backend_YOUR_BACKEND_NAME_device_get_buffer_type,
    /* .get_host_buffer_type = */ NULL,
    /* .buffer_from_host_ptr = */ ggml_backend_YOUR_BACKEND_NAME_device_buffer_from_host_ptr,
    /* .supports_op          = */ ggml_backend_YOUR_BACKEND_NAME_device_supports_op,
    /* .supports_buft        = */ ggml_backend_YOUR_BACKEND_NAME_device_supports_buft,
    /* .offload_op           = */ NULL,
    /* .event_new            = */ NULL,
    /* .event_free           = */ NULL,
    /* .event_synchronize    = */ NULL,
};
```
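Most of these members are straightforward getters. As an illustration only (the strings and device type are assumptions you would replace; these go in the same backend .cpp file), the name/description/type trio might look like:

```cpp
static const char * ggml_backend_YOUR_BACKEND_NAME_device_get_name(ggml_backend_dev_t dev) {
    return "YOUR_BACKEND_NAME";

    GGML_UNUSED(dev);
}

static const char * ggml_backend_YOUR_BACKEND_NAME_device_get_desc(ggml_backend_dev_t dev) {
    return "YOUR ACCELERATOR DESCRIPTION";

    GGML_UNUSED(dev);
}

static enum ggml_backend_dev_type ggml_backend_YOUR_BACKEND_NAME_device_get_type(ggml_backend_dev_t dev) {
    // GGML_BACKEND_DEVICE_TYPE_ACCEL for an accelerator working on host memory,
    // GGML_BACKEND_DEVICE_TYPE_GPU for a discrete device with its own memory
    return GGML_BACKEND_DEVICE_TYPE_ACCEL;

    GGML_UNUSED(dev);
}
```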
- Check if the device supports an operation
Note that the original flattened snippet was missing a break after the inner GGML_OP_UNARY switch, so supported unary ops fell through to the default and returned false; that is fixed here.

```cpp
static bool ggml_backend_YOUR_BACKEND_NAME_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
    const struct ggml_tensor * src0 = op->src[0];
    const struct ggml_tensor * src1 = op->src[1];

    switch (op->op) {
        // ops GGML requires every backend to accept
        case GGML_OP_NONE:
        case GGML_OP_RESHAPE:
        case GGML_OP_VIEW:
        case GGML_OP_PERMUTE:
        case GGML_OP_TRANSPOSE:
            break;

        case GGML_OP_ADD:
        case GGML_OP_ADD1:
        case GGML_OP_SUB:
        case GGML_OP_MUL:
        case GGML_OP_DIV:
        case GGML_OP_SQRT:
        case GGML_OP_LOG:
        case GGML_OP_NORM:
        case GGML_OP_MUL_MAT:
        case GGML_OP_MUL_MAT_ID:
        case GGML_OP_SOFT_MAX:
        case GGML_OP_LEAKY_RELU:
            return false; // TODO: disable all support first to showcase device reg

        case GGML_OP_UNARY:
            switch (ggml_get_unary_op(op)) {
                case GGML_UNARY_OP_ABS:
                case GGML_UNARY_OP_SGN:
                case GGML_UNARY_OP_NEG:
                case GGML_UNARY_OP_STEP:
                case GGML_UNARY_OP_TANH:
                case GGML_UNARY_OP_ELU:
                case GGML_UNARY_OP_RELU:
                case GGML_UNARY_OP_SIGMOID:
                case GGML_UNARY_OP_GELU:
                case GGML_UNARY_OP_GELU_QUICK:
                case GGML_UNARY_OP_SILU:
                case GGML_UNARY_OP_HARDSWISH:
                case GGML_UNARY_OP_HARDSIGMOID:
                case GGML_UNARY_OP_EXP:
                    break;
                default:
                    return false;
            }
            break;

        default:
            return false;
    }

    return true;

    GGML_UNUSED(dev);
    GGML_UNUSED(src0);
    GGML_UNUSED(src1);
}
```
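supports_buft, referenced in the device interface above, decides which buffer types your device can operate on. A minimal sketch for an accelerator that works on host memory (an assumption; a discrete GPU would instead compare against its own buffer type):

```cpp
static bool ggml_backend_YOUR_BACKEND_NAME_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
    // a host-memory accelerator can typically read/write ordinary host buffers
    return ggml_backend_buft_is_host(buft);

    GGML_UNUSED(dev);
}
```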
- Dispatching computation of an operation
The same missing break after the inner switch is fixed here as well, so GGML_UNARY_OP_EXP actually reaches the return true path.

```cpp
inline bool ggml_YOUR_BACKEND_NAME_compute_forward(ggml_backend_YOUR_BACKEND_NAME_context & ctx, ggml_tensor * dst) {
    switch (dst->op) {
        case GGML_OP_ADD:
        case GGML_OP_ADD1:
        case GGML_OP_SUB:
        case GGML_OP_MUL:
        case GGML_OP_DIV:
        case GGML_OP_SQRT:
        case GGML_OP_LOG:
        case GGML_OP_NORM:
        case GGML_OP_MUL_MAT:
        case GGML_OP_MUL_MAT_ID:
        case GGML_OP_SOFT_MAX:
        case GGML_OP_LEAKY_RELU:
            return false;

        case GGML_OP_UNARY:
            switch (ggml_get_unary_op(dst)) {
                case GGML_UNARY_OP_ABS:
                case GGML_UNARY_OP_SGN:
                case GGML_UNARY_OP_NEG:
                case GGML_UNARY_OP_STEP:
                case GGML_UNARY_OP_TANH:
                case GGML_UNARY_OP_ELU:
                case GGML_UNARY_OP_RELU:
                case GGML_UNARY_OP_SIGMOID:
                case GGML_UNARY_OP_GELU:
                case GGML_UNARY_OP_GELU_QUICK:
                case GGML_UNARY_OP_SILU:
                case GGML_UNARY_OP_HARDSWISH:
                case GGML_UNARY_OP_HARDSIGMOID:
                    return false;
                case GGML_UNARY_OP_EXP:
                    break; // call your EXP kernel here
                default:
                    return false;
            }
            break;

        default:
            return false;
    }

    return true;

    GGML_UNUSED(ctx);
}
```
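To actually get nodes routed into that dispatcher, your backend's graph_compute callback (part of struct ggml_backend_i, which this answer does not show) walks the compute graph and calls it per node. A hedged sketch, assuming the ggml_backend_YOUR_BACKEND_NAME_context type from above is stored in backend->context:

```cpp
#include "ggml-impl.h"          // GGML_LOG_ERROR
#include "ggml-backend-impl.h"  // struct ggml_backend definition

static enum ggml_status ggml_backend_YOUR_BACKEND_NAME_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    ggml_backend_YOUR_BACKEND_NAME_context * ctx =
        (ggml_backend_YOUR_BACKEND_NAME_context *) backend->context;

    for (int i = 0; i < cgraph->n_nodes; i++) {
        struct ggml_tensor * node = cgraph->nodes[i];

        // empty tensors and no-op nodes carry no computation
        if (ggml_is_empty(node) || node->op == GGML_OP_NONE) {
            continue;
        }

        if (!ggml_YOUR_BACKEND_NAME_compute_forward(*ctx, node)) {
            GGML_LOG_ERROR("%s: unsupported op %s\n", __func__, ggml_op_desc(node));
            return GGML_STATUS_FAILED;
        }
    }

    return GGML_STATUS_SUCCESS;
}
```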
That should be the main bulk of registering a device and getting compute operations forwarded to your backend. Please take note that GGML's matrix multiplication is computed with the first operand transposed: ggml_mul_mat(ctx, A, B) expects A and B to have the same ne[0] (number of columns) and produces C = B·Aᵀ, so your matmul kernel has to account for that layout.
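For instance, a small self-contained illustration of that convention (the dimensions are made up; the shape assertion reflects how the documented API behaves):

```cpp
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /* .mem_size   = */ 16 * 1024 * 1024,
        /* .mem_buffer = */ NULL,
        /* .no_alloc   = */ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // A: 4 columns (K), 3 rows (N); B: 4 columns (K), 2 rows (M)
    struct ggml_tensor * A = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 3);
    struct ggml_tensor * B = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 2);

    // C = B * A^T: 3 columns (N), 2 rows (M)
    struct ggml_tensor * C = ggml_mul_mat(ctx, A, B);
    GGML_ASSERT(C->ne[0] == 3 && C->ne[1] == 2);

    ggml_free(ctx);
    return 0;
}
```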
You may choose to refer to my zDNN implementation here: https://github.com/taronaeo/llama.cpp-s390x/blob/zdnn-accelerator-backend/ggml/src/ggml-zdnn/ggml-zdnn.cpp
-
Thanks, this answer is very helpful to me.