# How to add a new backend for llama.cpp #14359

Answered by taronaeo
yuenyu1 asked this question in Q&A

How do I add a new backend for llama.cpp? Is there any documentation?

If by backend you mean a hardware accelerator or GPU, unfortunately I did not find any documentation on it either, but I did manage to get a backend registered with its operations.

Backends live in the ggml/src/ggml-YOUR-BACKEND-NAME directory. You will have to provide your own CMakeLists.txt to compile the backend (see the sketch just below), and the backend source has to contain the following functions to register itself properly.
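
For reference, here is a minimal CMakeLists.txt sketch. It assumes the ggml_add_backend_library() helper defined in ggml/src/CMakeLists.txt, which the in-tree backends such as ggml-blas use to set up their targets; the vendor library name is a placeholder.

```cmake
# ggml/src/ggml-YOUR-BACKEND-NAME/CMakeLists.txt -- minimal sketch.
# Assumes the ggml_add_backend_library() helper from ggml/src/CMakeLists.txt,
# the same way the in-tree backends (e.g. ggml-blas) set up their targets.
ggml_add_backend_library(ggml-YOUR-BACKEND-NAME
                         ggml-YOUR-BACKEND-NAME.cpp
                        )

# Placeholder: link whatever SDK/library your accelerator requires.
target_link_libraries(ggml-YOUR-BACKEND-NAME PRIVATE your_accelerator_lib)
```

You will also need to hook the new directory into the build; if I read the build scripts correctly, the in-tree backends do this through the ggml_add_backend() calls in ggml/src/CMakeLists.txt.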

1. Backend Registration

Inform GGML that you have a backend, and provide the necessary interface for GGML to interact with it.

```cpp
ggml_backend_reg_t ggml_backend_YOUR_BACKEND_NAME_reg(void) {
    static struct ggml_backend_reg ggml_backend_YOUR_BACKEND_NAME_reg = {
        /* .api_version = */ GGML_YOUR_BACKEND_NAME_BACKEND_VERSION,
        /* .interface   = */ ggml_backend_YOUR_BACKEND_NAME_reg_i,
        /* .context     = */ NULL,
    };

    return &ggml_backend_YOUR_BACKEND_NAME_reg;
}

GGML_BACKEND_DL_IMPL(ggml_backend_YOUR_BACKEND_NAME_reg)
```
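
A note beyond the snippet above: GGML_BACKEND_DL_IMPL provides the entry point used when the backend is built as a dynamically loadable library; for a static build, the in-tree backends additionally register themselves in ggml/src/ggml-backend-reg.cpp, so you will likely need to add your backend there too.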
2. Backend Registration Interface
```cpp
static const struct ggml_backend_reg_i ggml_backend_YOUR_BACKEND_NAME_reg_i = {
    /* .get_name         = */ ggml_backend_YOUR_BACKEND_NAME_reg_get_name,
    /* .get_device_count = */ ggml_backend_YOUR_BACKEND_NAME_reg_get_device_count,
    /* .get_device       = */ ggml_backend_YOUR_BACKEND_NAME_reg_get_device,
    /* .get_proc_address = */ ggml_backend_YOUR_BACKEND_NAME_reg_get_proc_address,
};
```
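
These callbacks can be trivial for a single-device backend. A minimal sketch (the returned name string is a placeholder, and I am assuming the accelerator exposes exactly one device):

```cpp
static const char * ggml_backend_YOUR_BACKEND_NAME_reg_get_name(ggml_backend_reg_t reg) {
    return "YOUR_BACKEND_NAME";  // placeholder backend name

    GGML_UNUSED(reg);
}

static size_t ggml_backend_YOUR_BACKEND_NAME_reg_get_device_count(ggml_backend_reg_t reg) {
    return 1;  // assumption: a single-device accelerator

    GGML_UNUSED(reg);
}
```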
3. Backend Device and Proc Address
```cpp
static ggml_backend_dev_t ggml_backend_YOUR_BACKEND_NAME_reg_get_device(ggml_backend_reg_t reg, size_t index) {
    GGML_ASSERT(index == 0);

    static ggml_backend_device ggml_backend_YOUR_BACKEND_NAME_device = {
        /* .iface   = */ ggml_backend_YOUR_BACKEND_NAME_device_i,
        /* .reg     = */ reg,
        /* .context = */ nullptr,
    };

    return &ggml_backend_YOUR_BACKEND_NAME_device;
}

static void * ggml_backend_YOUR_BACKEND_NAME_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
    return nullptr;

    GGML_UNUSED(reg);
    GGML_UNUSED(name);
}
```
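
As far as I can tell, get_proc_address is how GGML looks up optional, backend-specific functions by name (the CPU backend exposes things like its thread-count setter this way), so returning nullptr for every name is fine until you actually need one.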
4. Actual Device Interface
```cpp
static const struct ggml_backend_device_i ggml_backend_YOUR_BACKEND_NAME_device_i = {
    /* .get_name             = */ ggml_backend_YOUR_BACKEND_NAME_device_get_name,
    /* .get_description      = */ ggml_backend_YOUR_BACKEND_NAME_device_get_desc,
    /* .get_memory           = */ ggml_backend_YOUR_BACKEND_NAME_device_get_memory,
    /* .get_type             = */ ggml_backend_YOUR_BACKEND_NAME_device_get_type,
    /* .get_props            = */ ggml_backend_YOUR_BACKEND_NAME_device_get_props,
    /* .init_backend         = */ ggml_backend_YOUR_BACKEND_NAME_device_init_backend,
    /* .get_buffer_type      = */ ggml_backend_YOUR_BACKEND_NAME_device_get_buffer_type,
    /* .get_host_buffer_type = */ NULL,
    /* .buffer_from_host_ptr = */ ggml_backend_YOUR_BACKEND_NAME_device_buffer_from_host_ptr,
    /* .supports_op          = */ ggml_backend_YOUR_BACKEND_NAME_device_supports_op,
    /* .supports_buft        = */ ggml_backend_YOUR_BACKEND_NAME_device_supports_buft,
    /* .offload_op           = */ NULL,
    /* .event_new            = */ NULL,
    /* .event_free           = */ NULL,
    /* .event_synchronize    = */ NULL,
};
```
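
The simplest of these device callbacks can be stubbed out as in the sketch below. The name, description, and memory figures are placeholders, and I am assuming an accelerator-type device (use GGML_BACKEND_DEVICE_TYPE_GPU for a discrete GPU instead):

```cpp
static const char * ggml_backend_YOUR_BACKEND_NAME_device_get_name(ggml_backend_dev_t dev) {
    return "YOUR_BACKEND_NAME";  // placeholder device name

    GGML_UNUSED(dev);
}

static const char * ggml_backend_YOUR_BACKEND_NAME_device_get_desc(ggml_backend_dev_t dev) {
    return "YOUR accelerator description";  // placeholder description

    GGML_UNUSED(dev);
}

static void ggml_backend_YOUR_BACKEND_NAME_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
    // placeholder: report whatever your accelerator actually exposes
    *free  = 0;
    *total = 0;

    GGML_UNUSED(dev);
}

static enum ggml_backend_dev_type ggml_backend_YOUR_BACKEND_NAME_device_get_type(ggml_backend_dev_t dev) {
    return GGML_BACKEND_DEVICE_TYPE_ACCEL;

    GGML_UNUSED(dev);
}
```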
5. Check if the Device Supports an Operation
```cpp
static bool ggml_backend_YOUR_BACKEND_NAME_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
    // src0/src1 come in handy once you add per-op shape and type checks
    const struct ggml_tensor * src0 = op->src[0];
    const struct ggml_tensor * src1 = op->src[1];

    switch (op->op) {
        // GGML required ops (views/no-ops every backend must accept)
        case GGML_OP_NONE:
        case GGML_OP_RESHAPE:
        case GGML_OP_VIEW:
        case GGML_OP_PERMUTE:
        case GGML_OP_TRANSPOSE:
            break;
        case GGML_OP_ADD:
        case GGML_OP_ADD1:
        case GGML_OP_SUB:
        case GGML_OP_MUL:
        case GGML_OP_DIV:
        case GGML_OP_SQRT:
        case GGML_OP_LOG:
        case GGML_OP_NORM:
        case GGML_OP_MUL_MAT:
        case GGML_OP_MUL_MAT_ID:
        case GGML_OP_SOFT_MAX:
        case GGML_OP_LEAKY_RELU:
            return false; // TODO: disable all support first to showcase device reg
        case GGML_OP_UNARY:
            switch (ggml_get_unary_op(op)) {
                case GGML_UNARY_OP_ABS:
                case GGML_UNARY_OP_SGN:
                case GGML_UNARY_OP_NEG:
                case GGML_UNARY_OP_STEP:
                case GGML_UNARY_OP_TANH:
                case GGML_UNARY_OP_ELU:
                case GGML_UNARY_OP_RELU:
                case GGML_UNARY_OP_SIGMOID:
                case GGML_UNARY_OP_GELU:
                case GGML_UNARY_OP_GELU_QUICK:
                case GGML_UNARY_OP_SILU:
                case GGML_UNARY_OP_HARDSWISH:
                case GGML_UNARY_OP_HARDSIGMOID:
                case GGML_UNARY_OP_EXP:
                    break;
                default:
                    return false;
            }
            break; // without this break, control falls into the default below
        default:
            return false;
    }

    return true;

    GGML_UNUSED(dev);
    GGML_UNUSED(src0);
    GGML_UNUSED(src1);
}
```
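
A useful property of supports_op: when the graph is scheduled with ggml_backend_sched (which llama.cpp uses), any node your device rejects here is assigned to a fallback backend such as the CPU. That is why it is safe to start with everything returning false, as above, and enable ops one at a time.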
6. Dispatch the Computation of an Operation
```cpp
inline bool ggml_YOUR_BACKEND_NAME_compute_forward(ggml_backend_YOUR_BACKEND_NAME_context & ctx,
                                                   ggml_tensor * dst) {
    switch (dst->op) {
        case GGML_OP_ADD:
        case GGML_OP_ADD1:
        case GGML_OP_SUB:
        case GGML_OP_MUL:
        case GGML_OP_DIV:
        case GGML_OP_SQRT:
        case GGML_OP_LOG:
        case GGML_OP_NORM:
        case GGML_OP_MUL_MAT:
        case GGML_OP_MUL_MAT_ID:
        case GGML_OP_SOFT_MAX:
        case GGML_OP_LEAKY_RELU:
            return false;
        case GGML_OP_UNARY:
            switch (ggml_get_unary_op(dst)) {
                case GGML_UNARY_OP_ABS:
                case GGML_UNARY_OP_SGN:
                case GGML_UNARY_OP_NEG:
                case GGML_UNARY_OP_STEP:
                case GGML_UNARY_OP_TANH:
                case GGML_UNARY_OP_ELU:
                case GGML_UNARY_OP_RELU:
                case GGML_UNARY_OP_SIGMOID:
                case GGML_UNARY_OP_GELU:
                case GGML_UNARY_OP_GELU_QUICK:
                case GGML_UNARY_OP_SILU:
                case GGML_UNARY_OP_HARDSWISH:
                case GGML_UNARY_OP_HARDSIGMOID:
                    return false;
                case GGML_UNARY_OP_EXP:
                    // this is where you would call your accelerator kernel,
                    // e.g. a (hypothetical) ggml_YOUR_BACKEND_NAME_exp(ctx, dst)
                    break;
                default:
                    return false;
            }
            break; // without this break, control falls into the default below
        default:
            return false;
    }

    return true;

    GGML_UNUSED(ctx);
}
```
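
What is not shown above is the glue that actually invokes this dispatcher. That happens in the graph_compute callback of the backend's ggml_backend_i interface; below is a sketch modeled on how the CPU/BLAS backends iterate over graph nodes. The skipped no-op list and the error handling are my assumptions, not something fixed by GGML:

```cpp
static enum ggml_status ggml_backend_YOUR_BACKEND_NAME_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    ggml_backend_YOUR_BACKEND_NAME_context * ctx =
        (ggml_backend_YOUR_BACKEND_NAME_context *) backend->context;

    for (int i = 0; i < cgraph->n_nodes; i++) {
        ggml_tensor * node = cgraph->nodes[i];

        // view-style ops carry no computation, so skip them
        if (ggml_is_empty(node)
            || node->op == GGML_OP_NONE
            || node->op == GGML_OP_RESHAPE
            || node->op == GGML_OP_VIEW
            || node->op == GGML_OP_PERMUTE
            || node->op == GGML_OP_TRANSPOSE) {
            continue;
        }

        if (!ggml_YOUR_BACKEND_NAME_compute_forward(*ctx, node)) {
            // GGML_LOG_ERROR comes from ggml-impl.h
            GGML_LOG_ERROR("%s: unsupported op %s\n", __func__, ggml_op_name(node->op));
            return GGML_STATUS_FAILED;
        }
    }

    return GGML_STATUS_SUCCESS;
}
```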

That should be the main bulk of registering a device and getting compute operations forwarded to your backend. Please take note that GGML's matrix multiplication is computed as $C = B \cdot A^T$, where $C$ is the destination tensor, $B$ the input tensor, and $A$ the weights tensor.
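
To make the shapes concrete: ggml_mul_mat requires src0->ne[0] == src1->ne[0], so if the weights $A$ have ne = {K, M} and the input $B$ has ne = {K, N}, the destination $C = B \cdot A^T$ comes out with ne = {M, N}, with the shared $K$ dimension contiguous in both operands.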

You may choose to refer to my zDNN implementation here: https://github.com/taronaeo/llama.cpp-s390x/blob/zdnn-accelerator-backend/ggml/src/ggml-zdnn/ggml-zdnn.cpp

1 reply

Thanks, this answer is very helpful to me.

Answer selected by yuenyu1