How do I add a new backend to llama.cpp? Is there any documentation?
-
If by backend you mean a hardware accelerator or GPU, unfortunately I did not find any documentation on it either, but I did manage to get a backend registered together with its operations.
Backends live in the ggml/src/ggml-YOUR-BACKEND-NAME directory. You will have to provide your own CMakeLists.txt to compile your backend (a minimal sketch follows below), and your source has to contain the following functions for it to register properly.
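As a rough starting point, a minimal CMakeLists.txt might look like the sketch below. This assumes a recent ggml tree that provides the ggml_add_backend_library() helper; the target name and source file are placeholders you would replace.

```cmake
# ggml/src/ggml-YOUR-BACKEND-NAME/CMakeLists.txt
# Minimal sketch: adjust the target name and sources, and
# find_package()/link whatever accelerator SDK you depend on.
ggml_add_backend_library(ggml-YOUR-BACKEND-NAME
                         ggml-YOUR-BACKEND-NAME.cpp)

# Link your accelerator's runtime library here, e.g.:
# target_link_libraries(ggml-YOUR-BACKEND-NAME PRIVATE your_accel_runtime)
```

You will also likely need to hook the new directory into ggml/src/CMakeLists.txt (there is a ggml_add_backend() dispatch there) so that a GGML_YOUR_BACKEND_NAME option enables it.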
- Backend Registration
Inform GGML that you have a backend, and provide the necessary interface for GGML to interact with it. (The original snippet returned &ggml_backend_zdnn_reg, a leftover from my zDNN code; the placeholder name is used consistently here.)

```cpp
ggml_backend_reg_t ggml_backend_YOUR_BACKEND_NAME_reg(void) {
    static struct ggml_backend_reg ggml_backend_YOUR_BACKEND_NAME_reg = {
        /* .api_version = */ GGML_YOUR_BACKEND_NAME_BACKEND_VERSION,
        /* .interface   = */ ggml_backend_YOUR_BACKEND_NAME_reg_i,
        /* .context     = */ NULL,
    };

    return &ggml_backend_YOUR_BACKEND_NAME_reg;
}

GGML_BACKEND_DL_IMPL(ggml_backend_YOUR_BACKEND_NAME_reg)
```
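If I am reading ggml-backend-impl.h correctly, GGML_BACKEND_DL_IMPL is what defines the exported ggml_backend_init entry point, so the same backend can also be built as a dynamically loadable module.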
- Backend Registration Interface
This is the interface GGML uses to enumerate your backend's devices:

```cpp
static const struct ggml_backend_reg_i ggml_backend_YOUR_BACKEND_NAME_reg_i = {
    /* .get_name         = */ ggml_backend_YOUR_BACKEND_NAME_reg_get_name,
    /* .get_device_count = */ ggml_backend_YOUR_BACKEND_NAME_reg_get_device_count,
    /* .get_device       = */ ggml_backend_YOUR_BACKEND_NAME_reg_get_device,
    /* .get_proc_address = */ ggml_backend_YOUR_BACKEND_NAME_reg_get_proc_address,
};
```
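The first two members are not shown in this answer; for a single-device backend they can be trivial. A sketch (the bodies are my own, only the names match the table above):

```cpp
static const char * ggml_backend_YOUR_BACKEND_NAME_reg_get_name(ggml_backend_reg_t reg) {
    return "YOUR_BACKEND_NAME";

    GGML_UNUSED(reg);
}

static size_t ggml_backend_YOUR_BACKEND_NAME_reg_get_device_count(ggml_backend_reg_t reg) {
    return 1;  // single-device backend

    GGML_UNUSED(reg);
}
```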
- Backend Device and Proc Address

```cpp
static ggml_backend_dev_t ggml_backend_YOUR_BACKEND_NAME_reg_get_device(ggml_backend_reg_t reg, size_t index) {
    GGML_ASSERT(index == 0);

    static ggml_backend_device ggml_backend_YOUR_BACKEND_NAME_device = {
        /* .interface = */ ggml_backend_YOUR_BACKEND_NAME_device_i,
        /* .register  = */ reg,
        /* .context   = */ nullptr,
    };

    return &ggml_backend_YOUR_BACKEND_NAME_device;
}

static void * ggml_backend_YOUR_BACKEND_NAME_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
    return nullptr;

    GGML_UNUSED(reg);
    GGML_UNUSED(name);
}
```
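Returning nullptr from get_proc_address should be fine to start with: as far as I can tell it only exposes optional, backend-specific functions by name (the CPU backend publishes things like its set-n-threads function this way), so a new backend can begin without any.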
- Actual Device Interface

```cpp
static const struct ggml_backend_device_i ggml_backend_YOUR_BACKEND_NAME_device_i = {
    /* .get_name             = */ ggml_backend_YOUR_BACKEND_NAME_device_get_name,
    /* .get_description      = */ ggml_backend_YOUR_BACKEND_NAME_device_get_desc,
    /* .get_memory           = */ ggml_backend_YOUR_BACKEND_NAME_device_get_memory,
    /* .get_type             = */ ggml_backend_YOUR_BACKEND_NAME_device_get_type,
    /* .get_props            = */ ggml_backend_YOUR_BACKEND_NAME_device_get_props,
    /* .init_backend         = */ ggml_backend_YOUR_BACKEND_NAME_device_init_backend,
    /* .get_buffer_type      = */ ggml_backend_YOUR_BACKEND_NAME_device_get_buffer_type,
    /* .get_host_buffer_type = */ NULL,
    /* .buffer_from_host_ptr = */ ggml_backend_YOUR_BACKEND_NAME_device_buffer_from_host_ptr,
    /* .supports_op          = */ ggml_backend_YOUR_BACKEND_NAME_device_supports_op,
    /* .supports_buft        = */ ggml_backend_YOUR_BACKEND_NAME_device_supports_buft,
    /* .offload_op           = */ NULL,
    /* .event_new            = */ NULL,
    /* .event_free           = */ NULL,
    /* .event_synchronize    = */ NULL,
};
```
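Most of these members are straightforward getters. As an illustration only (the strings and device type are assumptions you would replace; these go in the same backend .cpp file), the name/description/type trio might look like:

```cpp
static const char * ggml_backend_YOUR_BACKEND_NAME_device_get_name(ggml_backend_dev_t dev) {
    return "YOUR_BACKEND_NAME";

    GGML_UNUSED(dev);
}

static const char * ggml_backend_YOUR_BACKEND_NAME_device_get_desc(ggml_backend_dev_t dev) {
    return "YOUR ACCELERATOR DESCRIPTION";

    GGML_UNUSED(dev);
}

static enum ggml_backend_dev_type ggml_backend_YOUR_BACKEND_NAME_device_get_type(ggml_backend_dev_t dev) {
    // GGML_BACKEND_DEVICE_TYPE_ACCEL for an accelerator working on host memory,
    // GGML_BACKEND_DEVICE_TYPE_GPU for a discrete device with its own memory
    return GGML_BACKEND_DEVICE_TYPE_ACCEL;

    GGML_UNUSED(dev);
}
```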
- Check if the device supports an operation
Note that the original flattened snippet was missing a break after the inner GGML_OP_UNARY switch, so supported unary ops fell through to the default and returned false; that is fixed here.

```cpp
static bool ggml_backend_YOUR_BACKEND_NAME_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
    const struct ggml_tensor * src0 = op->src[0];
    const struct ggml_tensor * src1 = op->src[1];

    switch (op->op) {
        // ops GGML requires every backend to accept
        case GGML_OP_NONE:
        case GGML_OP_RESHAPE:
        case GGML_OP_VIEW:
        case GGML_OP_PERMUTE:
        case GGML_OP_TRANSPOSE:
            break;

        case GGML_OP_ADD:
        case GGML_OP_ADD1:
        case GGML_OP_SUB:
        case GGML_OP_MUL:
        case GGML_OP_DIV:
        case GGML_OP_SQRT:
        case GGML_OP_LOG:
        case GGML_OP_NORM:
        case GGML_OP_MUL_MAT:
        case GGML_OP_MUL_MAT_ID:
        case GGML_OP_SOFT_MAX:
        case GGML_OP_LEAKY_RELU:
            return false; // TODO: disable all support first to showcase device reg

        case GGML_OP_UNARY:
            switch (ggml_get_unary_op(op)) {
                case GGML_UNARY_OP_ABS:
                case GGML_UNARY_OP_SGN:
                case GGML_UNARY_OP_NEG:
                case GGML_UNARY_OP_STEP:
                case GGML_UNARY_OP_TANH:
                case GGML_UNARY_OP_ELU:
                case GGML_UNARY_OP_RELU:
                case GGML_UNARY_OP_SIGMOID:
                case GGML_UNARY_OP_GELU:
                case GGML_UNARY_OP_GELU_QUICK:
                case GGML_UNARY_OP_SILU:
                case GGML_UNARY_OP_HARDSWISH:
                case GGML_UNARY_OP_HARDSIGMOID:
                case GGML_UNARY_OP_EXP:
                    break;
                default:
                    return false;
            }
            break;

        default:
            return false;
    }

    return true;

    GGML_UNUSED(dev);
    GGML_UNUSED(src0);
    GGML_UNUSED(src1);
}
```
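supports_buft, referenced in the device interface above, decides which buffer types your device can operate on. A minimal sketch for an accelerator that works on host memory (an assumption; a discrete GPU would instead compare against its own buffer type):

```cpp
static bool ggml_backend_YOUR_BACKEND_NAME_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
    // a host-memory accelerator can typically read/write ordinary host buffers
    return ggml_backend_buft_is_host(buft);

    GGML_UNUSED(dev);
}
```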
- Dispatching computation of an operation
The same missing break after the inner switch is fixed here as well, so GGML_UNARY_OP_EXP actually reaches the return true path.

```cpp
inline bool ggml_YOUR_BACKEND_NAME_compute_forward(ggml_backend_YOUR_BACKEND_NAME_context & ctx, ggml_tensor * dst) {
    switch (dst->op) {
        case GGML_OP_ADD:
        case GGML_OP_ADD1:
        case GGML_OP_SUB:
        case GGML_OP_MUL:
        case GGML_OP_DIV:
        case GGML_OP_SQRT:
        case GGML_OP_LOG:
        case GGML_OP_NORM:
        case GGML_OP_MUL_MAT:
        case GGML_OP_MUL_MAT_ID:
        case GGML_OP_SOFT_MAX:
        case GGML_OP_LEAKY_RELU:
            return false;

        case GGML_OP_UNARY:
            switch (ggml_get_unary_op(dst)) {
                case GGML_UNARY_OP_ABS:
                case GGML_UNARY_OP_SGN:
                case GGML_UNARY_OP_NEG:
                case GGML_UNARY_OP_STEP:
                case GGML_UNARY_OP_TANH:
                case GGML_UNARY_OP_ELU:
                case GGML_UNARY_OP_RELU:
                case GGML_UNARY_OP_SIGMOID:
                case GGML_UNARY_OP_GELU:
                case GGML_UNARY_OP_GELU_QUICK:
                case GGML_UNARY_OP_SILU:
                case GGML_UNARY_OP_HARDSWISH:
                case GGML_UNARY_OP_HARDSIGMOID:
                    return false;
                case GGML_UNARY_OP_EXP:
                    break; // call your EXP kernel here
                default:
                    return false;
            }
            break;

        default:
            return false;
    }

    return true;

    GGML_UNUSED(ctx);
}
```
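To actually get nodes routed into that dispatcher, your backend's graph_compute callback (part of struct ggml_backend_i, which this answer does not show) walks the compute graph and calls it per node. A hedged sketch, assuming the ggml_backend_YOUR_BACKEND_NAME_context type from above is stored in backend->context:

```cpp
#include "ggml-impl.h"          // GGML_LOG_ERROR
#include "ggml-backend-impl.h"  // struct ggml_backend definition

static enum ggml_status ggml_backend_YOUR_BACKEND_NAME_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    ggml_backend_YOUR_BACKEND_NAME_context * ctx =
        (ggml_backend_YOUR_BACKEND_NAME_context *) backend->context;

    for (int i = 0; i < cgraph->n_nodes; i++) {
        struct ggml_tensor * node = cgraph->nodes[i];

        // empty tensors and no-op nodes carry no computation
        if (ggml_is_empty(node) || node->op == GGML_OP_NONE) {
            continue;
        }

        if (!ggml_YOUR_BACKEND_NAME_compute_forward(*ctx, node)) {
            GGML_LOG_ERROR("%s: unsupported op %s\n", __func__, ggml_op_desc(node));
            return GGML_STATUS_FAILED;
        }
    }

    return GGML_STATUS_SUCCESS;
}
```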
That should be the main bulk of registering a device and getting compute operations forwarded to your backend. Please take note that GGML's matrix multiplication is computed with the first operand transposed: ggml_mul_mat(ctx, A, B) expects A and B to have the same ne[0] (number of columns) and produces C = B·Aᵀ, so your matmul kernel has to account for that layout.
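For instance, a small self-contained illustration of that convention (the dimensions are made up; the shape assertion reflects how the documented API behaves):

```cpp
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        /* .mem_size   = */ 16 * 1024 * 1024,
        /* .mem_buffer = */ NULL,
        /* .no_alloc   = */ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // A: 4 columns (K), 3 rows (N); B: 4 columns (K), 2 rows (M)
    struct ggml_tensor * A = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 3);
    struct ggml_tensor * B = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 2);

    // C = B * A^T: 3 columns (N), 2 rows (M)
    struct ggml_tensor * C = ggml_mul_mat(ctx, A, B);
    GGML_ASSERT(C->ne[0] == 3 && C->ne[1] == 2);

    ggml_free(ctx);
    return 0;
}
```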
You may choose to refer to my zDNN implementation here: https://github.com/taronaeo/llama.cpp-s390x/blob/zdnn-accelerator-backend/ggml/src/ggml-zdnn/ggml-zdnn.cpp
-
Thanks, this answer is very helpful to me.