Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

rocm: fix distributed inference on unified-memory APUs (strix halo / gfx1151) #407

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
kyuz0 wants to merge 3 commits into antirez:main
base: main
Choose a base branch
Loading
from kyuz0:rocm-multi-node
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 76 additions & 29 deletions rocm/ds4_rocm_runtime.cuh
View file Open in desktop
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ struct cuda_model_image {
const void *host_base;
uint64_t size;
char *device_ptr;
uint64_t device_offset;
};

struct cuda_q8_f16_range {
Expand Down Expand Up @@ -206,11 +207,14 @@ static int cuda_model_image_find(const void *model_map) {
}

static const char *cuda_model_image_ptr(const void *model_map, uint64_t offset) {
const int idx = cuda_model_image_find(model_map);
if (idx < 0) return NULL;
const cuda_model_image &img = g_model_images[(size_t)idx];
if (offset > img.size) return NULL;
return img.device_ptr + offset;
for (size_t i = 0; i < g_model_images.size(); i++) {
const cuda_model_image &img = g_model_images[i];
if (img.host_base != model_map) continue;
if (offset < img.device_offset) continue;
if (offset - img.device_offset >= img.size) continue;
return img.device_ptr + (offset - img.device_offset);
}
return NULL;
}

static int cuda_model_image_owned(const void *model_map) {
Expand Down Expand Up @@ -1046,13 +1050,29 @@ static char *cuda_model_arena_alloc(uint64_t bytes, const char *what) {
void *dev = NULL;
cudaError_t err = cudaMalloc(&dev, (size_t)chunk);
if (err != cudaSuccess) {
fprintf(stderr, DS4_GPU_LOG_PREFIX "model arena alloc failed for %s (%.2f MiB chunk): %s\n",
what ? what : "weights",
(double)chunk / 1048576.0,
cudaGetErrorString(err));
(void)cudaGetLastError();
g_model_cache_full = 1;
return NULL;
uint64_t fallback = chunk / 2u;
while (fallback >= aligned) {
err = cudaMalloc(&dev, (size_t)fallback);
if (err == cudaSuccess) break;
(void)cudaGetLastError();
fallback /= 2u;
}
if (err != cudaSuccess) {
err = cudaMalloc(&dev, (size_t)aligned);
if (err != cudaSuccess) {
fprintf(stderr, DS4_GPU_LOG_PREFIX "model arena alloc failed for %s (%.2f MiB): %s\n",
what ? what : "weights",
(double)aligned / 1048576.0,
cudaGetErrorString(err));
(void)cudaGetLastError();
g_model_cache_full = 1;
return NULL;
}
fallback = aligned;
}
g_model_arenas.push_back({(char *)dev, fallback, aligned});
return (char *)dev;
}
g_model_arenas.push_back({(char *)dev, chunk, aligned});
return (char *)dev;
Expand Down Expand Up @@ -1143,25 +1163,19 @@ static const char *cuda_model_range_ptr_from_fd(

static int cuda_model_copy_chunked(const void *model_map, uint64_t model_size, uint64_t map_offset, uint64_t map_size) {
if (!model_map || model_size == 0 || map_offset > model_size || map_size > model_size - map_offset) return 0;
if (cuda_model_image_owned(model_map)) {
g_model_host_base = model_map;
g_model_device_base = cuda_model_image_ptr(model_map, 0);
g_model_registered_size = model_size;
g_model_device_owned = 1;
return 1;
}
if (map_size == 0) return 0;

void *dev = NULL;
const double t0 = cuda_wall_sec();
cudaError_t err = cudaMalloc(&dev, (size_t)model_size);
cudaError_t err = cudaMalloc(&dev, (size_t)map_size);
if (err != cudaSuccess) {
fprintf(stderr, DS4_GPU_LOG_PREFIX "model allocation skipped: %s\n", cudaGetErrorString(err));
(void)cudaGetLastError();
return 0;
}

fprintf(stderr, DS4_GPU_LOG_PREFIX "chunk-copying %.2f GiB model image\n",
(double)model_size / 1073741824.0);
(double)map_size / 1073741824.0);

const uint64_t chunk = cuda_model_copy_chunk_bytes();
const uint64_t stage_bytes = chunk + (g_model_direct_align > 1 ? g_model_direct_align : 1);
Expand All @@ -1172,8 +1186,8 @@ static int cuda_model_copy_chunked(const void *model_map, uint64_t model_size, u

uint64_t copied = 0;
uint64_t chunk_idx = 0;
while (copied < model_size) {
const uint64_t n = (model_size - copied < chunk) ? (model_size - copied) : chunk;
while (copied < map_size) {
const uint64_t n = (map_size - copied < chunk) ? (map_size - copied) : chunk;
const uint64_t bi = chunk_idx % 4u;
if (chunk_idx >= 4u) {
err = cudaEventSynchronize(g_model_stage_event[bi]);
Expand All @@ -1186,7 +1200,7 @@ static int cuda_model_copy_chunked(const void *model_map, uint64_t model_size, u
}
const char *payload = NULL;
if (!cuda_model_stage_read(g_model_stage[bi], g_model_stage_bytes,
copied, n, &payload)) {
map_offset + copied, n, &payload)) {
fprintf(stderr, DS4_GPU_LOG_PREFIX "model staged read failed at %.2f GiB: %s\n",
(double)copied / 1073741824.0, strerror(errno));
(void)cudaFree(dev);
Expand All @@ -1208,11 +1222,11 @@ static int cuda_model_copy_chunked(const void *model_map, uint64_t model_size, u
(void)cudaGetLastError();
return 0;
}
cuda_model_drop_file_pages(copied, n);
cuda_model_discard_source_pages(model_map, model_size, copied, n);
cuda_model_drop_file_pages(map_offset + copied, n);
cuda_model_discard_source_pages(model_map, model_size, map_offset + copied, n);
copied += n;
chunk_idx++;
cuda_model_load_progress_note(copied > map_offset ? copied - map_offset : 0);
cuda_model_load_progress_note(copied);
}
err = cudaStreamSynchronize(g_model_upload_stream);
if (err != cudaSuccess) {
Expand All @@ -1221,9 +1235,11 @@ static int cuda_model_copy_chunked(const void *model_map, uint64_t model_size, u
(void)cudaGetLastError();
return 0;
}
g_model_images.push_back({model_map, model_size, (char *)dev});
g_model_images.push_back({model_map, map_size, (char *)dev, map_offset});
g_model_host_base = model_map;
g_model_device_base = (const char *)dev;
/* With multiple disjoint images there is no single base pointer;
* all tensor lookups go through cuda_model_image_ptr() instead. */
g_model_device_base = NULL;
g_model_registered_size = model_size;
g_model_device_owned = 1;
const double t1 = cuda_wall_sec();
Expand Down Expand Up @@ -1511,12 +1527,14 @@ extern "C" int ds4_gpu_set_model_map_spans(
uint64_t max_tensor_bytes) {
(void)max_tensor_bytes;
if (!model_map || model_size == 0 || !offsets || !sizes || count == 0) return 0;
uint64_t span_bytes = 0;
for (uint32_t i = 0; i < count; i++) {
if (offsets[i] > model_size ||
sizes[i] == 0 ||
sizes[i] > model_size - offsets[i]) {
return 0;
}
span_bytes += sizes[i];
}
uint64_t min_offset = offsets[0];
uint64_t max_end = offsets[0] + sizes[0];
Expand All @@ -1526,7 +1544,36 @@ extern "C" int ds4_gpu_set_model_map_spans(
if (end > max_end) max_end = end;
}
if (!ds4_gpu_set_model_map(model_map, model_size)) return 0;
return cuda_model_copy_chunked(model_map, model_size, min_offset, max_end - min_offset);
/* If the bounding box is tight (<=10% waste), copy it as one
* contiguous range. Otherwise sort and merge to avoid copying
* large gaps between non-adjacent spans. */
const uint64_t bbox = max_end - min_offset;
if (bbox <= span_bytes + span_bytes / 10u) {
return cuda_model_copy_chunked(model_map, model_size, min_offset, bbox);
}
/* Bounding box has large gaps. Sort the spans, merge adjacent ones,
* and bulk-copy each contiguous group individually. */
std::vector<std::pair<uint64_t, uint64_t>> sorted(count);
for (uint32_t i = 0; i < count; i++) sorted[i] = {offsets[i], offsets[i] + sizes[i]};
std::sort(sorted.begin(), sorted.end());
uint64_t grp_off = sorted[0].first;
uint64_t grp_end = sorted[0].second;
for (uint32_t i = 1; i <= count; i++) {
/* Merge spans separated by <= 64 KiB to reduce the number of
* device allocations without wasting meaningful memory. */
const uint64_t gap = 64ull * 1024ull;
if (i < count && sorted[i].first <= grp_end + gap) {
if (sorted[i].second > grp_end) grp_end = sorted[i].second;
continue;
}
if (!cuda_model_copy_chunked(model_map, model_size, grp_off, grp_end - grp_off))
return 0;
if (i < count) {
grp_off = sorted[i].first;
grp_end = sorted[i].second;
}
}
return 1;
}

extern "C" int ds4_gpu_set_model_fd(int fd) {
Expand Down

AltStyle によって変換されたページ (->オリジナル) /