rocm: fix distributed inference on unified-memory APUs (strix halo / gfx1151) #407

Original file line number	Diff line number	Diff line change
Expand Up		@@ -47,6 +47,7 @@ struct cuda_model_image {
		const void *host_base;
		uint64_t size;
		char *device_ptr;
	uint64_t device_offset;
		};

		struct cuda_q8_f16_range {
Expand Down Expand Up		@@ -206,11 +207,14 @@ static int cuda_model_image_find(const void *model_map) {
		}

		static const char cuda_model_image_ptr(const void model_map, uint64_t offset) {
	const int idx = cuda_model_image_find(model_map);
	if (idx < 0) return NULL;
	const cuda_model_image &img = g_model_images[(size_t)idx];
	if (offset > img.size) return NULL;
	return img.device_ptr + offset;
	for (size_t i = 0; i < g_model_images.size(); i++) {
	const cuda_model_image &img = g_model_images[i];
	if (img.host_base != model_map) continue;
	if (offset < img.device_offset) continue;
	if (offset - img.device_offset >= img.size) continue;
	return img.device_ptr + (offset - img.device_offset);
	}
	return NULL;
		}

		static int cuda_model_image_owned(const void *model_map) {
Expand Down Expand Up		@@ -1046,13 +1050,29 @@ static char cuda_model_arena_alloc(uint64_t bytes, const char what) {
		void *dev = NULL;
		cudaError_t err = cudaMalloc(&dev, (size_t)chunk);
		if (err != cudaSuccess) {
	fprintf(stderr, DS4_GPU_LOG_PREFIX "model arena alloc failed for %s (%.2f MiB chunk): %s\n",
	what ? what : "weights",
	(double)chunk / 1048576.0,
	cudaGetErrorString(err));
		(void)cudaGetLastError();
	g_model_cache_full = 1;
	return NULL;
	uint64_t fallback = chunk / 2u;
	while (fallback >= aligned) {
	err = cudaMalloc(&dev, (size_t)fallback);
	if (err == cudaSuccess) break;
	(void)cudaGetLastError();
	fallback /= 2u;
	}
	if (err != cudaSuccess) {
	err = cudaMalloc(&dev, (size_t)aligned);
	if (err != cudaSuccess) {
	fprintf(stderr, DS4_GPU_LOG_PREFIX "model arena alloc failed for %s (%.2f MiB): %s\n",
	what ? what : "weights",
	(double)aligned / 1048576.0,
	cudaGetErrorString(err));
	(void)cudaGetLastError();
	g_model_cache_full = 1;
	return NULL;
	}
	fallback = aligned;
	}
	g_model_arenas.push_back({(char *)dev, fallback, aligned});
	return (char *)dev;
		}
		g_model_arenas.push_back({(char *)dev, chunk, aligned});
		return (char *)dev;
Expand Down Expand Up		@@ -1143,25 +1163,19 @@ static const char *cuda_model_range_ptr_from_fd(

		static int cuda_model_copy_chunked(const void *model_map, uint64_t model_size, uint64_t map_offset, uint64_t map_size) {
		if (!model_map \|\| model_size == 0 \|\| map_offset > model_size \|\| map_size > model_size - map_offset) return 0;
	if (cuda_model_image_owned(model_map)) {
	g_model_host_base = model_map;
	g_model_device_base = cuda_model_image_ptr(model_map, 0);
	g_model_registered_size = model_size;
	g_model_device_owned = 1;
	return 1;
	}
	if (map_size == 0) return 0;

		void *dev = NULL;
		const double t0 = cuda_wall_sec();
	cudaError_t err = cudaMalloc(&dev, (size_t)model_size);
	cudaError_t err = cudaMalloc(&dev, (size_t)map_size);
		if (err != cudaSuccess) {
		fprintf(stderr, DS4_GPU_LOG_PREFIX "model allocation skipped: %s\n", cudaGetErrorString(err));
		(void)cudaGetLastError();
		return 0;
		}

		fprintf(stderr, DS4_GPU_LOG_PREFIX "chunk-copying %.2f GiB model image\n",
	(double)model_size / 1073741824.0);
	(double)map_size / 1073741824.0);

		const uint64_t chunk = cuda_model_copy_chunk_bytes();
		const uint64_t stage_bytes = chunk + (g_model_direct_align > 1 ? g_model_direct_align : 1);
Expand All		@@ -1172,8 +1186,8 @@ static int cuda_model_copy_chunked(const void *model_map, uint64_t model_size, u

		uint64_t copied = 0;
		uint64_t chunk_idx = 0;
	while (copied < model_size) {
	const uint64_t n = (model_size - copied < chunk) ? (model_size - copied) : chunk;
	while (copied < map_size) {
	const uint64_t n = (map_size - copied < chunk) ? (map_size - copied) : chunk;
		const uint64_t bi = chunk_idx % 4u;
		if (chunk_idx >= 4u) {
		err = cudaEventSynchronize(g_model_stage_event[bi]);
Expand All		@@ -1186,7 +1200,7 @@ static int cuda_model_copy_chunked(const void *model_map, uint64_t model_size, u
		}
		const char *payload = NULL;
		if (!cuda_model_stage_read(g_model_stage[bi], g_model_stage_bytes,
	copied, n, &payload)) {
	map_offset + copied, n, &payload)) {
		fprintf(stderr, DS4_GPU_LOG_PREFIX "model staged read failed at %.2f GiB: %s\n",
		(double)copied / 1073741824.0, strerror(errno));
		(void)cudaFree(dev);
Expand All		@@ -1208,11 +1222,11 @@ static int cuda_model_copy_chunked(const void *model_map, uint64_t model_size, u
		(void)cudaGetLastError();
		return 0;
		}
	cuda_model_drop_file_pages(copied, n);
	cuda_model_discard_source_pages(model_map, model_size, copied, n);
	cuda_model_drop_file_pages(map_offset + copied, n);
	cuda_model_discard_source_pages(model_map, model_size, map_offset + copied, n);
		copied += n;
		chunk_idx++;
	cuda_model_load_progress_note(copied > map_offset ? copied - map_offset : 0);
	cuda_model_load_progress_note(copied);
		}
		err = cudaStreamSynchronize(g_model_upload_stream);
		if (err != cudaSuccess) {
Expand All		@@ -1221,9 +1235,11 @@ static int cuda_model_copy_chunked(const void *model_map, uint64_t model_size, u
		(void)cudaGetLastError();
		return 0;
		}
	g_model_images.push_back({model_map, model_size, (char *)dev});
	g_model_images.push_back({model_map, map_size, (char *)dev, map_offset});
		g_model_host_base = model_map;
	g_model_device_base = (const char *)dev;
	/* With multiple disjoint images there is no single base pointer;
	* all tensor lookups go through cuda_model_image_ptr() instead. */
	g_model_device_base = NULL;
		g_model_registered_size = model_size;
		g_model_device_owned = 1;
		const double t1 = cuda_wall_sec();
Expand Down Expand Up		@@ -1511,12 +1527,14 @@ extern "C" int ds4_gpu_set_model_map_spans(
		uint64_t max_tensor_bytes) {
		(void)max_tensor_bytes;
		if (!model_map \|\| model_size == 0 \|\| !offsets \|\| !sizes \|\| count == 0) return 0;
	uint64_t span_bytes = 0;
		for (uint32_t i = 0; i < count; i++) {
		if (offsets[i] > model_size \|\|
		sizes[i] == 0 \|\|
		sizes[i] > model_size - offsets[i]) {
		return 0;
		}
	span_bytes += sizes[i];
		}
		uint64_t min_offset = offsets[0];
		uint64_t max_end = offsets[0] + sizes[0];
Expand All		@@ -1526,7 +1544,36 @@ extern "C" int ds4_gpu_set_model_map_spans(
		if (end > max_end) max_end = end;
		}
		if (!ds4_gpu_set_model_map(model_map, model_size)) return 0;
	return cuda_model_copy_chunked(model_map, model_size, min_offset, max_end - min_offset);
	/* If the bounding box is tight (<=10% waste), copy it as one
	* contiguous range. Otherwise sort and merge to avoid copying
	* large gaps between non-adjacent spans. */
	const uint64_t bbox = max_end - min_offset;
	if (bbox <= span_bytes + span_bytes / 10u) {
	return cuda_model_copy_chunked(model_map, model_size, min_offset, bbox);
	}
	/* Bounding box has large gaps. Sort the spans, merge adjacent ones,
	* and bulk-copy each contiguous group individually. */
	std::vector<std::pair<uint64_t, uint64_t>> sorted(count);
	for (uint32_t i = 0; i < count; i++) sorted[i] = {offsets[i], offsets[i] + sizes[i]};
	std::sort(sorted.begin(), sorted.end());
	uint64_t grp_off = sorted[0].first;
	uint64_t grp_end = sorted[0].second;
	for (uint32_t i = 1; i <= count; i++) {
	/* Merge spans separated by <= 64 KiB to reduce the number of
	* device allocations without wasting meaningful memory. */
	const uint64_t gap = 64ull * 1024ull;
	if (i < count && sorted[i].first <= grp_end + gap) {
	if (sorted[i].second > grp_end) grp_end = sorted[i].second;
	continue;
	}
	if (!cuda_model_copy_chunked(model_map, model_size, grp_off, grp_end - grp_off))
	return 0;
	if (i < count) {
	grp_off = sorted[i].first;
	grp_end = sorted[i].second;
	}
	}
	return 1;
		}

		extern "C" int ds4_gpu_set_model_fd(int fd) {
Expand Down

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

rocm: fix distributed inference on unified-memory APUs (strix halo / gfx1151) #407

Are you sure you want to change the base?

rocm: fix distributed inference on unified-memory APUs (strix halo / gfx1151) #407

Filter by extension

Uh oh!

Uh oh!

Diff view

Diff view

There are no files selected for viewing