Commit a394039

ggml-cpu : add chunking support to mul_mat_id (ggml-org#11666)
* ggml-cpu : add chunking support to mul_mat_id
* allocate chunk counter in wdata
  parallelize src1 quantization by column to allow parallelization even when there is only one row
* disable for arm
* cleanup
* better way to disable for arm
* fix uninitialized counter when using 1 thread only
* revert test-backend-ops changes
1 parent be3bbd6 commit a394039
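
The heart of the change is the dynamic chunk distribution: each thread starts on the chunk matching its thread index and, when finished, claims the next unprocessed chunk by atomically bumping a shared counter. A minimal standalone sketch of that pattern, with a hypothetical process_chunk() standing in for the real per-chunk matrix work (C11 atomics, as in the patch):

    #include <stdatomic.h>
    #include <stdio.h>

    // Hypothetical stand-in for the per-chunk matrix multiplication.
    static void process_chunk(int chunk) { printf("chunk %d\n", chunk); }

    // The counter must be initialized to nth before the threads start, so the
    // first claimed chunk is the one after those initially assigned.
    static void worker(atomic_int * counter, int ith, int nth, int n_chunks) {
        int current_chunk = ith;             // thread ith starts on chunk ith
        while (current_chunk < n_chunks) {
            process_chunk(current_chunk);
            if (nth >= n_chunks) {
                break;                       // at most one chunk per thread: the counter is never needed
            }
            // relaxed ordering suffices: chunks are independent and a barrier follows
            current_chunk = atomic_fetch_add_explicit(counter, 1, memory_order_relaxed);
        }
    }

Threads that finish small chunks early simply pick up more work, which is what makes fixed-size chunking pay off when the per-expert workloads are uneven.
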

File tree

1 file changed: ggml/src/ggml-cpu/ggml-cpu.c

Lines changed: 184 additions & 85 deletions

@@ -7,10 +7,8 @@
 #include "ggml-cpu-impl.h"
 #include "ggml-cpu.h"
 #include "ggml-impl.h"
-#include "ggml-quants.h"
 #include "ggml-cpu-quants.h"
 #include "ggml-threading.h"
-#include "amx/amx.h"
 #include "ggml.h"

 #if defined(_MSC_VER) || defined(__MINGW32__)
@@ -1291,7 +1289,7 @@ struct ggml_threadpool {
     atomic_int n_graph;       // incremented when there is work to be done (i.e each graph)
     atomic_int GGML_CACHE_ALIGN n_barrier;
     atomic_int GGML_CACHE_ALIGN n_barrier_passed;
-    atomic_int current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.
+    atomic_int GGML_CACHE_ALIGN current_chunk; // currently processing chunk during Mat_Mul, shared between all the threads.

     // these are atomic as an annotation for thread-sanitizer
     atomic_bool stop;         // Used for stopping the threadpool altogether
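
Moving current_chunk onto its own cache line matters because every thread hammers this counter with atomic increments while neighboring threads spin on the barrier counters next to it. A rough sketch of the layout idea in portable C11 (the 64-byte line is an assumption here; ggml uses its own GGML_CACHE_ALIGN and CACHE_LINE_SIZE definitions):

    #include <stdatomic.h>

    #define LINE 64  // assumed cache-line size

    struct hot_counters {
        _Alignas(LINE) atomic_int n_barrier;     // one line per hot atomic:
        _Alignas(LINE) atomic_int current_chunk; // an increment on one cannot
                                                 // invalidate the line holding the other
    };
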
@@ -7490,13 +7488,15 @@ UseGgmlGemm1:;
     if (src1->type != vec_dot_type) {
         char * wdata = params->wdata;

+        const size_t nbw0 = ggml_type_size(vec_dot_type);
         const size_t nbw1 = ggml_row_size(vec_dot_type, ne10);
         const size_t nbw2 = nbw1*ne11;
         const size_t nbw3 = nbw2*ne12;

         assert(params->wsize >= ne13*nbw3);
         GGML_ASSERT(src1->type == GGML_TYPE_F32);

+#if 0
         for (int64_t i13 = 0; i13 < ne13; ++i13) {
             for (int64_t i12 = 0; i12 < ne12; ++i12) {
                 for (int64_t i11 = ith; i11 < ne11; i11 += nth) {
@@ -7506,6 +7506,20 @@ UseGgmlGemm1:;
                 }
             }
         }
+#else
+        for (int64_t i13 = 0; i13 < ne13; ++i13) {
+            for (int64_t i12 = 0; i12 < ne12; ++i12) {
+                for (int64_t i11 = 0; i11 < ne11; ++i11) {
+                    size_t bs = ggml_blck_size(vec_dot_type);
+                    int64_t ne10_block_start = (ith * ne10/bs) / nth;
+                    int64_t ne10_block_end   = ((ith + 1) * ne10/bs) / nth;
+                    from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + ne10_block_start*bs*nb10),
+                               (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1 + ne10_block_start*nbw0),
+                               (ne10_block_end - ne10_block_start) * bs);
+                }
+            }
+        }
+#endif
     }

     if (ith == 0) {
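
Quantization of src1 is now split by column rather than by row, so even a single row (ne11 == 1) is shared across all threads. The slice boundaries are snapped to the quantization block size bs, and the (ith * n) / nth arithmetic tiles the range without gaps or overlap. A self-contained sketch of the same split, with hypothetical names:

    #include <stdint.h>

    // Give thread ith of nth the quantization blocks [start, end) of a row
    // holding n_blocks blocks; consecutive threads cover the row exactly.
    static void thread_block_range(int64_t n_blocks, int ith, int nth,
                                   int64_t * start, int64_t * end) {
        *start = ( (int64_t) ith      * n_blocks) / nth;
        *end   = (((int64_t) ith + 1) * n_blocks) / nth;
    }

For example, ne10 = 4096 with bs = 32 gives 128 blocks; three threads get blocks [0, 42), [42, 85), [85, 128).
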
@@ -7593,7 +7607,6 @@ UseGgmlGemm2:;
         if ((nr0 % 2 != 0) || (ne11 % 2 != 0) || ((ir0_end - ir0_start) % 2 != 0) || ((ir1_end - ir1_start) % 2 != 0)) {
             num_rows_per_vec_dot = 1;
         }
-
         ggml_compute_forward_mul_mat_one_chunk(params, dst, src0->type, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end);

         if (nth >= nchunk0 * nchunk1) {
@@ -7606,6 +7619,84 @@ UseGgmlGemm2:;

 // ggml_compute_forward_mul_mat_id

+#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ids->ne[0]*ids->ne[1] + (i1)]
+
+struct mmid_row_mapping {
+    int32_t i1;
+    int32_t i2;
+};
+
+static void ggml_compute_forward_mul_mat_id_one_chunk(
+                struct ggml_tensor * dst,
+          const struct ggml_tensor * src0,
+          const struct ggml_tensor * src1,
+          const struct ggml_tensor * ids,
+          const int64_t cur_a,
+          const int64_t ir0_start,
+          const int64_t ir0_end,
+          const int64_t ir1_start,
+          const int64_t ir1_end,
+          const char * src0_cur,
+          const struct mmid_row_mapping * matrix_rows,
+          const size_t row_size,
+          const bool src1_cont,
+          const void * wdata) {
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    const enum ggml_type type = src0->type;
+
+    ggml_vec_dot_t const vec_dot      = type_traits_cpu[type].vec_dot;
+    enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
+
+    const int64_t blck_0 = 16;
+    const int64_t blck_1 = 16;
+
+    float tmp[16];
+
+    for (int64_t iir1 = ir1_start; iir1 < ir1_end; iir1 += blck_1) {
+        for (int64_t iir0 = ir0_start; iir0 < ir0_end; iir0 += blck_0) {
+            for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir1_end; ++ir1) {
+                const int64_t _i12 = ir1; // logical row index for this expert
+
+                struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, _i12);
+                const int id = row_mapping.i1; // selected expert index
+
+                const int64_t i11 = id % ne11;
+                const int64_t i12 = row_mapping.i2; // row index in src1
+
+                const int64_t i1 = id;  // selected expert index
+                const int64_t i2 = i12; // row
+
+                // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
+                //       if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
+                //       the original src1 data pointer, so we should index using the indices directly
+                // TODO: this is a bit of a hack, we should probably have a better way to handle this
+                const char * src1_col = (const char *) wdata +
+                    (src1_cont || src1->type != vec_dot_type
+                        ? (i11      + i12*ne11)*row_size
+                        : (i11*nb11 + i12*nb12));
+
+                float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2));
+
+                for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir0_end; ++ir0) {
+                    vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_cur + ir0*nb01, 0, src1_col, 0, 1);
+                }
+
+                memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir0_end) - iir0)*sizeof(float));
+            }
+        }
+    }
+}
+
+static void * incr_ptr_aligned(void ** p, size_t size, size_t align) {
+
+    void * ptr = *p;
+    ptr = (void *) GGML_PAD((uintptr_t) ptr, align);
+    *p = (void *) ((char *) ptr + size);
+    return ptr;
+}
+
 static void ggml_compute_forward_mul_mat_id(
         const struct ggml_compute_params * params,
               struct ggml_tensor * dst) {
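
incr_ptr_aligned is a small bump allocator over the preallocated wdata buffer: round the cursor up to the requested alignment, hand out size bytes, advance. A standalone sketch of the carving pattern, with a plain char array standing in for params->wdata and a local PAD macro standing in for GGML_PAD:

    #include <stdint.h>
    #include <stdio.h>

    #define PAD(x, n) (((x) + (n) - 1) & ~((uintptr_t) (n) - 1))  // n must be a power of two

    static void * incr_ptr_aligned(void ** p, size_t size, size_t align) {
        void * ptr = *p;
        ptr = (void *) PAD((uintptr_t) ptr, align);
        *p  = (void *) ((char *) ptr + size);
        return ptr;
    }

    int main(void) {
        char buffer[1024];
        void * cur = buffer;

        int64_t * counts   = incr_ptr_aligned(&cur, 4 * sizeof(int64_t), sizeof(int64_t));
        char    * counters = incr_ptr_aligned(&cur, 2 * 64, 64);  // two cache-line slots

        printf("used %zu bytes\n", (size_t) ((char *) cur - buffer));  // includes alignment padding
        (void) counts; (void) counters;
        return 0;
    }
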
@@ -7623,7 +7714,6 @@ static void ggml_compute_forward_mul_mat_id(

     const bool src1_cont = ggml_is_contiguous(src1);

-    ggml_vec_dot_t    const vec_dot      = type_traits_cpu[type].vec_dot;
     enum ggml_type    const vec_dot_type = type_traits_cpu[type].vec_dot_type;
     ggml_from_float_t const from_float   = type_traits_cpu[vec_dot_type].from_float;

@@ -7641,41 +7731,60 @@ static void ggml_compute_forward_mul_mat_id(
     const int n_ids = ids->ne[0]; // n_expert_used
     const int n_as  = ne02;       // n_expert

-    char * wdata_src1_end = (src1->type == vec_dot_type) ?
-            (char *) params->wdata :
-            (char *) params->wdata + GGML_PAD(ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));
+    void * wdata_cur = params->wdata;

-    struct mmid_row_mapping {
-        int32_t i1;
-        int32_t i2;
-    };
+    if (src1->type != vec_dot_type) {
+        incr_ptr_aligned(&wdata_cur, ggml_row_size(vec_dot_type, ggml_nelements(src1)), sizeof(int64_t));
+    }
+
+    int64_t * matrix_row_counts = // [n_as]
+        incr_ptr_aligned(&wdata_cur, n_as*sizeof(int64_t), sizeof(int64_t));
+
+    struct mmid_row_mapping * matrix_rows = // [n_as][ids->ne[0]*ids->ne[1]]
+        incr_ptr_aligned(&wdata_cur, n_as*ids->ne[0]*ids->ne[1]*sizeof(struct mmid_row_mapping), sizeof(int64_t));

-    int64_t * matrix_row_counts = (int64_t *) (wdata_src1_end); // [n_as]
-    struct mmid_row_mapping * matrix_rows = (struct mmid_row_mapping *)(matrix_row_counts + n_as); // [n_as][ne11]
+    char (*atomic_current_chunk)[CACHE_LINE_SIZE] = // [n_as]
+        incr_ptr_aligned(&wdata_cur, CACHE_LINE_SIZE * n_as, CACHE_LINE_SIZE);
+
+    GGML_ASSERT(params->wsize >= (size_t)((char *) wdata_cur - (char *) params->wdata));

     if (src1->type != vec_dot_type) {
         char * wdata = params->wdata;

+        const size_t nbw0 = ggml_type_size(vec_dot_type);
         const size_t nbw1 = ggml_row_size(vec_dot_type, ne10);
         const size_t nbw2 = nbw1*ne11;
         const size_t nbw3 = nbw2*ne12;

         assert(params->wsize >= ne13*nbw3);
         GGML_ASSERT(src1->type == GGML_TYPE_F32);

+#if 0
         for (int64_t i13 = 0; i13 < ne13; ++i13) {
-            for (int64_t i12 = 0; i12 < ne12; ++i12) {
-                for (int64_t i11 = ith; i11 < ne11; i11 += nth) {
+            for (int64_t i12 = ith; i12 < ne12; i12 += nth) {
+                for (int64_t i11 = 0; i11 < ne11; ++i11) {
                     from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11),
                                (void *)               (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1),
                                ne10);
                 }
             }
         }
+#else
+        for (int64_t i13 = 0; i13 < ne13; ++i13) {
+            for (int64_t i12 = 0; i12 < ne12; ++i12) {
+                for (int64_t i11 = 0; i11 < ne11; ++i11) {
+                    size_t bs = ggml_blck_size(vec_dot_type);
+                    int64_t ne10_block_start = (ith * ne10/bs) / nth;
+                    int64_t ne10_block_end   = ((ith + 1) * ne10/bs) / nth;
+                    from_float((float *)((char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11 + ne10_block_start*bs*nb10),
+                               (void *) (wdata + i13*nbw3 + i12*nbw2 + i11*nbw1 + ne10_block_start*nbw0),
+                               (ne10_block_end - ne10_block_start) * bs);
+                }
+            }
+        }
+#endif
     }

-#define MMID_MATRIX_ROW(row_id, i1) matrix_rows[(row_id)*ne12 + (i1)]
-
     if (ith == 0) {
         // initialize matrix_row_counts
         memset(matrix_row_counts, 0, n_as*sizeof(int64_t));
@@ -7693,94 +7802,79 @@ static void ggml_compute_forward_mul_mat_id(
         }
     }

+    // reset current_chunk
+    for (int cur_a = ith; cur_a < n_as; cur_a += nth) {
+        atomic_int * current_chunk_ctr = (atomic_int *)(atomic_current_chunk + cur_a);
+        *current_chunk_ctr = nth;
+    }
+
     ggml_barrier(params->threadpool);

-    // compute each matrix multiplication in sequence
     for (int cur_a = 0; cur_a < n_as; ++cur_a) {
         const int64_t cne1 = matrix_row_counts[cur_a];

         if (cne1 == 0) {
             continue;
         }

-        const char * src0_cur = (const char *) src0->data + cur_a*nb02;
-
-        const void * wdata    = (src1->type == vec_dot_type) ? src1->data : params->wdata;
+        const char * src0_cur = (const char *) src0->data + cur_a * nb02;
+        const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata;
         const size_t row_size = ggml_row_size(vec_dot_type, ne10);

-        const int64_t nr0 = ne01; // src0 rows
-        const int64_t nr1 = cne1; // src1 rows
-
-        // distribute the thread work across the inner or outer loop based on which one is larger
-
-        const int64_t nth0 = nr0 > nr1 ? nth : 1; // parallelize by src0 rows
-        const int64_t nth1 = nr0 > nr1 ? 1 : nth; // parallelize by src1 rows
-
-        const int64_t ith0 = ith % nth0;
-        const int64_t ith1 = ith / nth0;
-
-        const int64_t dr0 = (nr0 + nth0 - 1)/nth0;
-        const int64_t dr1 = (nr1 + nth1 - 1)/nth1;
-
-        const int64_t ir010 = dr0*ith0;
-        const int64_t ir011 = MIN(ir010 + dr0, nr0);
+        const int64_t nr0 = ne01;
+        const int64_t nr1 = cne1;

-        const int64_t ir110 = dr1*ith1;
-        const int64_t ir111 = MIN(ir110 + dr1, nr1);
-
-        // threads with no work simply yield (not sure if it helps)
-        //if (ir010 >= ir011 || ir110 >= ir111) {
-        //    sched_yield();
-        //    continue;
-        //}
+        int chunk_size = 16;
+        if (nr0 == 1 || nr1 == 1) {
+            chunk_size = 64;
+        }

-        // block-tiling attempt
-        const int64_t blck_0 = 16;
-        const int64_t blck_1 = 16;
+#if defined(__aarch64__)
+        // disable for ARM
+        const bool disable_chunking = true;
+#else
+        // disable for NUMA
+        const bool disable_chunking = ggml_is_numa();
+#endif // defined(__aarch64__)

-        // attempt to reduce false-sharing (does not seem to make a difference)
-        float tmp[16];
+        int64_t nchunk0 = (nr0 + chunk_size - 1) / chunk_size;
+        int64_t nchunk1 = (nr1 + chunk_size - 1) / chunk_size;

-        for (int64_t iir1 = ir110; iir1 < ir111; iir1 += blck_1) {
-            for (int64_t iir0 = ir010; iir0 < ir011; iir0 += blck_0) {
-                for (int64_t ir1 = iir1; ir1 < iir1 + blck_1 && ir1 < ir111; ++ir1) {
-                    const int64_t _i12 = ir1; // logical row index for this expert
+        if (nchunk0*nchunk1 < nth*4 || disable_chunking) {
+            nchunk0 = nr0 > nr1 ? nth : 1;
+            nchunk1 = nr0 > nr1 ? 1 : nth;
+        }

-                    struct mmid_row_mapping row_mapping = MMID_MATRIX_ROW(cur_a, _i12);
-                    const int id = row_mapping.i1; // selected expert index
+        const int64_t dr0 = (nr0 + nchunk0 - 1) / nchunk0;
+        const int64_t dr1 = (nr1 + nchunk1 - 1) / nchunk1;

-                    const int64_t i11 = id % ne11;
-                    const int64_t i12 = row_mapping.i2; // row index in src1
+        int current_chunk = ith;

-                    const int64_t i1 = id;  // selected expert index
-                    const int64_t i2 = i12; // row
+        atomic_int * current_chunk_ctr = (atomic_int *)(atomic_current_chunk + cur_a);

-                    // desc: when src1 is not a contiguous memory block we have to calculate the offset using the strides
-                    //       if it is, then we have either copied the data to params->wdata and made it contiguous or we are using
-                    //       the original src1 data pointer, so we should index using the indices directly
-                    // TODO: this is a bit of a hack, we should probably have a better way to handle this
-                    const char * src1_col = (const char *) wdata +
-                        (src1_cont || src1->type != vec_dot_type
-                        ? (i11      + i12*ne11)*row_size
-                        : (i11*nb11 + i12*nb12));
+        while (current_chunk < nchunk0 * nchunk1) {
+            const int64_t ith0 = current_chunk % nchunk0;
+            const int64_t ith1 = current_chunk / nchunk0;

-                    float * dst_col = (float *) ((char *) dst->data + (i1*nb1 + i2*nb2));
+            const int64_t ir0_start = dr0 * ith0;
+            const int64_t ir0_end = MIN(ir0_start + dr0, nr0);

-                    //for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
-                    //    vec_dot(ne00, &dst_col[ir0], src0_row + ir0*nb01, src1_col);
-                    //}
+            const int64_t ir1_start = dr1 * ith1;
+            const int64_t ir1_end = MIN(ir1_start + dr1, nr1);

-                    for (int64_t ir0 = iir0; ir0 < iir0 + blck_0 && ir0 < ir011; ++ir0) {
-                        vec_dot(ne00, &tmp[ir0 - iir0], 0, src0_cur + ir0*nb01, 0, src1_col, 0, 1);
-                    }
+            ggml_compute_forward_mul_mat_id_one_chunk(
+                dst, src0, src1, ids, cur_a,
+                ir0_start, ir0_end, ir1_start, ir1_end,
+                src0_cur, matrix_rows, row_size, src1_cont, wdata
+            );

-                    memcpy(&dst_col[iir0], tmp, (MIN(iir0 + blck_0, ir011) - iir0)*sizeof(float));
-                }
+            if (nth >= nchunk0*nchunk1) {
+                break;
             }
+
+            current_chunk = atomic_fetch_add_explicit(current_chunk_ctr, 1, memory_order_relaxed);
         }
     }
-
-#undef MMID_MATRIX_ROW
 }

 // ggml_compute_forward_out_prod
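
Each claimed chunk id is decoded row-major into a tile of the output grid: ith0 indexes the chunks along the src0 rows, ith1 the chunks along the src1 rows, and the last tile in each direction may be short. The decode in isolation, with hypothetical names:

    #include <stdint.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    // Decode a linear chunk id into the half-open row ranges it covers.
    static void chunk_to_ranges(int chunk,
                                int64_t nchunk0, int64_t dr0, int64_t nr0,
                                int64_t dr1, int64_t nr1,
                                int64_t * ir0_start, int64_t * ir0_end,
                                int64_t * ir1_start, int64_t * ir1_end) {
        const int64_t ith0 = chunk % nchunk0;     // position along src0 rows
        const int64_t ith1 = chunk / nchunk0;     // position along src1 rows

        *ir0_start = dr0 * ith0;
        *ir0_end   = MIN(*ir0_start + dr0, nr0);  // clamp the last, possibly short, tile
        *ir1_start = dr1 * ith1;
        *ir1_end   = MIN(*ir1_start + dr1, nr1);
    }
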
@@ -13713,14 +13807,19 @@ struct ggml_cplan ggml_graph_plan(
                     cur = 0;
                     const struct ggml_tensor * src0 = node->src[0];
                     const struct ggml_tensor * src1 = node->src[1];
+                    const struct ggml_tensor * ids = node->src[2];
                     const enum ggml_type vec_dot_type = type_traits_cpu[src0->type].vec_dot_type;
+                    const int n_as = src0->ne[2];
+                    // src1
                     if (src1->type != vec_dot_type) {
-                        cur += ggml_row_size(vec_dot_type, ggml_nelements(src1));
+                        cur += ggml_row_size(vec_dot_type, ggml_nelements(src1)) + sizeof(int64_t);
                     }
-                    const int n_as = src0->ne[2];
-                    cur += GGML_PAD(cur, sizeof(int64_t)); // align
-                    cur += n_as * sizeof(int64_t);               // matrix_row_counts
-                    cur += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows
+                    // matrix_row_counts
+                    cur += n_as * sizeof(int64_t) + sizeof(int64_t);
+                    // matrix_rows
+                    cur += n_as*ids->ne[0]*ids->ne[1]*sizeof(struct mmid_row_mapping) + sizeof(int64_t);
+                    // atomic_current_chunk
+                    cur += CACHE_LINE_SIZE*n_as + CACHE_LINE_SIZE;
                 } break;
             case GGML_OP_OUT_PROD:
                 {
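
The planner mirrors the allocation order used in ggml_compute_forward_mul_mat_id: the converted src1 copy, the per-expert row counts, the row mappings, and one cache line of chunk counter per expert, each with one alignment unit of slack. A worked example of the bookkeeping portion under assumed sizes (4 experts, ids of shape 2 x 8, 64-byte cache line, 8-byte mmid_row_mapping):

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        const size_t n_as = 4, n_ids = 2, n_tokens = 8;  // assumed shapes
        const size_t line = 64, mapping = 8;             // assumed sizes

        size_t cur = 0;
        cur += n_as * sizeof(int64_t) + sizeof(int64_t);            // matrix_row_counts:    40
        cur += n_as * n_ids * n_tokens * mapping + sizeof(int64_t); // matrix_rows:         520
        cur += line * n_as + line;                                  // atomic_current_chunk: 320

        printf("bookkeeping workspace: %zu bytes\n", cur);          // 880, before the src1 copy
        return 0;
    }
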
