Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

[XPU] update xhpc to support VL model pretraining and inference #75870

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
yongqiangma merged 4 commits into PaddlePaddle:release/3.2 from cqulilujia:xhpc
Oct 17, 2025
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
[XPU] support index_elementwise_get kernel (#75486)
  • Loading branch information
cqulilujia committed Oct 15, 2025
commit 50a67078e97a88e222767d96241e171eb2bc8efa
2 changes: 1 addition & 1 deletion cmake/external/xpu.cmake
View file Open in desktop
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ add_compile_definitions(XPUAPI_NOT_INCLUDE_DEPRECATED)
if(NOT DEFINED XPU_XHPC_BASE_DATE)
set(XPU_XHPC_BASE_DATE "dev/20250922")
endif()
set(XPU_XCCL_BASE_VERSION "3.0.3.1") # For XRE5
set(XPU_XCCL_BASE_VERSION "3.0.3.3") # For XRE5
if(NOT DEFINED XPU_XFT_BASE_VERSION)
set(XPU_XFT_BASE_VERSION "20250507/xpu3")
endif()
Expand Down
33 changes: 33 additions & 0 deletions paddle/phi/backends/xpu/xpu3_op_list.cc
View file Open in desktop
Original file line number Diff line number Diff line change
Expand Up @@ -823,6 +823,39 @@ XPUOpMap& get_kl3_ops() {
XPUKernelSet({phi::DataType::FLOAT32,
phi::DataType::INT32,
phi::DataType::INT64})},
{"index_elementwise_get",
XPUKernelSet({phi::DataType::BOOL,
phi::DataType::INT32,
phi::DataType::INT8,
phi::DataType::UINT8,
phi::DataType::INT64,
phi::DataType::FLOAT32,
phi::DataType::FLOAT64,
phi::DataType::FLOAT16,
phi::DataType::BFLOAT16,
phi::DataType::FLOAT64})},
{"index_elementwise_put",
XPUKernelSet({phi::DataType::BOOL,
phi::DataType::INT32,
phi::DataType::INT8,
phi::DataType::UINT8,
phi::DataType::INT64,
phi::DataType::FLOAT32,
phi::DataType::FLOAT64,
phi::DataType::FLOAT16,
phi::DataType::BFLOAT16,
phi::DataType::FLOAT64})},
{"index_elementwise_put_with_tensor",
XPUKernelSet({phi::DataType::BOOL,
phi::DataType::INT32,
phi::DataType::INT8,
phi::DataType::UINT8,
phi::DataType::INT64,
phi::DataType::FLOAT32,
phi::DataType::FLOAT64,
phi::DataType::FLOAT16,
phi::DataType::BFLOAT16,
phi::DataType::FLOAT64})},
{"index_put",
XPUKernelSet({phi::DataType::FLOAT32,
phi::DataType::INT32,
Expand Down
10 changes: 5 additions & 5 deletions paddle/phi/kernels/cpu/index_elementwise_get_kernel.cc
View file Open in desktop
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,11 @@ namespace phi {
template <typename T, typename IndexT = int>
void CPUIndexElementwiseGetKernel(const phi::CPUContext& dev_ctx,
const DenseTensor& input,
const std::vector<const DenseTensor*> index,
const std::vector<const DenseTensor*>& index,
const std::vector<int64_t>& input_dims,
const std::vector<int64_t>& input_strides,
const std::vector<int64_t>& index_dims,
const std::vector<int64_t>& index_stride,
const std::vector<int64_t>& index_strides,
const int64_t slice_offset,
DenseTensor* output) {
int64_t numel = 0;
Expand All @@ -42,7 +42,7 @@ void CPUIndexElementwiseGetKernel(const phi::CPUContext& dev_ctx,
auto strides = std::array<int64_t, DDim::kMaxRank>{};
for (int64_t i = 0; i < num_indices; i++) {
sizes[i] = index_dims[i];
strides[i] = index_stride[i];
strides[i] = index_strides[i];
}
std::array<int64_t*, 3> strides_array;
std::vector<int64_t> desired_shape;
Expand Down Expand Up @@ -97,7 +97,7 @@ void IndexElementwiseGetKernel(const Context& dev_ctx,
const std::vector<int64_t>& input_dims,
const std::vector<int64_t>& input_strides,
const std::vector<int64_t>& index_dims,
const std::vector<int64_t>& index_stride,
const std::vector<int64_t>& index_strides,
const int64_t slice_offset,
const bool accumulate,
const bool is_combined,
Expand All @@ -124,7 +124,7 @@ void IndexElementwiseGetKernel(const Context& dev_ctx,
input_dims,
input_strides,
index_dims,
index_stride,
index_strides,
slice_offset,
out);
}
Expand Down
7 changes: 5 additions & 2 deletions paddle/phi/kernels/cpu/index_elementwise_put_grad_kernel.cc
View file Open in desktop
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,11 @@ void CPUIndexElementwisePutGradKernel(
auto offset_calc =
funcs::CPUmake_offset_calculator_put<3>(desired_shape, strides_array);
const int64_t N = numel;
PADDLE_ENFORCE(N >= 0 && N <= std::numeric_limits<int32_t>::max(),
"N >= 0 && N <= std::numeric_limits<int32_t>::max()");
PADDLE_ENFORCE_EQ(true,
(N >= 0 && N <= std::numeric_limits<int32_t>::max()),
common::errors::PreconditionNotMet(
"the value of N should be in [0, "
"std::numeric_limits<int32_t>::max()]"));
using dtype = funcs::OpaqueType<sizeof(T)>;
if (!value_grad) {
char* out_ptr = reinterpret_cast<char*>(x_grad->data<T>());
Expand Down
20 changes: 13 additions & 7 deletions paddle/phi/kernels/cpu/index_elementwise_put_kernel.cc
View file Open in desktop
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,11 @@ void CPUIndexElementwisePutWithTensorKernel(
auto offset_calc =
funcs::CPUmake_offset_calculator_put<3>(desired_shape, strides_array);
const int64_t N = numel;
PADDLE_ENFORCE(N >= 0 && N <= std::numeric_limits<int32_t>::max(),
"N >= 0 && N <= std::numeric_limits<int32_t>::max()");
PADDLE_ENFORCE_EQ(true,
(N >= 0 && N <= std::numeric_limits<int32_t>::max()),
common::errors::PreconditionNotMet(
"the value of N should be in [0, "
"std::numeric_limits<int32_t>::max()]"));
using dtype = funcs::OpaqueType<sizeof(T)>;
const char* in_ptr = reinterpret_cast<const char*>(value.data<T>());
char* out_ptr = reinterpret_cast<char*>(output_);
Expand Down Expand Up @@ -150,14 +153,17 @@ void CPUIndexElementwisePutKernel(const phi::CPUContext& dev_ctx,
auto offset_calc =
funcs::CPUmake_offset_calculator_put<3>(desired_shape, strides_array);
const int64_t N = numel;
PADDLE_ENFORCE(N >= 0 && N <= std::numeric_limits<int32_t>::max(),
"N >= 0 && N <= std::numeric_limits<int32_t>::max()");
char* out_ptr = reinterpret_cast<char*>(output_);
PADDLE_ENFORCE_EQ(true,
(N >= 0 && N <= std::numeric_limits<int32_t>::max()),
common::errors::PreconditionNotMet(
"the value of N should be in [0, "
"std::numeric_limits<int32_t>::max()]"));
char* out_ptr = reinterpret_cast<char*>(output_) + slice_offset;
if (index.size() == 1 && index[0]->dtype() == phi::DataType::BOOL) {
const bool* mask_data = index[0]->data<bool>();
for (int64_t idx = 0; idx < N; idx++) {
const auto offsets = offset_calc.cpu_get(idx);
char* const out_data = out_ptr + offsets[0] + slice_offset;
char* const out_data = out_ptr + offsets[0];
if (mask_data[idx]) {
*reinterpret_cast<T*>(out_data) = value_T;
}
Expand All @@ -166,7 +172,7 @@ void CPUIndexElementwisePutKernel(const phi::CPUContext& dev_ctx,
auto index_ptrs = funcs::GetIndexDataPtrs<IndexT>(index);
for (int64_t idx = 0; idx < N; idx++) {
const auto offsets = offset_calc.cpu_get(idx);
char* const out_data = out_ptr + offsets[0] + slice_offset;
char* const out_data = out_ptr + offsets[0];
int64_t offset = 0;
for (int64_t i = 0; i < num_indices; i++) {
int64_t index = *reinterpret_cast<int64_t*>(index_ptrs[i] + offsets[2]);
Expand Down
2 changes: 1 addition & 1 deletion paddle/phi/kernels/funcs/index_elementwise_utils.h
View file Open in desktop
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ struct alignas(N) OpaqueType {

template <typename IndexT>
std::array<char*, DDim::kMaxRank> GetIndexDataPtrs(
const std::vector<const DenseTensor*> index) {
const std::vector<const DenseTensor*>& index) {
std::array<char*, DDim::kMaxRank> index_ptrs{};

PADDLE_ENFORCE_LE(index.size(),
Expand Down
2 changes: 1 addition & 1 deletion paddle/phi/kernels/gpu/index_elementwise_get_kernel.cu
View file Open in desktop
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ namespace phi {
template <typename T, typename IndexT = int>
void GPUIndexElementwiseGetKernel(const phi::GPUContext& dev_ctx,
const DenseTensor& input,
const std::vector<const DenseTensor*> index,
const std::vector<const DenseTensor*>& index,
const std::vector<int64_t>& input_dims,
const std::vector<int64_t>& input_strides,
const std::vector<int64_t>& index_dims,
Expand Down
7 changes: 5 additions & 2 deletions paddle/phi/kernels/gpu/index_elementwise_put_grad_kernel.cu
View file Open in desktop
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,11 @@ void GPUIndexElementwisePutGradKernel(
auto offset_calc =
funcs::make_offset_calculator_put<3>(desired_shape, strides_array);
const int64_t N = numel;
PADDLE_ENFORCE(N >= 0 && N <= std::numeric_limits<int32_t>::max(),
"N >= 0 && N <= std::numeric_limits<int32_t>::max()");
PADDLE_ENFORCE_EQ(true,
(N >= 0 && N <= std::numeric_limits<int32_t>::max()),
common::errors::PreconditionNotMet(
"the value of N should be in [0, "
"std::numeric_limits<int32_t>::max()]"));
constexpr int nt = 128;
constexpr int vt = 4;
const dim3 block(nt);
Expand Down
14 changes: 10 additions & 4 deletions paddle/phi/kernels/gpu/index_elementwise_put_kernel.cu
View file Open in desktop
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,11 @@ void GPUIndexElementwisePutKernel(const phi::GPUContext& dev_ctx,
funcs::make_offset_calculator_put<3>(desired_shape, strides_array);

const int64_t N = numel;
PADDLE_ENFORCE(N >= 0 && N <= std::numeric_limits<int32_t>::max(),
"N >= 0 && N <= std::numeric_limits<int32_t>::max()");
PADDLE_ENFORCE_EQ(true,
(N >= 0 && N <= std::numeric_limits<int32_t>::max()),
common::errors::PreconditionNotMet(
"the value of N should be in [0, "
"std::numeric_limits<int32_t>::max()]"));
constexpr int nt = 128;
constexpr int vt = 4;
const dim3 block(nt);
Expand Down Expand Up @@ -159,8 +162,11 @@ void GPUIndexElementwisePutWithTensorKernel(
funcs::make_offset_calculator_put<3>(desired_shape, strides_array);

const int64_t N = numel;
PADDLE_ENFORCE(N >= 0 && N <= std::numeric_limits<int32_t>::max(),
"N >= 0 && N <= std::numeric_limits<int32_t>::max()");
PADDLE_ENFORCE_EQ(true,
(N >= 0 && N <= std::numeric_limits<int32_t>::max()),
common::errors::PreconditionNotMet(
"the value of N should be in [0, "
"std::numeric_limits<int32_t>::max()]"));
constexpr int nt = 128;
constexpr int vt = 4;
const dim3 block(nt);
Expand Down
7 changes: 5 additions & 2 deletions paddle/phi/kernels/stride/indexing_kernel.cu
View file Open in desktop
Original file line number Diff line number Diff line change
Expand Up @@ -158,8 +158,11 @@ void LaunchIndexPutKernel_V2(const Context& dev_ctx,
funcs::OffsetCalculator offset_calc = funcs::make_offset_calculator<3>(iter);

const int64_t N = iter.numel();
PADDLE_ENFORCE(N >= 0 && N <= std::numeric_limits<int32_t>::max(),
"N >= 0 && N <= std::numeric_limits<int32_t>::max()");
PADDLE_ENFORCE_EQ(true,
(N >= 0 && N <= std::numeric_limits<int32_t>::max()),
common::errors::PreconditionNotMet(
"the value of N should be in [0, "
"std::numeric_limits<int32_t>::max()]"));
constexpr int nt = 128;
constexpr int vt = 4;
const dim3 block(nt);
Expand Down
171 changes: 171 additions & 0 deletions paddle/phi/kernels/xpu/index_elementwise_get_kernel.cc
View file Open in desktop
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/phi/kernels/index_elementwise_get_kernel.h"

#include "paddle/phi/backends/xpu/xpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/index_elementwise.h"
#include "paddle/phi/kernels/funcs/stride_utils.h"

namespace phi {
// Gather kernel body for XPU: reads elements of `input` addressed by the
// `index` tensors and writes them densely into `output`, delegating the
// actual copy to the XPU runtime primitive `xpu::index_elementwise_tensor`.
//
// Parameters:
//   input         - source tensor to gather from.
//   index         - one index tensor per indexed dimension, dtype IndexT.
//   input_dims    - dims describing the input view to index into.
//   input_strides - strides matching input_dims.
//   index_dims    - dims of the index tensors.
//   index_strides - strides of the index tensors.
//   slice_offset  - byte offset applied to input's data pointer (it is added
//                   to a char*), selecting a slice of the allocation.
//   output        - gathered result; allocated here via Alloc<T>.
template <typename T, typename Context, typename IndexT = int>
void XPUIndexElementwiseGetKernel(const Context& dev_ctx,
                                  const DenseTensor& input,
                                  const std::vector<const DenseTensor*>& index,
                                  const std::vector<int64_t>& input_dims,
                                  const std::vector<int64_t>& input_strides,
                                  const std::vector<int64_t>& index_dims,
                                  const std::vector<int64_t>& index_strides,
                                  const int64_t slice_offset,
                                  DenseTensor* output) {
  int64_t numel = 0;
  int64_t num_indices = 0;
  std::vector<int64_t> shape_tmp;
  std::vector<int64_t> stride_tmp;
  // Derive the number of index dimensions and a canonical shape/stride pair
  // from index_dims.
  funcs::cal_shape_stride(index_dims, &num_indices, &shape_tmp, &stride_tmp);

  // Copy the per-index-dim sizes/strides into fixed-capacity arrays
  // (DDim::kMaxRank bounds num_indices).
  auto sizes = std::array<int64_t, DDim::kMaxRank>{};
  auto strides = std::array<int64_t, DDim::kMaxRank>{};
  for (int64_t i = 0; i < num_indices; i++) {
    sizes[i] = index_dims[i];
    strides[i] = index_strides[i];
  }
  std::array<int64_t*, 3> strides_array;
  std::vector<int64_t> desired_shape;
  std::array<std::vector<int64_t>, 3> strides_vec;
  // Collapse output/input/index strides into a 3-operand iteration plan
  // (desired_shape + strides_array); same helper family the CPU/GPU
  // kernels use. The empty vectors stand in for the unused second operand.
  funcs::IndexGetStride<3>(input_dims,
                           input_strides,
                           phi::SizeOf(input.dtype()),
                           std::vector<int64_t>(),
                           std::vector<int64_t>(),
                           phi::SizeOf(input.dtype()),
                           shape_tmp,
                           stride_tmp,
                           phi::SizeOf(index[0]->dtype()),
                           &desired_shape,
                           &strides_array,
                           &numel,
                           strides_vec);
  // The element count handed to the XPU primitive must fit in 32 bits.
  const int64_t N = output->numel();
  PADDLE_ENFORCE_GE(
      N, 0, common::errors::InvalidArgument("Output numel must >= 0"));
  PADDLE_ENFORCE_LE(
      N,
      std::numeric_limits<int32_t>::max(),
      common::errors::InvalidArgument("Output numel must <= INT32_MAX"));

  dev_ctx.template Alloc<T>(output);
  using XPUType = typename XPUTypeTrait<T>::Type;
  using XPUTypeIndexT = typename XPUTypeTrait<IndexT>::Type;

  // passed vector params for XPU
  std::vector<const XPUTypeIndexT*> index_ptrs_vec;
  std::vector<int64_t> index_numel_vec;
  for (int i = 0; i < num_indices; i++) {
    // since XPU WRAPPER_CHECK_PTR only supports original GM ptrs, so we pass
    // the IndexT* type ptrs, which is different from the CPU/GPU's char* ptr.
    index_ptrs_vec.push_back(
        reinterpret_cast<const XPUTypeIndexT*>(index[i]->data<IndexT>()));
    // index_numel_vec is for the length of WRAPPER_CHECK_PTR
    index_numel_vec.push_back(index[i]->numel());
  }
  // Trim the fixed-capacity arrays down to the first num_indices entries.
  std::vector<int64_t> sizes_vec =
      std::vector<int64_t>(sizes.begin(), sizes.begin() + num_indices);
  std::vector<int64_t> orig_strides_vec =
      std::vector<int64_t>(strides.begin(), strides.begin() + num_indices);
  std::vector<std::vector<int64_t>> strides_vec_vec =
      std::vector<std::vector<int64_t>>(strides_vec.begin(), strides_vec.end());

  // slice_offset is a byte offset, hence the char* arithmetic.
  const char* in_ptr =
      reinterpret_cast<const char*>(input.data<T>()) + slice_offset;
  char* out_ptr = reinterpret_cast<char*>(output->data<T>());

  // for checkptr and checksum in XPU: usable bytes past each tensor's offset.
  int64_t data_size_in = input.Holder()->size() - input.meta().offset;
  int64_t data_size_out = output->Holder()->size() - output->meta().offset;

  bool is_get = true;
  int r = xpu::index_elementwise_tensor<XPUType, XPUTypeIndexT>(
      dev_ctx.x_context(),
      reinterpret_cast<const XPUType*>(in_ptr),  // XPU ptr
      reinterpret_cast<XPUType*>(out_ptr),       // XPU ptr
      index_ptrs_vec,                            // vec of XPU ptrs
      index_numel_vec,                           // CPU vec
      desired_shape,                             // CPU vec
      sizes_vec,                                 // CPU vec
      orig_strides_vec,                          // CPU vec
      strides_vec_vec,                           // CPU vec
      N,                                         // int64_t
      data_size_in,                              // int64_t
      data_size_out,                             // int64_t
      is_get);  // true for get, false for put
  PADDLE_ENFORCE_XDNN_SUCCESS(r, "index_elementwise_tensor_get");
}

// Public entry point for the XPU index_elementwise_get kernel: validates the
// index dtype, shapes and allocates `out`, then hands off to the XPU-specific
// gather body above. `accumulate` / `is_combined` are accepted for signature
// compatibility with the other backends and are not used on this path.
template <typename T, typename Context>
void IndexElementwiseGetKernel(const Context& dev_ctx,
                               const DenseTensor& x,
                               const std::vector<const DenseTensor*>& index,
                               const std::vector<int64_t>& input_dims,
                               const std::vector<int64_t>& input_strides,
                               const std::vector<int64_t>& index_dims,
                               const std::vector<int64_t>& index_strides,
                               const int64_t slice_offset,
                               const bool accumulate,
                               const bool is_combined,
                               DenseTensor* out) {
  // Only int64 index tensors are supported by this path.
  const auto& idx_dtype = index[0]->dtype();
  PADDLE_ENFORCE_EQ(idx_dtype == phi::DataType::INT64,
                    true,
                    common::errors::InvalidArgument(
                        "Index holds the wrong type, it holds [%s], but "
                        "desires to be [%s].",
                        idx_dtype,
                        phi::DataType::INT64));

  // Non-scalar outputs are reshaped to the (possibly sliced) input dims.
  if (out->dims().size() > 0) {
    out->Resize(phi::make_ddim(std::vector<int64_t>(input_dims)));
  }
  dev_ctx.template Alloc<T>(out);
  // Nothing to gather for an empty output.
  if (out->numel() == 0) {
    return;
  }

  XPUIndexElementwiseGetKernel<T, Context, int64_t>(dev_ctx,
                                                    x,
                                                    index,
                                                    input_dims,
                                                    input_strides,
                                                    index_dims,
                                                    index_strides,
                                                    slice_offset,
                                                    out);
}

} // namespace phi

// Register the XPU index_elementwise_get kernel for all supported dtypes.
// NOTE(review): int16_t is registered here, but the xpu3_op_list.cc entry
// for "index_elementwise_get" in this PR lists phi::DataType::FLOAT64 twice
// and contains no INT16 — the duplicated FLOAT64 looks like it was meant to
// be INT16; confirm the op list matches this registration.
PD_REGISTER_KERNEL(index_elementwise_get,
                   XPU,
                   ALL_LAYOUT,
                   phi::IndexElementwiseGetKernel,
                   bool,
                   float,
                   double,
                   int,
                   int8_t,
                   int64_t,
                   int16_t,
                   uint8_t,
                   phi::float16,
                   phi::bfloat16) {}
Loading

AltStyle によって変換されたページ (->オリジナル) /