
I have a 5000x500 matrix and I want to sort each row separately with CUDA. I can use ArrayFire, but its implementation is just a for loop over thrust::sort, which should not be efficient.

https://github.com/arrayfire/arrayfire/blob/devel/src/backend/cuda/kernel/sort.hpp

for(dim_type w = 0; w < val.dims[3]; w++) {
    dim_type valW = w * val.strides[3];
    for(dim_type z = 0; z < val.dims[2]; z++) {
        dim_type valWZ = valW + z * val.strides[2];
        for(dim_type y = 0; y < val.dims[1]; y++) {
            dim_type valOffset = valWZ + y * val.strides[1];
            if(isAscending) {
                thrust::sort(val_ptr + valOffset, val_ptr + valOffset + val.dims[0]);
            } else {
                thrust::sort(val_ptr + valOffset, val_ptr + valOffset + val.dims[0],
                             thrust::greater<T>());
            }
        }
    }
}

Is there a way to fuse operations in thrust so that the sorts run in parallel? Indeed, what I am looking for is a generic way to fuse for-loop iterations into a single parallel operation.

asked Jan 26, 2015 at 12:25
  • Could you adapt the approach in How to normalize matrix columns in CUDA with max performance? Commented Jan 26, 2015 at 22:07
  • I would try nesting a call to thrust::sort inside a call to thrust::for_each. Commented Jan 27, 2015 at 3:32
  • I am trying to understand both approaches... Thank you. Commented Jan 28, 2015 at 15:00
  • Ok! I give up. Going to do it the easy way. Commented Jan 28, 2015 at 17:42

1 Answer


I can think of 2 possibilities, one of which has already been suggested by @JaredHoberock. I don't know of a general methodology to fuse for-loop iterations in thrust, but the second method is the more general approach. My guess is that the first method would be the faster of the two in this case.

  1. Use a vectorized sort. If the regions to be sorted by your nested for-loops are non-overlapping, you can do a vectorized sort using 2 back-to-back stable-sort operations as discussed here.

  2. Thrust v1.8 (available with CUDA 7 RC, or via direct download from the thrust github repository) includes support for nesting thrust algorithms, by including a thrust algorithm call within a custom functor passed to another thrust algorithm. If you use thrust::for_each to select the individual sorts you need to perform, you can run all of those sorts with a single thrust algorithm call, by including the thrust::sort operation in the functor you pass to thrust::for_each.

Here's a fully worked comparison between 3 methods:

  1. the original sort-in-a-loop method
  2. vectorized/batch sort
  3. nested sort

In each case, we're sorting the same 16000 sets of 1000 ints each.

$ cat t617.cu
#include <thrust/device_vector.h>
#include <thrust/device_ptr.h>
#include <thrust/host_vector.h>
#include <thrust/sort.h>
#include <thrust/execution_policy.h>
#include <thrust/generate.h>
#include <thrust/equal.h>
#include <thrust/sequence.h>
#include <thrust/for_each.h>
#include <iostream>
#include <stdlib.h>

#define NSORTS 16000
#define DSIZE 1000

int my_mod_start = 0;
int my_mod(){
    return (my_mod_start++)/DSIZE;
}

bool validate(thrust::device_vector<int> &d1, thrust::device_vector<int> &d2){
    return thrust::equal(d1.begin(), d1.end(), d2.begin());
}

struct sort_functor
{
    thrust::device_ptr<int> data;
    int dsize;
    __host__ __device__
    void operator()(int start_idx)
    {
        thrust::sort(thrust::device, data+(dsize*start_idx), data+(dsize*(start_idx+1)));
    }
};

#include <time.h>
#include <sys/time.h>
#define USECPSEC 1000000ULL

unsigned long long dtime_usec(unsigned long long start){
    timeval tv;
    gettimeofday(&tv, 0);
    return ((tv.tv_sec*USECPSEC)+tv.tv_usec)-start;
}

int main(){
    cudaDeviceSetLimit(cudaLimitMallocHeapSize, (16*DSIZE*NSORTS));
    thrust::host_vector<int> h_data(DSIZE*NSORTS);
    thrust::generate(h_data.begin(), h_data.end(), rand);
    thrust::device_vector<int> d_data = h_data;

    // first time a loop
    thrust::device_vector<int> d_result1 = d_data;
    thrust::device_ptr<int> r1ptr = thrust::device_pointer_cast<int>(d_result1.data());
    unsigned long long mytime = dtime_usec(0);
    for (int i = 0; i < NSORTS; i++)
        thrust::sort(r1ptr+(i*DSIZE), r1ptr+((i+1)*DSIZE));
    cudaDeviceSynchronize();
    mytime = dtime_usec(mytime);
    std::cout << "loop time: " << mytime/(float)USECPSEC << "s" << std::endl;

    // vectorized sort
    thrust::device_vector<int> d_result2 = d_data;
    thrust::host_vector<int> h_segments(DSIZE*NSORTS);
    thrust::generate(h_segments.begin(), h_segments.end(), my_mod);
    thrust::device_vector<int> d_segments = h_segments;
    mytime = dtime_usec(0);
    thrust::stable_sort_by_key(d_result2.begin(), d_result2.end(), d_segments.begin());
    thrust::stable_sort_by_key(d_segments.begin(), d_segments.end(), d_result2.begin());
    cudaDeviceSynchronize();
    mytime = dtime_usec(mytime);
    std::cout << "vectorized time: " << mytime/(float)USECPSEC << "s" << std::endl;
    if (!validate(d_result1, d_result2)) std::cout << "mismatch 1!" << std::endl;

    // nested sort
    thrust::device_vector<int> d_result3 = d_data;
    sort_functor f = {d_result3.data(), DSIZE};
    thrust::device_vector<int> idxs(NSORTS);
    thrust::sequence(idxs.begin(), idxs.end());
    mytime = dtime_usec(0);
    thrust::for_each(idxs.begin(), idxs.end(), f);
    cudaDeviceSynchronize();
    mytime = dtime_usec(mytime);
    std::cout << "nested time: " << mytime/(float)USECPSEC << "s" << std::endl;
    if (!validate(d_result1, d_result3)) std::cout << "mismatch 2!" << std::endl;
    return 0;
}
$ nvcc -arch=sm_20 -std=c++11 -o t617 t617.cu
$ ./t617
loop time: 8.51577s
vectorized time: 0.068802s
nested time: 0.567959s
$

Notes:

  1. These results will vary significantly from GPU to GPU.
  2. The "nested" time/method may vary significantly on a GPU that can support dynamic parallelism, as this will affect how thrust runs the nested sort functions. To test with dynamic parallelism, change the compile switches from -arch=sm_20 to -arch=sm_35 -rdc=true -lcudadevrt
  3. This code requires CUDA 7 RC. I used Fedora 20.
  4. The nested sort method will also allocate from the device side, therefore we must substantially increase the device allocation heap using cudaDeviceSetLimit.
  5. If you are using dynamic parallelism, and depending on the type of GPU you are running on, the amount of memory reserved with cudaDeviceSetLimit may need to be increased, perhaps by an additional factor of 8.
answered Jan 31, 2015 at 18:00

2 Comments

Thank you very much! Not only have you answered my question, you have also shown me how to do a lot of things correctly. I should add that I saw this: stackoverflow.com/questions/9037906/… and realized that the normal thrust sort would be faster than anything else (besides a hand-coded parallel sorting approach). My data was unsigned and the 16 most significant bits were all zero, so I just put the row number in the 16 MSBs and sorted with thrust::sort.
Hi, do you know how to do an argsort along the rows of a matrix? That is, to find the ordering of the elements in each row rather than the sorted matrix itself.
