40
\$\begingroup\$

I've written a program to compare different sorting algorithms with the array size being 10, 1,000, 100,000, 1,000,000 and 10,000,000. I, of course, expect Insertion to win out on 10 and merge heap and quick to really perform at the upper levels.

However, below are the times I'm getting. (Note that quick sort is growing much faster than heap and merge even though they are all \$\theta(n \log n)\$.)

Things I've considered:

  • Each algorithm uses the same seed to initialize each array, so the numbers are the same.
  • I am only timing the algorithm and nothing extra.
  • My professor approves of the code and doesn't know what's wrong, but maybe we're both missing something.
  • I moved the program from my flash drive to the desktop to test possible data transfer problems.
  • The algorithm (except for test 2) has only run at night with nothing else going.
Key: Hours:Minutes:Seconds:Milliseconds:Microseconds

Test 3

n Insertion Merge Heap Quick 
10 0:0:0:0:3 0:0:0:0:28 0:0:0:0:35 0:0:0:0:43 
1000 0:0:0:11:470 0:0:0:2:358 0:0:0:7:787 0:0:0:3:596 
100000 0:0:1:911:865 0:0:0:51:506 0:0:0:24:257 0:0:0:55:519 
1000000 0:3:59:769:105 0:0:0:351:129 0:0:0:238:878 0:0:0:916:885 
10000000 8:11:44:552:820 0:0:3:521:108 0:0:3:560:178 0:1:13:709:830

Test 2

n Insertion Merge Heap Quick 
10 0:0:0:0:5 0:0:0:0:49 0:0:0:0:37 0:0:0:0:50 
1000 0:0:0:15:473 0:0:0:2:893 0:0:0:8:402 0:0:0:5:230 
100000 0:0:2:518:566 0:0:0:57:845 0:0:0:32:917 0:0:0:71:243 
1000000 0:5:38:538:795 0:0:0:460:796 0:0:0:312:66 0:0:1:398:508 
10000000 11:48:6:630:.&checktime(6390,0,3,':'):690:329 0:0:3:518:281 0:1:18:180:11

Test 1

 n Insertion Merge Heap Quick
 10 2676ns 19626ns 26316ns 33454ns
 1000 11504040ns 2298935ns 6835250ns 3741456ns
 100000 1849274815ns 47654052ns 23620952ns 52819295ns
 1000000 0:3:58ns 0:0:0ns 0:0:0ns 0:0:0ns
 10000000 8:10:25ns 0:0:3ns 0:0:3ns 0:1:15ns

Here's my quick sort implementation (35 lines):

public static long quick(int[] list) {
 long startTime = System.nanoTime();
 quickSort(list, 0, list.length - 1);
 long endTime = System.nanoTime();
 return endTime - startTime;
}
public static void quickSort(int[] A, int p, int r) {
 if(p < r) {
 int q = randomizedPartition(A, p, r);
 quickSort(A, p, q-1);
 quickSort(A, q+1, r);
 }
}
public static int randomizedPartition(int[] A, int p, int r) {
 Random random = new Random();
 int i = random.nextInt((r-p)+1)+p;
 swap(A,r,i);
 return partition(A,p,r);
}
public static int partition(int[] A, int p, int r) {
 int x = A[r];
 int i = p-1;
 for(int j = p; j < r; j++) {
 if(A[j] <= x) {
 i++;
 swap(A, i, j);
 }
 }
 swap(A, i+1, r);
 return i+1;
}

And if needed (267 lines) here's my entire code:

import java.util.Random;
import java.util.concurrent.TimeUnit;
import java.io.*;
public class algComp {
public static void main(String[] args) {
 driver(10); // Sort array of length 10
 driver(1_000); // Sort array of length 1000
 driver(100_000);
 /* WARNING: Running program with the below values takes a lot of time!! */
 driver(1_000_000);
 //driver(10_000_000);
 /* You are now leaving the danger zone. */
 System.out.println("-----------------------------------------------");
 content = String.format(content + "\nKey: Hours:Minutes:Seconds:Milliseconds:Microseconds");
 printToFile(); // Prints data to times.txt
}
public static void driver(int n) {
 // Insertion sort
 int[] list = data(n);
 if(n == 10) {
 System.out.format("%10s","Unsorted: ");
 printList(list);
 }
 long iTime = insertion(list);
 if(n == 10) {
 System.out.format("%10s","iSorted: ");
 printList(list);
 }
 // Merge sort
 list = data(n);
 long mTime = mergeSort(list);
 if(n == 10) {
 System.out.format("%10s","mSorted: ");
 printList(list);
 }
 // Heap sort
 list=data(n);
 long hTime = heap(list);
 if(n == 10) {
 System.out.format("%10s","hSorted: ");
 printList(list);
 }
 // Quick sort
 list=data(n);
 long qTime = quick(list);
 if(n == 10) {
 System.out.format("%10s","qSorted: ");
 printList(list);
 }
 if(n == 10) { // This will only print once
 // Print prettifying stuff
 System.out.println("Data is being written to times.txt...");
 content = String.format(content + "%-9s%-17s%-17s%-17s%-17s\n",
 "n","Insertion","Merge","Heap","Quick");
 }
 content = String.format(content + "%-9d%-17s%-17s%-17s%-17s%-1s",n,displayTime(iTime),
 displayTime(mTime),displayTime(hTime),displayTime(qTime),"\n");
}
public static long insertion(int[] A) {
 long startTime = System.nanoTime();
 int i, j, key;
 for(j = 1; j < A.length; j++) {
 key = A[j];
 // If previous is greater than selected (key) swap
 for(i = j - 1; (i >= 0) && (A[i] > key); i--) {
 A[i+1] = A[i];
 }
 A[i+1] = key;
 }
 long endTime = System.nanoTime();
 return endTime - startTime;
}
public static long mergeSort(int[] A) {
 long startTime = System.nanoTime();
 if(A.length > 1) {
 // First Half
 int[] firstHalf = new int[A.length/2];
 System.arraycopy(A, 0, firstHalf, 0, A.length/2);
 mergeSort(firstHalf);
 // Second Half
 int secondHalfLength = A.length - A.length/2;
 int[] secondHalf = new int[secondHalfLength];
 System.arraycopy(A, A.length/2, secondHalf, 0, secondHalfLength);
 mergeSort(secondHalf);
 // Merge two arrays
 merge(firstHalf,secondHalf,A);
 }
 long endTime = System.nanoTime();
 return endTime - startTime;
}
public static void merge(int[] A1, int[] A2, int[] temp) {
 int current1 = 0; // Current index in list 1
 int current2 = 0; // Current index in list 2
 int current3 = 0; // Current index in temp
 // Compares elemets in A1 and A2 and sorts them
 while(current1 < A1.length && current2 < A2.length) {
 if(A1[current1] < A2[current2])
 temp[current3++] = A1[current1++];
 else
 temp[current3++] = A2[current2++];
 }
 // Merge two arrays into temp
 while(current1 < A1.length)
 temp[current3++] = A1[current1++];
 while(current2 < A2.length)
 temp[current3++] = A2[current2++];
}
public static long heap(int[] A) {
 long startTime = System.nanoTime();
 int temp;
 int heapSize = A.length-1;
 buildMaxHeap(A);
 for(int i = A.length-1; i >= 1; i--) {
 swap(A,0,i); // Root is now biggest element, swap to end of array
 heapSize--; // Reduce heapSize to ignore sorted elements
 maxHeapify(A,0,heapSize);
 }
 long endTime = System.nanoTime();
 return endTime - startTime;
}
public static void buildMaxHeap(int[] A) {
 int heapSize = A.length-1;
 // Bottom up, check parents children, sort and move up tree
 for(int i = (heapSize/2); i >= 0; i--)
 maxHeapify(A,i,heapSize);
}
public static void maxHeapify(int[] A, int i, int heapSize) {
 int temp,largest;
 int l = left(i); // 2i
 int r = right(i); // 2i + 1
 if(l <= heapSize && A[l] > A[i]) // Check left child (which is largest?)
 largest = l;
 else largest = i;
 if(r <= heapSize && A[r] > A[largest]) // Check right child
 largest = r;
 if(largest != i) { // If parent is biggest do nothing, else make parent largest
 swap(A,i,largest);
 maxHeapify(A,largest,heapSize);
 }
}
public static int left(int i) {
 return 2*i;
}
public static int right(int i) {
 return 2*i+1;
}
public static long quick(int[] list) {
 long startTime = System.nanoTime();
 quickSort(list, 0, list.length - 1);
 long endTime = System.nanoTime();
 return endTime - startTime;
}
public static void quickSort(int[] A, int p, int r) {
 if(p < r) {
 int q = randomizedPartition(A, p, r);
 quickSort(A, p, q-1);
 quickSort(A, q+1, r);
 }
}
public static int randomizedPartition(int[] A, int p, int r) {
 Random random = new Random();
 int i = random.nextInt((r-p)+1)+p;
 swap(A,r,i);
 return partition(A,p,r);
}
public static int partition(int[] A, int p, int r) {
 int x = A[r];
 int i = p-1;
 for(int j = p; j < r; j++) {
 if(A[j] <= x) {
 i++;
 swap(A, i, j);
 }
 }
 swap(A, i+1, r);
 return i+1;
}
public static void swap(int[] list, int i, int j) {
 int temp = list[i];
 list[i] = list[j];
 list[j] = temp;
}
public static String displayTime(long n) {
 long hours = TimeUnit.NANOSECONDS.toHours(n);
 long minutes = TimeUnit.NANOSECONDS.toMinutes(n) - (TimeUnit.NANOSECONDS.toHours(n)*60);
 long seconds = TimeUnit.NANOSECONDS.toSeconds(n) - (TimeUnit.NANOSECONDS.toMinutes(n) *60);
 long milliseconds = TimeUnit.NANOSECONDS.toMillis(n) - (TimeUnit.NANOSECONDS.toSeconds(n)*1000);
 long microseconds = TimeUnit.NANOSECONDS.toMicros(n) - (TimeUnit.NANOSECONDS.toMillis(n)*1000);
 String displayThis = (hours + ":" + minutes + ":" + seconds + ":" + milliseconds + ":" + microseconds);
 return displayThis;
}
public static int[] data(int n) {
 Random random = new Random(seed); // Random seed stays same for all sorts
 int[] list = new int[n];
 for(int i = 0; i < list.length; i++) {
 list[i] = random.nextInt(1000);
 }
 return list;
}
public static void printList(int[] list) {
 for(int i = 0; i < list.length; i++) {
 System.out.format("%5d",list[i]);
 }
 System.out.println();
}
public static void printToFile() {
 // Print to file
 try {
 File file = new File("times.txt");
 if(!file.exists())
 file.createNewFile();
 FileWriter fw = new FileWriter(file.getAbsoluteFile());
 BufferedWriter bw = new BufferedWriter(fw);
 bw.write(content);
 bw.close();
 System.out.println("Done.");
 } catch (IOException e) {
 e.printStackTrace();
 }
}
// Global variables
public static String content = ""; // Used to print data to text file times.txt
public static int seed = (int)(Math.random()*10_000); // Seed for generating lists
}

What do you think? Surely quick sort should be running near 3 seconds at 10mil rather than a minute. What am I doing wrong?

mjolka
16.3k2 gold badges30 silver badges73 bronze badges
asked Oct 21, 2014 at 1:46
\$\endgroup\$
15
  • 4
    \$\begingroup\$ Welcome to Code Review! I like that you've included your entire code, +1! Expect the whole thing to get reviewed ;) \$\endgroup\$ Commented Oct 21, 2014 at 2:12
  • \$\begingroup\$ @Mats Mug Thanks, I appreciate any review although I'm most concerned as to why I'm getting unexpected output. Looking forward to any ideas. ;) \$\endgroup\$ Commented Oct 21, 2014 at 2:24
  • \$\begingroup\$ @mjolka This is standard implementation. I'll try what you say later but I'm not sure how you could get SO. Any ideas where its coming from? \$\endgroup\$ Commented Oct 21, 2014 at 2:33
  • \$\begingroup\$ I'll play with it tomorrow (early day), but using my code as is causes no errors. \$\endgroup\$ Commented Oct 21, 2014 at 2:38
  • \$\begingroup\$ Updated my answer with explanation of why it's slow. \$\endgroup\$ Commented Oct 21, 2014 at 4:21

2 Answers 2

27
\$\begingroup\$

@mjolka was able to identify the root problem before I was ;-) There are two issues in addition to what he has pointed out.

The first is that it is very bad form to create new Random instances on each partitioning. Random instances internally synchronize on some logic structures, and cause a lot of overhead in their creation. You should instead have a different mechanism for the partitioning.

Additionally, your naming is horrible:

  • your class name starts with a lower-case
  • your variables are 1-letter long and some are upper-case (A, p, q, r, ...).

So, I did a few tests with your code, except I changed the data generator to:

public static int[] data(int n) {
 Random random = new Random(seed); // Random seed stays same for all sorts
 int[] list = new int[n];
 for(int i = 0; i < list.length; i++) {
 list[i] = random.nextInt(list.length * 10);
 }
 return list;
}

When I ran yur code with this, I got the results:

n Insertion Merge Heap Quick 
10 0:0:0:0:0 0:0:0:0:0 0:0:0:0:0 0:0:0:0:34 
1000 0:0:0:0:0 0:0:0:0:0 0:0:0:0:0 0:0:0:2:480 
100000 0:0:0:0:0 0:0:0:0:0 0:0:0:0:0 0:0:0:21:987 
1000000 0:0:0:0:0 0:0:0:0:0 0:0:0:0:0 0:0:0:139:194
10000000 0:0:0:0:0 0:0:0:0:0 0:0:0:0:0 0:0:1:468:183

As a result, you don't run in to the duplicate data issue quite as much, and the sort is fast.

Then I removed the new Random() from the random partition, and just used the mid-value, and he results are:

n Insertion Merge Heap Quick 
10 0:0:0:0:0 0:0:0:0:0 0:0:0:0:0 0:0:0:0:17 
1000 0:0:0:0:0 0:0:0:0:0 0:0:0:0:0 0:0:0:0:708 
100000 0:0:0:0:0 0:0:0:0:0 0:0:0:0:0 0:0:0:18:462 
1000000 0:0:0:0:0 0:0:0:0:0 0:0:0:0:0 0:0:0:103:790
10000000 0:0:0:0:0 0:0:0:0:0 0:0:0:0:0 0:0:1:182:385

Finally, I implemented my own sort using the equals-value grouping, and turned the data back to limit it to 1000, and the results are:

n Insertion Merge Heap Quick Monkey 
10 0:0:0:0:0 0:0:0:0:0 0:0:0:0:0 0:0:0:0:16 0:0:0:0:9 
1000 0:0:0:0:0 0:0:0:0:0 0:0:0:0:0 0:0:0:0:562 0:0:0:0:684 
100000 0:0:0:0:0 0:0:0:0:0 0:0:0:0:0 0:0:0:16:675 0:0:0:18:119 
1000000 0:0:0:0:0 0:0:0:0:0 0:0:0:0:0 0:0:0:532:437 0:0:0:65:608 
10000000 0:0:0:0:0 0:0:0:0:0 0:0:0:0:0 0:0:48:533:774 0:0:0:694:184 

With more random data the monkey sort slows down, and the quick-sort speeds up:

n Insertion Merge Heap Quick Monkey 
10 0:0:0:0:0 0:0:0:0:0 0:0:0:0:0 0:0:0:0:16 0:0:0:0:8 
1000 0:0:0:0:0 0:0:0:0:0 0:0:0:0:0 0:0:0:0:631 0:0:0:0:819 
100000 0:0:0:0:0 0:0:0:0:0 0:0:0:0:0 0:0:0:16:834 0:0:0:22:353 
1000000 0:0:0:0:0 0:0:0:0:0 0:0:0:0:0 0:0:0:103:699 0:0:0:127:421 
10000000 0:0:0:0:0 0:0:0:0:0 0:0:0:0:0 0:0:1:223:673 0:0:1:372:614 

Bottom line is that even with good data, 20% of your time is going to creating new Random instances. I have been using the randomizePartition method:

public static int randomizedPartition(int[] data, int first, int last) {
 // Random random = new Random();
 // int i = random.nextInt((last-first)+1)+first;
 swap(data,last,first + (last - first)/2);
 return partition(data,first,last);

Then, if you are interested in the monkey sort:

public static long monkey(int[] list) {
 long startTime = System.nanoTime();
 monkeySort(list, 0, list.length - 1);
 return System.nanoTime() - startTime;
}
private static void monkeySort(final int[] data, final int left, final int right) {
 if (left >= right) {
 return;
 }
 // partition in this method because we need two outputs, the 'same' and the 'lft'.
 // swap values the same as the partition to the end as well.
 final int val = data[right];
 int lft = left;
 int rht = right - 1;
 int same = right;
 while (lft <= rht) {
 if (data[lft] > val) {
 swap(data, lft, rht);
 rht--;
 } else if (data[rht] == val) {
 same--;
 swap(data, rht, same);
 rht--;
 } else {
 lft++;
 }
 }
 // move all the 'same' values in to the pivot point.
 int ntop = lft - 1;
 while (same <= right) {
 swap(data, lft++, same++);
 }
 monkeySort(data, left, ntop);
 monkeySort(data, lft, right);
}

More Detail on Random()

I made a reference to Random, and it is worth understanding more about what I mean. This is the (slightly reorganized) source code for java.util.Random:

private static final AtomicLong seedUniquifier
 = new AtomicLong(8682522807148012L);
public Random() {
 this(seedUniquifier() ^ System.nanoTime());
}
private static long seedUniquifier() {
 // L'Ecuyer, "Tables of Linear Congruential Generators of
 // Different Sizes and Good Lattice Structure", 1999
 for (;;) {
 long current = seedUniquifier.get();
 long next = current * 181783497276652981L;
 if (seedUniquifier.compareAndSet(current, next))
 return next;
 }
}
public Random(long seed) {
 if (getClass() == Random.class)
 this.seed = new AtomicLong(initialScramble(seed));
 else {
 // subclass might have overriden setSeed
 this.seed = new AtomicLong();
 setSeed(seed);
 }
}

Notice that there is a static AtomicLong called seedUniquifier. Every time you create a new Random, two references are made to that AtomicLong, causing a number of memory effects that are unnecessary. Additionally, there is a potential race condition in there which may cause the process to loop and retry.

It is already somewhat bad that Random has a single AtomicLong reference to ensure that the Random class is thread safe, but the entire Random constructor is globally safe too essentially (you effectively cannot create two Random instances at the same time, with the same seed). The implementation of that requirement is... costly.

answered Oct 21, 2014 at 4:41
\$\endgroup\$
3
  • \$\begingroup\$ I'm not sure what you mean with your last point. Are you saying you took away randomization completely and went with normal quicksort? If this is the case, I agree it might be better to a small degree because we can already assume the data is random enough to not need it. \$\endgroup\$ Commented Oct 21, 2014 at 16:06
  • \$\begingroup\$ The point about the random is that you should not be creating a new one each partition. If you can pass an instance around in your stack then that would be a better solution. new Random() should be used very sparingly. \$\endgroup\$ Commented Oct 21, 2014 at 16:09
  • \$\begingroup\$ I removed the new Random() from the random partition() to go on and throw out the baby with the bath water, eliminating random.nextInt(). An alternative would be to actually instantiate a QuickSorter and use a Random data member. \$\endgroup\$ Commented Dec 21, 2017 at 15:46
38
\$\begingroup\$

This implementation of Quicksort has poor performance for arrays with many repeated elements. From Wikipedia, emphasis mine

With a partitioning algorithm such as the one described above (even with one that chooses good pivot values), quicksort exhibits poor performance for inputs that contain many repeated elements. The problem is clearly apparent when all the input elements are equal: at each recursion, the left partition is empty (no input values are less than the pivot), and the right partition has only decreased by one element (the pivot is removed). Consequently, the algorithm takes quadratic time to sort an array of equal values.

To solve this quicksort equivalent of the Dutch national flag problem, an alternative linear-time partition routine can be used that separates the values into three groups: values less than the pivot, values equal to the pivot, and values greater than the pivot.

You can see this quadratic behaviour by running it on the input new int[10000], for example. In fact, you'll most likely get a StackOverflowError.

Now in your test data, you have \10ドル{,}000{,}000\$ elements, but you're only picking random values in the range \$[0,1{,}000)\$. So... you have lots of repeated elements!

Let's run it as-is on my computer (I didn't run insertion sort, as it's too slow)

$ java AlgComp && cat times.txt
Unsorted: 323 653 751 33 350 378 913 280 243 792
 mSorted: 33 243 280 323 350 378 653 751 792 913
 hSorted: 33 243 280 323 350 378 653 751 792 913
 qSorted: 33 243 280 323 350 378 653 751 792 913
Data is being written to times.txt...
-----------------------------------------------
Done.
n Insertion Merge Heap Quick
10 0:0:0:0:0 0:0:0:0:27 0:0:0:0:35 0:0:0:0:38
1000 0:0:0:0:0 0:0:0:1:852 0:0:0:0:822 0:0:0:3:224
100000 0:0:0:0:0 0:0:0:28:421 0:0:0:20:784 0:0:0:37:872
1000000 0:0:0:0:0 0:0:0:233:576 0:0:0:202:443 0:0:0:905:65
10000000 0:0:0:0:0 0:0:2:359:261 0:0:2:752:239 0:1:20:914:310

Now let's change this one line in data:

list[i] = random.nextInt(1000);

to this

list[i] = random.nextInt(1000000);

Now the results are more in line with our expectations:

$ java AlgComp && cat times.txt
Unsorted: 1662389001535502665295332468356126461089888942823562420254
 mSorted: 2356224683166238420254529533550266561264610898889428900153
 hSorted: 2356224683166238420254529533550266561264610898889428900153
 qSorted: 2356224683166238420254529533550266561264610898889428900153
Data is being written to times.txt...
-----------------------------------------------
Done.
n Insertion Merge Heap Quick
10 0:0:0:0:0 0:0:0:0:21 0:0:0:0:98 0:0:0:0:56
1000 0:0:0:0:0 0:0:0:1:997 0:0:0:1:14 0:0:0:2:41
100000 0:0:0:0:0 0:0:0:27:223 0:0:0:22:562 0:0:0:21:587
1000000 0:0:0:0:0 0:0:0:283:939 0:0:0:215:551 0:0:0:137:658
10000000 0:0:0:0:0 0:0:2:899:176 0:0:3:681:388 0:0:1:845:255

Of course the real fix is not to change data, but to change the partitioning algorithm.

answered Oct 21, 2014 at 3:37
\$\endgroup\$
0

Your Answer

Draft saved
Draft discarded

Sign up or log in

Sign up using Google
Sign up using Email and Password

Post as a guest

Required, but never shown

Post as a guest

Required, but never shown

By clicking "Post Your Answer", you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.