OpenMP : Parallel QuickSort

23,781

Solution 1

I feel sorry for my first comment.It does not matter with your problem.I have not found the true problem of your question(Maybe your move element has the problem).According to your opinion, I wrote a similar program, it works fine.(I am also new on OpenMP).

#include <stdio.h>
#include <stdlib.h>


int partition(int * a, int p, int r)
{
    int lt[r-p];
    int gt[r-p];
    int i;
    int j;
    int key = a[r];
    int lt_n = 0;
    int gt_n = 0;

#pragma omp parallel for
    for(i = p; i < r; i++){
        if(a[i] < a[r]){
            lt[lt_n++] = a[i];
        }else{
            gt[gt_n++] = a[i];
        }   
    }   

    for(i = 0; i < lt_n; i++){
        a[p + i] = lt[i];
    }   

    a[p + lt_n] = key;

    for(j = 0; j < gt_n; j++){
        a[p + lt_n + j + 1] = gt[j];
    }   

    return p + lt_n;
}

void quicksort(int * a, int p, int r)
{
    int div;

    if(p < r){ 
        div = partition(a, p, r); 
#pragma omp parallel sections
        {   
#pragma omp section
            quicksort(a, p, div - 1); 
#pragma omp section
            quicksort(a, div + 1, r); 

        }
    }
}

int main(void)
{
    int a[10] = {5, 3, 8, 4, 0, 9, 2, 1, 7, 6};
    int i;

    quicksort(a, 0, 9);

    for(i = 0;i < 10; i++){
        printf("%d\t", a[i]);
    }
    printf("\n");
    return 0;
}

Solution 2

I decided to post this answer because:

  1. the accepted answer is wrong, and the user seems inactive these days. There is a race-condition on

     #pragma omp parallel for
     for(i = p; i < r; i++){
         if(a[i] < a[r]){
             lt[lt_n++] = a[i]; //<- race condition lt_n is shared
         }else{
             gt[gt_n++] = a[i];  //<- race condition gt_n is shared
         }   
     }   
    
  2. Nonetheless, even if it was correct, the modern answer to this question is to use OpenMP tasks instead of sections.

I am providing the community with full runnable example of such approach including tests and profiling.

#include <assert.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <omp.h>

#define TASK_SIZE 100

unsigned int rand_interval(unsigned int min, unsigned int max)
{
    // https://stackoverflow.com/questions/2509679/
    int r;
    const unsigned int range = 1 + max - min;
    const unsigned int buckets = RAND_MAX / range;
    const unsigned int limit = buckets * range;

    do
    {
        r = rand();
    } 
    while (r >= limit);

    return min + (r / buckets);
}

void fillupRandomly (int *m, int size, unsigned int min, unsigned int max){
    for (int i = 0; i < size; i++)
    m[i] = rand_interval(min, max);
} 


void init(int *a, int size){
   for(int i = 0; i < size; i++)
       a[i] = 0;
}

void printArray(int *a, int size){
   for(int i = 0; i < size; i++)
       printf("%d ", a[i]);
   printf("\n");
}

int isSorted(int *a, int size){
   for(int i = 0; i < size - 1; i++)
      if(a[i] > a[i + 1])
        return 0;
   return 1;
}


int partition(int * a, int p, int r)
{
    int lt[r-p];
    int gt[r-p];
    int i;
    int j;
    int key = a[r];
    int lt_n = 0;
    int gt_n = 0;

    for(i = p; i < r; i++){
        if(a[i] < a[r]){
            lt[lt_n++] = a[i];
        }else{
            gt[gt_n++] = a[i];
        }   
    }   

    for(i = 0; i < lt_n; i++){
        a[p + i] = lt[i];
    }   

    a[p + lt_n] = key;

    for(j = 0; j < gt_n; j++){
        a[p + lt_n + j + 1] = gt[j];
    }   

    return p + lt_n;
}

void quicksort(int * a, int p, int r)
{
    int div;

    if(p < r){ 
        div = partition(a, p, r); 
        #pragma omp task shared(a) if(r - p > TASK_SIZE) 
        quicksort(a, p, div - 1); 
        #pragma omp task shared(a) if(r - p > TASK_SIZE)
        quicksort(a, div + 1, r); 
    }
}

int main(int argc, char *argv[])
{
        srand(123456);
        int N  = (argc > 1) ? atoi(argv[1]) : 10;
        int print = (argc > 2) ? atoi(argv[2]) : 0;
        int numThreads = (argc > 3) ? atoi(argv[3]) : 2;
        int *X = malloc(N * sizeof(int));
        int *tmp = malloc(N * sizeof(int));

        omp_set_dynamic(0);              /** Explicitly disable dynamic teams **/
        omp_set_num_threads(numThreads); /** Use N threads for all parallel regions **/

         // Dealing with fail memory allocation
        if(!X || !tmp)
        { 
           if(X) free(X);
           if(tmp) free(tmp);
           return (EXIT_FAILURE);
        }

        fillupRandomly (X, N, 0, 5);

        double begin = omp_get_wtime();
        #pragma omp parallel
        {
            #pragma omp single
             quicksort(X, 0, N);
        }   
        double end = omp_get_wtime();
        printf("Time: %f (s) \n",end-begin);
    
        assert(1 == isSorted(X, N));

        if(print){
           printArray(X, N);
        }

        free(X);
        free(tmp);
        return (EXIT_SUCCESS);

    return 0;
}

How to run:

This program accepts three parameters:

  1. The size of the array;
  2. Print or not the array, 0 for no, otherwise yes;
  3. The number of Threads to run in parallel.

Mini Benchmark

In a 4 core machine : Input 100000 with

1 Thread  -> Time: 0.784504 (s)
2 Threads -> Time: 0.424008 (s) ~ speedup 1.85x
4 Threads -> Time: 0.282944 (s) ~ speedup 2.77x
Share:
23,781
randomp
Author by

randomp

Updated on March 26, 2021

Comments

  • randomp
    randomp about 3 years

    I try to use OpenMP to parallelize QuickSort in partition part and QuickSort part. My C code is as follows:

    #include "stdlib.h"
    #include "stdio.h"
    #include "omp.h"
    
    // parallel partition
    int ParPartition(int *a, int p, int r) {
        int b[r-p];
        int key = *(a+r); // use the last element in the array as the pivot
        int lt[r-p]; // mark 1 at the position where its element is smaller than the key, else 0
        int gt[r-p]; // mark 1 at the position where its element is bigger than the key, else 0
        int cnt_lt = 0; // count 1 in the lt array
        int cnt_gt = 0; // count 1 in the gt array
        int j=p;
        int k = 0; // the position of the pivot
        // deal with gt and lt array
        #pragma omp parallel for
        for ( j=p; j<r; ++j) {
            b[j-p] = *(a+j);
            if (*(a+j) < key) {
                lt[j-p] = 1;
                gt[j-p] = 0;
            } else {
                lt[j-p] = 0;
                gt[j-p] = 1;
            }
        }
        // calculate the new position of the elements
        for ( j=0; j<(r-p); ++j) {
            if (lt[j]) {
                ++cnt_lt;
                lt[j] = cnt_lt;
            } else
                lt[j] = cnt_lt;
            if (gt[j]) {
                ++cnt_gt;
                gt[j] = cnt_gt;
            } else
                gt[j] = cnt_gt;
        }
        // move the pivot
        k = lt[r-p-1];
        *(a+p+k) = key;
        // move elements to their new positon
        #pragma omp parallel for 
        for ( j=p; j<r; ++j) {
            if (b[j-p] < key)
                *(a+p+lt[j-p]-1) = b[j-p];
            else if (b[j-p] > key)
                *(a+k+gt[j-p]) = b[j-p];
        }
        return (k+p);
    }
    
    void ParQuickSort(int *a, int p, int r) {
        int q;
        if (p<r) {
            q = ParPartition(a, p, r);
            #pragma omp parallel sections
            {
            #pragma omp section
            ParQuickSort(a, p, q-1);
            #pragma omp section
            ParQuickSort(a, q+1, r);
            }
        }
    }
    
    int main() {
        int a[10] = {5, 3, 8, 4, 0, 9, 2, 1, 7, 6};
        ParQuickSort(a, 0, 9);
        int i=0;
        for (; i!=10; ++i)
            printf("%d\t", a[i]);
        printf("\n");
        return 0;
    }
    

    For the example in the main function, the sorting result is:

    0   9   9   2   2   2   6   7   7   7
    

    I used gdb to debug. In the early recursion, all went well. But in some recursions, it suddenly messed up to begin duplicate elements. Then generate the above result.

    Can someone help me figure out where the problem is?

  • randomp
    randomp about 11 years
    Thank you! I guess it's because of the b array?
  • MYMNeo
    MYMNeo about 11 years
    @randomp, I can just say 'maybe'.:-)
  • Quxflux
    Quxflux over 9 years
    This seems to be wrong. The algorithm divides the array to be sorted in smaller partial list of the initial list. As long as the spawned threads just swap objects inside their "own" partial list, no race conditions can arise.
  • ftfish
    ftfish about 9 years
    I see no reason why this would ever work. In the parallel for, lt_n and gt_n are possibly modified by more than one threads without any synchronization. Maybe the array is just so small that only one thread is working on that section.
  • ftfish
    ftfish about 9 years
    Update: I ran this multiple times and indeed saw wrong result: 0 1 2 3 5 6 6 7 8 9. Therefore the code is wrong. @randomp
  • dreamcrash
    dreamcrash about 3 years
    @ftfish yep, this code seems wrong, I have posted a solution if you want to check