OpenMP : Parallel QuickSort
Solution 1
I apologize for my first comment — it was not relevant to your problem. I have not found the true cause of your issue (the element-moving step may be the problem). Based on your approach, I wrote a similar program, and it works fine. (I am also new to OpenMP.)
#include <stdio.h>
#include <stdlib.h>
/* Partition a[p..r] around the pivot a[r]; return the pivot's final index.
 * Elements smaller than the pivot are gathered into lt[], the rest into
 * gt[], then both are copied back around the pivot. */
int partition(int * a, int p, int r)
{
    int lt[r - p];     /* elements strictly smaller than the pivot */
    int gt[r - p];     /* elements greater than or equal to the pivot */
    int i;
    int j;
    int key = a[r];    /* last element is the pivot */
    int lt_n = 0;
    int gt_n = 0;

    /* BUG FIX: the original marked this loop with "#pragma omp parallel for".
     * lt_n++ and gt_n++ are unsynchronized read-modify-writes on counters
     * shared by all threads, so parallel execution is a data race that
     * drops/duplicates elements (the wrong outputs reported in the
     * comments). The loop must run sequentially — parallelizing it
     * correctly requires per-thread buffers or a parallel prefix sum. */
    for (i = p; i < r; i++) {
        if (a[i] < a[r]) {
            lt[lt_n++] = a[i];
        } else {
            gt[gt_n++] = a[i];
        }
    }

    /* Write back: smaller elements, then the pivot, then the rest. */
    for (i = 0; i < lt_n; i++) {
        a[p + i] = lt[i];
    }
    a[p + lt_n] = key;
    for (j = 0; j < gt_n; j++) {
        a[p + lt_n + j + 1] = gt[j];
    }
    return p + lt_n;   /* final position of the pivot */
}
/* Recursively sort a[p..r] (inclusive bounds) with quicksort.
 * The two independent halves are handed to OpenMP sections so they can
 * run concurrently.
 * NOTE(review): every recursion level opens a new parallel region; unless
 * nested parallelism is enabled and bounded this mostly adds thread-team
 * overhead — OpenMP tasks are the usual modern alternative. Confirm
 * before relying on this for speedup. */
void quicksort(int * a, int p, int r)
{
int div;
if(p < r){
/* div is the pivot's final index; a[div] is already in place. */
div = partition(a, p, r);
#pragma omp parallel sections
{
#pragma omp section
quicksort(a, p, div - 1);
#pragma omp section
quicksort(a, div + 1, r);
}
}
}
/* Demo driver: sort a fixed 10-element array and print the result,
 * tab-separated, followed by a newline. */
int main(void)
{
    int data[10] = {5, 3, 8, 4, 0, 9, 2, 1, 7, 6};

    quicksort(data, 0, 9);

    for (int k = 0; k < 10; k++) {
        printf("%d\t", data[k]);
    }
    printf("\n");

    return 0;
}
Solution 2
I decided to post this answer because:
-
the accepted answer is wrong, and the user seems inactive these days. There is a race-condition on
#pragma omp parallel for for(i = p; i < r; i++){ if(a[i] < a[r]){ lt[lt_n++] = a[i]; //<- race condition lt_n is shared }else{ gt[gt_n++] = a[i]; //<- race condition gt_n is shared } }
-
Nonetheless, even if it were correct, the modern answer to this question is to use OpenMP
tasks
instead of sections
.
I am providing the community with full runnable example of such approach including tests and profiling.
#include <assert.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <omp.h>
#define TASK_SIZE 100
/* Return a uniformly distributed random integer in [min, max].
 * Uses rejection sampling to avoid the modulo bias of rand() % range.
 * Technique: https://stackoverflow.com/questions/2509679/
 * Precondition: min <= max and (max - min) < RAND_MAX; otherwise
 * `buckets` would be 0 and the division below would be undefined. */
unsigned int rand_interval(unsigned int min, unsigned int max)
{
    const unsigned int range = 1 + max - min;
    const unsigned int buckets = RAND_MAX / range;
    const unsigned int limit = buckets * range;

    /* FIX: r was declared `int`, forcing a mixed signed/unsigned
     * comparison against `limit`. rand() is always non-negative, so an
     * unsigned local is the correct type (and silences -Wsign-compare). */
    unsigned int r;
    do
    {
        r = (unsigned int) rand();
    }
    while (r >= limit);

    return min + (r / buckets);
}
/* Populate m[0..size-1] with random values drawn from [min, max]. */
void fillupRandomly (int *m, int size, unsigned int min, unsigned int max){
    int pos = 0;
    while (pos < size) {
        m[pos] = rand_interval(min, max);
        pos++;
    }
}
/* Zero out the first `size` elements of a. */
void init(int *a, int size){
    int idx = 0;
    while (idx < size) {
        a[idx] = 0;
        idx++;
    }
}
/* Print the first `size` elements of a, space-separated, then a newline. */
void printArray(int *a, int size){
    int pos = 0;
    while (pos < size) {
        printf("%d ", a[pos]);
        pos++;
    }
    printf("\n");
}
/* Return 1 if a[0..size-1] is in non-decreasing order, 0 otherwise.
 * Empty and single-element arrays are trivially sorted. */
int isSorted(int *a, int size){
    int i = 1;
    while (i < size) {
        if (a[i - 1] > a[i])
            return 0;
        i++;
    }
    return 1;
}
/* Partition a[p..r] in place around the pivot a[r]; return the pivot's
 * final index. Elements strictly below the pivot end up to its left,
 * everything else to its right. */
int partition(int * a, int p, int r)
{
    int smaller[r - p];   /* elements strictly below the pivot   */
    int larger[r - p];    /* elements at or above the pivot      */
    int n_small = 0;
    int n_large = 0;
    int pivot = a[r];

    /* Distribute a[p..r-1] into the two scratch buffers. */
    for (int idx = p; idx < r; idx++) {
        if (a[idx] < pivot)
            smaller[n_small++] = a[idx];
        else
            larger[n_large++] = a[idx];
    }

    /* Write back: small block, pivot, large block. */
    for (int k = 0; k < n_small; k++)
        a[p + k] = smaller[k];
    a[p + n_small] = pivot;
    for (int k = 0; k < n_large; k++)
        a[p + n_small + 1 + k] = larger[k];

    return p + n_small;
}
/* Recursively quicksort a[p..r] (inclusive bounds). Sub-ranges larger
 * than TASK_SIZE are deferred as OpenMP tasks so idle threads can steal
 * them; smaller ranges run inline via the task `if` clause. */
void quicksort(int * a, int p, int r)
{
    if (p >= r)
        return;            /* 0- or 1-element range: already sorted */

    int pivot_pos = partition(a, p, r);

    #pragma omp task shared(a) if(r - p > TASK_SIZE)
    quicksort(a, p, pivot_pos - 1);

    #pragma omp task shared(a) if(r - p > TASK_SIZE)
    quicksort(a, pivot_pos + 1, r);
}
/* Driver: fill an array of N random ints, sort it in parallel, time the
 * sort, and verify the result.
 * argv[1] = array size (default 10), argv[2] = print flag (default 0),
 * argv[3] = number of threads (default 2). */
int main(int argc, char *argv[])
{
    srand(123456);   /* fixed seed: runs are reproducible */

    int N          = (argc > 1) ? atoi(argv[1]) : 10;
    int print      = (argc > 2) ? atoi(argv[2]) : 0;
    int numThreads = (argc > 3) ? atoi(argv[3]) : 2;

    int *X = malloc(N * sizeof *X);
    if (!X)                      /* deal with failed memory allocation */
        return (EXIT_FAILURE);   /* (unused `tmp` scratch buffer removed) */

    omp_set_dynamic(0);              /* explicitly disable dynamic teams */
    omp_set_num_threads(numThreads); /* use numThreads for all parallel regions */

    fillupRandomly(X, N, 0, 5);

    double begin = omp_get_wtime();
    #pragma omp parallel
    {
        /* One thread seeds the recursion; the tasks it spawns are
         * executed by the whole team. */
        #pragma omp single
        quicksort(X, 0, N - 1);  /* BUG FIX: last valid index is N-1.
                                  * The original passed N, so partition()
                                  * read and wrote a[N] out of bounds. */
    }
    double end = omp_get_wtime();
    printf("Time: %f (s) \n", end - begin);

    assert(1 == isSorted(X, N));
    if (print) {
        printArray(X, N);
    }

    free(X);
    return (EXIT_SUCCESS);       /* dead `return 0;` after this removed */
}
How to run:
This program accepts three parameters:
- The size of the array;
- Print or not the array, 0 for no, otherwise yes;
- The number of Threads to run in parallel.
Mini Benchmark
In a 4 core machine : Input 100000 with
1 Thread -> Time: 0.784504 (s)
2 Threads -> Time: 0.424008 (s) ~ speedup 1.85x
4 Threads -> Time: 0.282944 (s) ~ speedup 2.77x
randomp
Updated on March 26, 2021
Comments
-
randomp about 3 years
I try to use OpenMP to parallelize QuickSort in partition part and QuickSort part. My C code is as follows:
#include "stdlib.h" #include "stdio.h" #include "omp.h" // parallel partition int ParPartition(int *a, int p, int r) { int b[r-p]; int key = *(a+r); // use the last element in the array as the pivot int lt[r-p]; // mark 1 at the position where its element is smaller than the key, else 0 int gt[r-p]; // mark 1 at the position where its element is bigger than the key, else 0 int cnt_lt = 0; // count 1 in the lt array int cnt_gt = 0; // count 1 in the gt array int j=p; int k = 0; // the position of the pivot // deal with gt and lt array #pragma omp parallel for for ( j=p; j<r; ++j) { b[j-p] = *(a+j); if (*(a+j) < key) { lt[j-p] = 1; gt[j-p] = 0; } else { lt[j-p] = 0; gt[j-p] = 1; } } // calculate the new position of the elements for ( j=0; j<(r-p); ++j) { if (lt[j]) { ++cnt_lt; lt[j] = cnt_lt; } else lt[j] = cnt_lt; if (gt[j]) { ++cnt_gt; gt[j] = cnt_gt; } else gt[j] = cnt_gt; } // move the pivot k = lt[r-p-1]; *(a+p+k) = key; // move elements to their new positon #pragma omp parallel for for ( j=p; j<r; ++j) { if (b[j-p] < key) *(a+p+lt[j-p]-1) = b[j-p]; else if (b[j-p] > key) *(a+k+gt[j-p]) = b[j-p]; } return (k+p); } void ParQuickSort(int *a, int p, int r) { int q; if (p<r) { q = ParPartition(a, p, r); #pragma omp parallel sections { #pragma omp section ParQuickSort(a, p, q-1); #pragma omp section ParQuickSort(a, q+1, r); } } } int main() { int a[10] = {5, 3, 8, 4, 0, 9, 2, 1, 7, 6}; ParQuickSort(a, 0, 9); int i=0; for (; i!=10; ++i) printf("%d\t", a[i]); printf("\n"); return 0; }
For the example in the main function, the sorting result is:
0 9 9 2 2 2 6 7 7 7
I used gdb to debug. In the early recursion, all went well. But in some recursions, it suddenly messed up to begin duplicate elements. Then generate the above result.
Can someone help me figure out where the problem is?
-
randomp about 11 yearsThank you! I guess it's because of the b array?
-
MYMNeo about 11 years@randomp, I can just say 'maybe'.:-)
-
Quxflux over 9 yearsThis seems to be wrong. The algorithm divides the array to be sorted in smaller partial list of the initial list. As long as the spawned threads just swap objects inside their "own" partial list, no race conditions can arise.
-
ftfish about 9 yearsI see no reason why this would ever work. In the
parallel for
, lt_n
and gt_n
are possibly modified by more than one thread without any synchronization. Maybe the array is just so small that only one thread is working on that section. -
ftfish about 9 yearsUpdate: I ran this multiple times and indeed saw wrong result:
0 1 2 3 5 6 6 7 8 9
. Therefore the code is wrong. @randomp -
dreamcrash about 3 years@ftfish yep, this code seems wrong, I have posted a solution if you want to check