▶ 《并行程序设计导论》第六章中讨论了 n 体问题,分别使用了 MPI,Pthreads,OpenMP 来进行实现,这里是 OpenMP 的代码,分为基本算法和简化算法(引力计算量为基本算法的一半,但是消息传递较为复杂)
● 基本算法
// omp_nbody_basic.c,OpenMP 基本算法
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <omp.h> #define OUTPUT
#define DEBUG
#define DIM 2
#define X 0
#define Y 1
typedef double vect_t[DIM];
const double G = 6.673e-11;
struct particle_s // 使用一个结构来同时保存一个颗粒的质量,位置,速度
{
double m;
vect_t s;
vect_t v;
}; void Usage(char* prog_name)
{
fprintf(stderr, "usage: %s <nThreads> <nParticles> <nTimestep> <sizeTimestep> <outputFrequency> <g|i>\n", prog_name);
fprintf(stderr, " 'g': inite condition by random\n 'i': inite condition from stdin\n");
exit();
} void Get_args(int argc, char* argv[], int* thread_count_p, int* n_p, int* n_steps_p, double* delta_t_p, int* output_freq_p, char* g_i_p)
{
if (argc != )
Usage(argv[]);
*thread_count_p = strtol(argv[], NULL, );
*n_p = strtol(argv[], NULL, );
*n_steps_p = strtol(argv[], NULL, );
*delta_t_p = strtod(argv[], NULL);
*output_freq_p = strtol(argv[], NULL, );
*g_i_p = argv[][];
if (*thread_count_p < || *n_p <= || *n_p % *thread_count_p || *n_steps_p < || *delta_t_p <= || *g_i_p != 'g' && *g_i_p != 'i')
{
Usage(argv[]);
exit();
}
# ifdef DEBUG
printf("Get_args, nThread%2d n %d n_steps %d delta_t %e output_freq %d g_i %c\n",
*thread_count_p, *n_p, *n_steps_p, *delta_t_p, *output_freq_p, *g_i_p);
fflush(stdout);
# endif
} void Gen_init_cond(struct particle_s curr[], int n)
{
const double mass = 5.0e24, gap = 1.0e5, speed = 3.0e4;
//srand(2);
for (int i = ; i < n; i++)
{
curr[i].m = mass;
curr[i].s[X] = i * gap;
curr[i].s[Y] = 0.0;
curr[i].v[X] = 0.0;
// vel[i][Y] = speed * (2 * rand() / (double)RAND_MAX) - 1);
curr[i].v[Y] = (i % ) ? -speed : speed;
}
} void Get_init_cond(struct particle_s curr[], int n)
{
printf("For each particle, enter (in order): mass x-coord y-coord x-velocity y-velocity\n");
for (int i = ; i < n; i++)
{
scanf_s("%lf", &curr[i].m);
scanf_s("%lf", &curr[i].s[X]);
scanf_s("%lf", &curr[i].s[Y]);
scanf_s("%lf", &curr[i].v[X]);
scanf_s("%lf", &curr[i].v[Y]);
}
} void Output_state(double time, struct particle_s curr[], int n)
{
printf("Output_state, time = %.2f\n", time);
for (int i = ; i < n; i++)
printf("%2d %10.3e %10.3e %10.3e %10.3e %10.3e\n", i, curr[i].m, curr[i].s[X], curr[i].s[Y], curr[i].v[X], curr[i].v[Y]);
printf("\n");
fflush(stdout);
} void Compute_force(int part, vect_t forces[], struct particle_s curr[], int n)
{
int k;
vect_t f_part_k;
double len, fact;
for (forces[part][X] = forces[part][Y] = 0.0, k = ; k < n; k++)
{
if (k != part)
{
f_part_k[X] = curr[part].s[X] - curr[k].s[X];
f_part_k[Y] = curr[part].s[Y] - curr[k].s[Y];
len = sqrt(f_part_k[X] * f_part_k[X] + f_part_k[Y] * f_part_k[Y]);
fact = -G * curr[part].m * curr[k].m / (len * len * len);
f_part_k[X] *= fact;
f_part_k[Y] *= fact;
forces[part][X] += f_part_k[X];
forces[part][Y] += f_part_k[Y];
# ifdef DEBUG
printf("Compute_force, %d, %2d> %10.3e %10.3e %10.3e %10.3e\n", k, len, fact, f_part_k[X], f_part_k[Y]);
fflush(stdout);
# endif
}
}
} void Update_part(int part, vect_t forces[], struct particle_s curr[], int n, double delta_t)
{
const double fact = delta_t / curr[part].m;
# ifdef DEBUG
printf("Update_part before, part%2d %10.3e %10.3e %10.3e %10.3e %10.3e %10.3e\n",
part, curr[part].s[X], curr[part].s[Y], curr[part].v[X], curr[part].v[Y], forces[part][X], forces[part][Y]);
fflush(stdout);
# endif
curr[part].s[X] += delta_t * curr[part].v[X];
curr[part].s[Y] += delta_t * curr[part].v[Y];
curr[part].v[X] += fact * forces[part][X];
curr[part].v[Y] += fact * forces[part][Y];
# ifdef DEBUG
printf("Update_part after, part%2d %10.3e %10.3e %10.3e %10.3e)\n",
part, curr[part].s[X], curr[part].s[Y], curr[part].v[X], curr[part].v[Y]);
# endif
} void Compute_energy(struct particle_s curr[], int n, double* kin_en_p, double* pot_en_p)// 计算系统轨道能量,没用到
{
int i, j;
vect_t diff;
double temp;
for (i = , temp = 0.0; i < n; temp += curr[i].m * curr[i].v[X] * curr[i].v[X] + curr[i].v[Y] * curr[i].v[Y], i++);
*kin_en_p = temp * 0.5;
for (i = , temp = .; i < n - ; i++)
{
for (j = i + ; j < n; j++)
{
diff[X] = curr[i].s[X] - curr[j].s[X];
diff[Y] = curr[i].s[Y] - curr[j].s[Y];
temp += -G * curr[i].m * curr[j].m / sqrt(diff[X] * diff[X] + diff[Y] * diff[Y]);
}
}
*pot_en_p = temp;
} int main(int argc, char* argv[])
{
int n, part;
int n_steps, step;
double delta_t;
int output_freq;
struct particle_s* curr; // 颗粒信息
vect_t* forces; // 引力信息
int thread_count; // 线程数,用于函数参数和 omp 子句,不是全局变量
char g_i;
double start, finish; Get_args(argc, argv, &thread_count, &n, &n_steps, &delta_t, &output_freq, &g_i);
curr = (particle_s*)malloc(n * sizeof(struct particle_s));
forces = (vect_t*)malloc(n * sizeof(vect_t));
if (g_i == 'g')
Gen_init_cond(curr, n);
else
Get_init_cond(curr, n); start = omp_get_wtime();
# ifdef OUTPUT
Output_state(, curr, n);
# endif
# pragma omp parallel num_threads(thread_count) default(none) shared(curr, forces, thread_count, delta_t, n, n_steps, output_freq) private(step, part)
for (step = ; step <= n_steps; step++)
{
// memset(forces, 0, n*sizeof(vect_t));
# pragma omp for
for (part = ; part < n; part++)
Compute_force(part, forces, curr, n);
# pragma omp for
for (part = ; part < n; part++)
Update_part(part, forces, curr, n, delta_t);
# ifdef OUTPUT
# pragma omp single
if (step % output_freq == )
Output_state(step * delta_t, curr, n);
# endif
} finish = omp_get_wtime();
printf("Elapsed time = %e ms\n", (finish - start) * );
free(curr);
free(forces);
return ;
}
● 输出结果。8 进程 16 体,3 秒,时间步长 1 秒,舍去 debug 输出 1.094827e+01 ms;8 进程 1024 体,3600 秒,时间步长 1 秒,舍去 output 和 debug 输出 1.485764e+04 ms
D:\Code\OpenMP\OpenMPProjectTemp\x64\Debug>OpenMPProjectTemp.exe g
Output_state, time = 1.00
5.000e+24 0.000e+00 3.000e+04 5.273e+04 3.000e+04
5.000e+24 1.000e+05 -3.000e+04 1.922e+04 -3.000e+04
5.000e+24 2.000e+05 3.000e+04 1.071e+04 3.000e+04
5.000e+24 3.000e+05 -3.000e+04 6.802e+03 -3.000e+04
5.000e+24 4.000e+05 3.000e+04 4.485e+03 3.000e+04
5.000e+24 5.000e+05 -3.000e+04 2.875e+03 -3.000e+04
5.000e+24 6.000e+05 3.000e+04 1.614e+03 3.000e+04
5.000e+24 7.000e+05 -3.000e+04 5.213e+02 -3.000e+04
5.000e+24 8.000e+05 3.000e+04 -5.213e+02 3.000e+04
5.000e+24 9.000e+05 -3.000e+04 -1.614e+03 -3.000e+04
5.000e+24 1.000e+06 3.000e+04 -2.875e+03 3.000e+04
5.000e+24 1.100e+06 -3.000e+04 -4.485e+03 -3.000e+04
5.000e+24 1.200e+06 3.000e+04 -6.802e+03 3.000e+04
5.000e+24 1.300e+06 -3.000e+04 -1.071e+04 -3.000e+04
5.000e+24 1.400e+06 3.000e+04 -1.922e+04 3.000e+04
5.000e+24 1.500e+06 -3.000e+04 -5.273e+04 -3.000e+04 Output_state, time = 2.00
5.000e+24 5.273e+04 6.000e+04 9.288e+04 1.641e+04
5.000e+24 1.192e+05 -6.000e+04 3.818e+04 -3.791e+03
5.000e+24 2.107e+05 6.000e+04 2.116e+04 3.791e+03
5.000e+24 3.068e+05 -6.000e+04 1.356e+04 -3.101e+03
5.000e+24 4.045e+05 6.000e+04 8.930e+03 3.101e+03
5.000e+24 5.029e+05 -6.000e+04 5.739e+03 -2.959e+03
5.000e+24 6.016e+05 6.000e+04 3.218e+03 2.959e+03
5.000e+24 7.005e+05 -6.000e+04 1.043e+03 -2.929e+03
5.000e+24 7.995e+05 6.000e+04 -1.043e+03 2.929e+03
5.000e+24 8.984e+05 -6.000e+04 -3.218e+03 -2.959e+03
5.000e+24 9.971e+05 6.000e+04 -5.739e+03 2.959e+03
5.000e+24 1.096e+06 -6.000e+04 -8.930e+03 -3.101e+03
5.000e+24 1.193e+06 6.000e+04 -1.356e+04 3.101e+03
5.000e+24 1.289e+06 -6.000e+04 -2.116e+04 -3.791e+03
5.000e+24 1.381e+06 6.000e+04 -3.818e+04 3.791e+03
5.000e+24 1.447e+06 -6.000e+04 -9.288e+04 -1.641e+04 Output_state, time = 3.00
5.000e+24 1.456e+05 7.641e+04 1.273e+05 -1.575e+03
5.000e+24 1.574e+05 -6.379e+04 5.867e+04 2.528e+04
5.000e+24 2.319e+05 6.379e+04 2.685e+04 -2.069e+04
5.000e+24 3.204e+05 -6.310e+04 1.884e+04 2.228e+04
5.000e+24 4.134e+05 6.310e+04 1.235e+04 -2.151e+04
5.000e+24 5.086e+05 -6.296e+04 8.111e+03 2.179e+04
5.000e+24 6.048e+05 6.296e+04 4.537e+03 -2.163e+04
5.000e+24 7.016e+05 -6.293e+04 1.493e+03 2.169e+04
5.000e+24 7.984e+05 6.293e+04 -1.493e+03 -2.169e+04
5.000e+24 8.952e+05 -6.296e+04 -4.537e+03 2.163e+04
5.000e+24 9.914e+05 6.296e+04 -8.111e+03 -2.179e+04
5.000e+24 1.087e+06 -6.310e+04 -1.235e+04 2.151e+04
5.000e+24 1.180e+06 6.310e+04 -1.884e+04 -2.228e+04
5.000e+24 1.268e+06 -6.379e+04 -2.685e+04 2.069e+04
5.000e+24 1.343e+06 6.379e+04 -5.867e+04 -2.528e+04
5.000e+24 1.354e+06 -7.641e+04 -1.273e+05 1.575e+03 Elapsed time = 1.094827e-02 seconds
● 简化算法
// omp_nbody_red.c,OpenMP 简化算法
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <omp.h> #define OUTPUT
//#define DEBUG
#define DIM 2
#define X 0
#define Y 1
typedef double vect_t[DIM];
const double G = 6.673e-11;
struct particle_s
{
double m;
vect_t s;
vect_t v;
}; void Usage(char* prog_name)
{
fprintf(stderr, "usage: %s <nThreads> <nParticles> <nTimestep> <sizeTimestep> <outputFrequency> <g|i>\n", prog_name);
fprintf(stderr, " 'g': inite condition by random\n 'i': inite condition from stdin\n");
exit();
} void Get_args(int argc, char* argv[], int* thread_count_p, int* n_p, int* n_steps_p, double* delta_t_p, int* output_freq_p, char* g_i_p)
{
if (argc != )
Usage(argv[]);
*thread_count_p = strtol(argv[], NULL, );
*n_p = strtol(argv[], NULL, );
*n_steps_p = strtol(argv[], NULL, );
*delta_t_p = strtod(argv[], NULL);
*output_freq_p = strtol(argv[], NULL, );
*g_i_p = argv[][];
if (*thread_count_p < || *n_p <= || *n_p % *thread_count_p || *n_steps_p < || *delta_t_p <= || *g_i_p != 'g' && *g_i_p != 'i')
{
Usage(argv[]);
exit();
}
# ifdef DEBUG
printf("Get_args, nThread%2d n %d n_steps %d delta_t %e output_freq %d g_i %c\n",
*thread_count_p, *n_p, *n_steps_p, *delta_t_p, *output_freq_p, *g_i_p);
fflush(stdout);
# endif
} void Gen_init_cond(struct particle_s curr[], int n)
{
const double mass = 5.0e24, gap = 1.0e5, speed = 3.0e4;
//srand(2);
for (int i = ; i < n; i++)
{
curr[i].m = mass;
curr[i].s[X] = i * gap;
curr[i].s[Y] = 0.0;
curr[i].v[X] = 0.0;
// vel[i][Y] = speed * (2 * rand() / (double)RAND_MAX) - 1);
curr[i].v[Y] = (i % ) ? -speed : speed;
}
} void Get_init_cond(struct particle_s curr[], int n)
{
printf("For each particle, enter (in order): mass x-coord y-coord x-velocity y-velocity\n");
for (int i = ; i < n; i++)
{
scanf_s("%lf", &curr[i].m);
scanf_s("%lf", &curr[i].s[X]);
scanf_s("%lf", &curr[i].s[Y]);
scanf_s("%lf", &curr[i].v[X]);
scanf_s("%lf", &curr[i].v[Y]);
}
} void Output_state(double time, struct particle_s curr[], int n)
{
printf("Output_state, time = %.2f\n", time);
for (int i = ; i < n; i++)
printf("%2d %10.3e %10.3e %10.3e %10.3e %10.3e\n", i, curr[i].m, curr[i].s[X], curr[i].s[Y], curr[i].v[X], curr[i].v[Y]);
printf("\n");
fflush(stdout);
} void Compute_force(int part, vect_t forces[], struct particle_s curr[], int n)
{
vect_t f_part_k;
double len, fact;
for (int k = part + ; k < n; k++)
{
f_part_k[X] = curr[part].s[X] - curr[k].s[X];
f_part_k[Y] = curr[part].s[Y] - curr[k].s[Y];
len = sqrt(f_part_k[X] * f_part_k[X] + f_part_k[Y] * f_part_k[Y]);
fact = -G * curr[part].m * curr[k].m / (len * len * len);
f_part_k[X] *= fact;
f_part_k[Y] *= fact;
forces[part][X] += f_part_k[X]; // 靠前的颗粒加上本次计算结果的相反数
forces[part][Y] += f_part_k[Y];
forces[k][X] -= f_part_k[X]; // 靠后的颗粒加上本次计算结果
forces[k][Y] -= f_part_k[Y];
# ifdef DEBUG
printf("Compute_force, k%2d> %10.3e %10.3e %10.3e %10.3e\n", k, len, fact, f_part_k[X], f_part_k[Y]);
# endif
}
} void Update_part(int part, vect_t forces[], struct particle_s curr[], int n, double delta_t)
{
const double fact = delta_t / curr[part].m;
# ifdef DEBUG
printf("Update_part before, part%2d %10.3e %10.3e %10.3e %10.3e %10.3e %10.3e\n",
part, curr[part].s[X], curr[part].s[Y], curr[part].v[X], curr[part].v[Y], forces[part][X], forces[part][Y]);
fflush(stdout);
# endif
curr[part].s[X] += delta_t * curr[part].v[X];
curr[part].s[Y] += delta_t * curr[part].v[Y];
curr[part].v[X] += fact * forces[part][X];
curr[part].v[Y] += fact * forces[part][Y];
# ifdef DEBUG
printf("Update_part after, part%2d %10.3e %10.3e %10.3e %10.3e)\n",
part, curr[part].s[X], curr[part].s[Y], curr[part].v[X], curr[part].v[Y]);
# endif
} int main(int argc, char* argv[])
{
int n, part;
int n_steps, step;
double delta_t;
int output_freq;
struct particle_s* curr; // 颗粒信息
vect_t* forces; // 引力信息
int nThread, thread, my_rank; // 线程数,用于函数参数和 omp 子句,不是全局变量,当前线程编号(循环变量)
char g_i;
double start, finish;
vect_t* loc_forces; // 每个线程局部引力 Get_args(argc, argv, &nThread, &n, &n_steps, &delta_t, &output_freq, &g_i);
curr = (particle_s*)malloc(n * sizeof(struct particle_s));
forces = (vect_t*)malloc(n * sizeof(vect_t));
loc_forces = (vect_t*)malloc(nThread*n * sizeof(vect_t));
if (g_i == 'g')
Gen_init_cond(curr, n);
else
Get_init_cond(curr, n); start = omp_get_wtime();
# ifdef OUTPUT
Output_state(, curr, n);
# endif
# pragma omp parallel num_threads(nThread) default(none) shared(curr, forces, nThread, delta_t, n, n_steps, output_freq, loc_forces) private(step, part,thread, my_rank)
{
my_rank = omp_get_thread_num();
for (step = ; step <= n_steps; step++)
{
// memset(loc_forces + my_rank*n, 0, n*sizeof(vect_t));
# pragma omp for
for (part = ; part < nThread * n; part++)
loc_forces[part][X] = loc_forces[part][Y] = 0.0;
# pragma omp for schedule(static, ) // 注意使用循环调度
for (part = ; part < n - ; part++)
Compute_force(part, loc_forces + my_rank * n, curr, n);
# pragma omp for
for (part = ; part < n; part++) // 计算引力
{
forces[part][X] = forces[part][Y] = 0.0;
for (thread = ; thread < nThread; thread++)
{
forces[part][X] += loc_forces[thread * n + part][X];
forces[part][Y] += loc_forces[thread * n + part][Y];
}
}
# pragma omp for
for (part = ; part < n; part++) // 更新位置
Update_part(part, forces, curr, n, delta_t);
# ifdef OUTPUT
if (step % output_freq == )
{
# pragma omp single
Output_state(step*delta_t, curr, n);
}
# endif
}
} finish = omp_get_wtime();
printf("Elapsed time = %e ms\n", (finish - start) * );
free(curr);
free(forces);
free(loc_forces);
return ;
}
● 输出结果,与基本算法类似。8 进程 16 体,3 秒,时间步长 1 秒,舍去 debug 输出 9.887694e-00 ms;8 进程 1024 体,3600 秒,时间步长 1 秒,舍去 output 和 debug 输出 7.857710e+03 ms。数据规模扩大后计算时间趋近基本算法的一半,说明计算花费时间较多,而数据传输花费时间较少。