当前位置:文档之家› omp并行计算课程报告

omp并行计算课程报告

并行计算与多核多线程技术课程报告
专业班级学号姓名成绩
2015年9月21日
4.1 基于OpenMP的并行算法实现
4.1.1 主要功能模块与实现方法
#pragma omp parallel num_threads(2){for (xxj_g = 0; xxj_g <xxj_M/2; xxj_g++){for (xxj_d = 0; xxj_d< xxj_P; xxj_d++){xxj_C[xxj_g][xxj_d] = 0;for (xxj_k = 0; xxj_k < xxj_N; ++xxj_k){xxj_C[xxj_g][xxj_d] += xxj_A[xxj_g][xxj_k] *xxj_B[xxj_k][xxj_d];}}}for (xxj_g = xxj_M/2; xxj_g <xxj_M; xxj_g++){for (xxj_d = 0; xxj_d < xxj_P; xxj_d++){xxj_C[xxj_g][xxj_d] = 0;for (xxj_k = 0; xxj_k < xxj_N; ++xxj_k){ // cout << "2num=" << omp_get_thread_num() << endl;xxj_C[xxj_g][xxj_d] += xxj_A[xxj_g][xxj_k] * xxj_B[xxj_k][xxj_d];}}}}
(1)将两个矩阵的乘法进行并行运算,打开2个线程:#pragma omp parallel num_threads(2)。注意:输入和输出部分的for循环不能放入并行区域,否则会出现输入输出的个数出错的问题。

(2)将得到最终结果的xxj_C[i][j]分成两个子矩阵进行运算:第一个子矩阵为xxj_A[i][j]的前M/2行乘以xxj_B[i][j]的P列的结果,第二个子矩阵为xxj_A[i][j]的其余M-M/2行乘以xxj_B[i][j]的P列的结果。

参照顺序型的程序:
for (xxj_i = 0; xxj_i < xxj_M; xxj_i++){for (xxj_j = 0; xxj_j < xxj_P; xxj_j++){xxj_C[xxj_i][xxj_j] = 0;for (xxj_k = 0; xxj_k < xxj_N; ++xxj_k){xxj_C[xxj_i][xxj_j] += xxj_A[xxj_i][xxj_k] * xxj_B[xxj_k][xxj_j];}}}
4.1.2 实验加速比分析
取三组数据进行比较(加速比 = 顺序时间 / 并行时间):
          第一次   第二次   第三次
顺序时间  124748   27512    31403
并行时间  72290    18597    18541
加速比    1.7256   1.4793   1.6937
5. 设计体会
通过这次实验,我了解了并行计算的运行与设计方式,也体会到了它的方便和快捷。对于一个程序来说,节省时间无疑是最大的优势,尤其是在面对大量数据的时候。在测试时,我需要想办法找出成百上千的数据进行测试,因此并行计算虽然优点很多,但也具有一定的挑战性。不过,并行计算的地位始终是非常重要的。

6. 附录6.1 基于OMP的并行程序设计6.1.1 代码及注释并行代码:#include "stdafx.h"#include <windows.h>#include<stdio.h>#include<time.h>#include<omp.h>#include<iostream>//使用C++语句using namespace std;int _tmain(int argc, _TCHAR* argv[]){clock_t xxj_t1 = clock();//获取当前时间int xxj_M, xxj_N, xxj_P, xxj_i, xxj_j, xxj_k,xxj_a,xxj_b,xxj_g,xxj_d,xxj_e,xxj_f;cin >> xxj_M >>xxj_ N >> xxj_P;int xxj_A[100][100];int xxj_B[100][100];int xxj_C[100][100];for (xxj_i = 0; xxj_i < xxj_M;xxj_ i++){for (xxj_j = 0; xxj_j < xxj_N; xxj_j++){cin >> xxj_A[xxj_i][xxj_j];}}for (xxj_a = 0; xxj_a < N; xxj_a++){for (xxj_b = 0; xxj_b < xxj_P; xxj_b++){cin >> xxj_B[xxj_a][xxj_b];}}#pragma omp parallel num_threads(2){for (xxj_g = 0; xxj_g < xxj_M/2; xxj_g++){for (xxj_d = 0; xxj_d < xxj_P;xxj_ d++){xxj_C[xxj_g][xxj_d] = 0;for (xxj_k = 0; xxj_k < xxj_N; ++xxj_k){xxj_C[xxj_g][xxj_d] += xxj_A[xxj_g][xxj_k] * xxj_B[xxj_k][xxj_d];}}}for (xxj_g = xxj_M/2; xxj_g < xxj_M; xxj_g++){for (xxj_d = 0; xxj_d < xxj_P; xxj_d++){xxj_C[xxj_g][xxj_d] = 0;for (xxj_k = 0; xxj_k < xxj_N; ++xxj_k){// cout << "xxj_num=" << omp_get_thread_num() << endl;xxj_C[xxj_g][xxj_d] += xxj_A[xxj_g][xxj_k] * xxj_B[xxj_k][xxj_d];}}}}for (xxj_e = 0; xxj_e < xxj_M; xxj_e++){cout << endl;for (xxj_f = 0; xxj_f < xxj_P; xxj_f++){cout << xxj_C[xxj_e][xxj_f] << " ";}}clock_t xxj_t2 = clock();int xxj_t;xxj_t = xxj_t2 - xxj_t1;cout << endl;cout << "t=" << xxj_ct << "s" << endl;system("pause");return 0;}顺序代码:#include "stdafx.h"#include <windows.h>#include<stdio.h>#include<time.h>#include<omp.h>#include<iostream>//使用C++语句using namespace std;int _tmain(int argc, _TCHAR* argv[]){clock_t xxj_t1 = clock();//获取当前时间int xxj_M, xxj_N, xxj_P, xxj_i, xxj_j, xxj_k,xxj_a,xxj_b,xxj_g,xxj_d,xxj_e,xxj_f;cin >> xxj_M >>xxj_ N >> xxj_P;int xxj_A[100][100];int xxj_B[100][100];int xxj_C[100][100];for (xxj_i = 0; xxj_i < xxj_M;xxj_ i++){for (xxj_j = 0; xxj_j < xxj_N; xxj_j++){cin >> xxj_A[xxj_i][xxj_j];}}for (xxj_a = 0; xxj_a < N; xxj_a++){for (xxj_b = 0; xxj_b < xxj_P; xxj_b++){cin >> 
xxj_B[xxj_a][xxj_b];}}for (xxj_g = 0; xxj_g < xxj_M; xxj_g++){for (xxj_d = 0; xxj_d < xxj_P;xxj_ d++){xxj_C[xxj_g][xxj_d] = 0;for (xxj_k = 0; xxj_k < xxj_N; ++xxj_k){xxj_C[xxj_g][xxj_d] += xxj_A[xxj_g][xxj_k] * xxj_B[xxj_k][xxj_d];}}}for (xxj_e = 0; xxj_e < xxj_M; xxj_e++){cout << endl;for (xxj_f = 0; xxj_f < xxj_P; xxj_f++){cout << xxj_C[xxj_e][xxj_f] << " ";}}clock_t xxj_t2 = clock();int xxj_t;xxj_t = xxj_t2 - xxj_t1;cout << endl;cout << "xxj_st=" << xxj_t << "s" << endl;system("pause");return 0;}
6.1.2 执行结果截图
为了方便,已经将顺序和并行部分合为一个代码。
(1)小数据量验证正确性的执行结果
首先利用小数据判断程序是否正确,截图为:
(2)大数据量获得较好加速比的执行结果
顺序的时间为xxj_st,并行的时间为xxj_ct,加速比为xxj_bi。测试数据为50*50的两个矩阵,数据为随机生成的,生成随机数据的代码为:
#include<iostream>#include<stdio.h>#include<time.h>#include<stdlib.h>#include <fstream>using namespace std;int main(){int xxj_nl=0;int xxj_nj=1000;int xxj_nCont=0;srand(time(NULL));ofstream xxj_ofs("xxj_d2.txt");while(1){xxj_nl=rand()%xxj_nj;xxj_ofs<<xxj_nl<<" ";xxj_nCont++;if(xxj_nCont==5000){break;}}xxj_ofs.close();}
截图为:
加速比为1.5389
6.1.3 遇到的问题及解决方案
(1)问题一:将输入输出的for循环放入了并行语句中
错误代码及后果:
#pragma omp parallel num_threads(2){for (xxj_i = 0; xxj_i < xxj_M;xxj_ i++){for (xxj_j = 0; xxj_j < xxj_N; xxj_j++){cin >> xxj_A[xxj_i][xxj_j];}}for (xxj_a = 0; xxj_a < N; xxj_a++){for (xxj_b = 0; xxj_b < xxj_P; xxj_b++){cin >> xxj_B[xxj_a][xxj_b];}}for (xxj_g = 0; xxj_g < xxj_M/2; xxj_g++){for (xxj_d = 0; xxj_d < xxj_P;xxj_ d++){xxj_C[xxj_g][xxj_d] = 0;for (xxj_k = 0; xxj_k < xxj_N; ++xxj_k){xxj_C[xxj_g][xxj_d] += xxj_A[xxj_g][xxj_k] * xxj_B[xxj_k][xxj_d];}}}for (xxj_g = xxj_M/2; xxj_g < xxj_M; xxj_g++){for (xxj_d = 0; xxj_d < xxj_P; xxj_d++){xxj_C[xxj_g][xxj_d] = 0;for (xxj_k = 0; xxj_k < xxj_N; ++xxj_k){// cout << "xxj_num=" << omp_get_thread_num() << endl;xxj_C[xxj_g][xxj_d] += xxj_A[xxj_g][xxj_k] * xxj_B[xxj_k][xxj_d];}}}for (xxj_e = 0; xxj_e < xxj_M; xxj_e++){cout << endl;for (xxj_f = 0; xxj_f < xxj_P; xxj_f++){cout << xxj_C[xxj_e][xxj_f] << " ";}}}
正确代码:
for (xxj_i = 0; xxj_i < xxj_M;xxj_ i++){for (xxj_j = 0; xxj_j < xxj_N; xxj_j++){cin 
>> xxj_A[xxj_i][xxj_j];}}for (xxj_a = 0; xxj_a < N; xxj_a++){for (xxj_b = 0; xxj_b < xxj_P; xxj_b++){cin >> xxj_B[xxj_a][xxj_b];}}#pragma omp parallel num_threads(2){for (xxj_g = 0; xxj_g < xxj_M/2; xxj_g++){for (xxj_d = 0; xxj_d < xxj_P;xxj_ d++){xxj_C[xxj_g][xxj_d] = 0;for (xxj_k = 0; xxj_k < xxj_N; ++xxj_k){xxj_C[xxj_g][xxj_d] += xxj_A[xxj_g][xxj_k] * xxj_B[xxj_k][xxj_d];}}}for (xxj_g = xxj_M/2; xxj_g < xxj_M; xxj_g++){for (xxj_d = 0; xxj_d < xxj_P; xxj_d++){xxj_C[xxj_g][xxj_d] = 0;for (xxj_k = 0; xxj_k < xxj_N; ++xxj_k){// cout << "xxj_num=" << omp_get_thread_num() << endl;xxj_C[xxj_g][xxj_d] += xxj_A[xxj_g][xxj_k] * xxj_B[xxj_k][xxj_d];}}}班级____________ 学号_________________ 姓名______________ 算法名称______________}for (xxj_e = 0; xxj_e < xxj_M; xxj_e++){cout << endl;for (xxj_f = 0; xxj_f < xxj_P; xxj_f++){cout << xxj_C[xxj_e][xxj_f] << " ";}}分析因为并行计算具有同时性,放到输入和输出上用的话会导致输入输出的矩阵含有的数字和个数发生混乱。

相关主题