当前位置:文档之家› 程序性能调优

程序性能调优

"" /* Second member email addr (leave blank if none) */
};
/*
* naive_rotate - The naive baseline version of rotate
*/
char naive_rotate_descr[] = "naive_rotate: Naive baseline implementation";
* register_rotate_functions - Register all of your different versions
* of the rotate kernel with the driver by calling the
* add_rotate_function() for each test function. When you run the
dst_op=dst_op_cpy;
for(j=0;j<dim;j++)
{
*dst_op=*src_op;
dst_op++;src_op+=dim;
*dst_op=*src_op;
dst_op++;src_op+=dim;
*dst_op=*src_op;
dst_op++;src_op+=dim;
*dst_op=*src_op;
dst[RIDX(dim-1, j, dim)] = avg(dim, dim-1, j, src);
}
for (i = 0; i < dim; i++)
{
dst[RIDX(i, 0, dim)] = avg(dim, i, 0, src);
dst[RIDX(i, dim-1, dim)] = avg(dim, i, dim-1, src);
实验题目:程序性能调优
实验要求:本次实验要求每个人针对每个函数至少写出3种优化版本,并根据driver报告的结果进行性能分析。
实验目的:理解编译器,学习程序优化,从优化程序代码和程序执行速度两方面着手。
实验环境:Win7 64位、Ubuntu、VMware Workstation。
实验内容及操作步骤:
}
for (i = 1; i < dim-1; i++)
dst++;src+=dim;
*dst=*src;
dst++;src+=dim;
*dst=*src;
dst++;src+=dim;
*dst=*src;
src++;
src-=tmp2;
dst-=tmp5;
}
src+=tmp2;
dst+=tmp4;
}
}
/*********************************************************************
}
2)分析
这段代码多次调用avg函数,而avg函数内部又频繁调用initialize_pixel_sum、accumulate_sum、assign_sum_to_pixel这几个函数,且含有2层for循环。虽然会以损害程序的模块性为代价,但消除函数调用的时间开销后,得到的代码运行速度会快得多。所以,需要改写代码,不再调用avg函数。Smooth函数的处理分为以下3部分:
将下载下来的kernels.c中的rotate、smooth函数进行优化。
本实验的实验原理是通过循环展开、cache友好、替换变量等手段来实现程序优化。
实验过程及分析:
由于优化代码较长,就不进行截图。
1. Naive_rotate
1)原代码
char naive_rotate_descr[] = "naive_rotate: Naive baseline implementation";
char rotate_descr_v1[] = "rotate_v1: version1 break into 4*4 blocks";
void rotate_v1(int dim, pixel *src, pixel *dst)
{
int i, j,ii,jj;
for(ii=0; ii < dim; ii+=4)
for(i=0; i< dim; i+=32)
{
for(j=0;j<dim;j++)
{
*dst=*src;
dst++;src+=dim;
*dst=*src;
dst++;src+=dim;
*dst=*src;
dst++;src+=dim;
*dst=*src;
dst++;src+=dim;
*dst=*src;
/*
 * naive_rotate - The naive baseline version of rotate.
 *
 * dim: number of rows and of columns indexed in both images.
 * src: source image; element (i, j) is read as src[RIDX(i, j, dim)].
 * dst: destination image; the pixel read from (i, j) is stored at
 *      (dim-1-j, i), i.e. dst[RIDX(dim-1-j, i, dim)].
 *
 * NOTE(review): assumes RIDX(i, j, n) expands to ((i)*(n)+(j)) as defined
 * in defs.h, and that src and dst each hold at least dim*dim pixels --
 * confirm against the driver.
 *
 * A stray copy of the assignment used to follow the loops. It ran once
 * with i == dim and j == dim, reading src[RIDX(dim, dim, dim)] past the
 * end of the image and overwriting dst[0]. It has been removed so that
 * every access stays inside the dim x dim images.
 */
void naive_rotate(int dim, pixel *src, pixel *dst)
{
    int i, j;

    for (i = 0; i < dim; i++) {
        for (j = 0; j < dim; j++) {
            dst[RIDX(dim-1-j, i, dim)] = src[RIDX(i, j, dim)];
        }
    }
}
char rotate_descr_v3[] = "rotate_v3: version3 break into 4*1 blocks with 4 parallel paths";
void rotate_v3(int dim, pixel *src, pixel *dst)
{
int i;
int j;
int tmp=(dim-1)*dim;
pixel *src_op;
pixel *dst_op;
for(i=0; i< dim; i+=4)
{
pixel *src_op_cpy=src+i*dim;
pixel *dst_op_cpy=dst+tmp+i;
src_op=src_op_cpy;
}
2)分析:这段代码的作用就是将所有的像素进行行列调位,使整幅图像旋转90度。从defs.h中可以找到#define RIDX(i,j,n) ((i)*(n)+(j))。这段代码本来很短,但是从cache友好性来分析,这个代码的效率会很低,所以按照cache的大小,应在存储的时候每次依次存储32个像素(列存储)。这样做到cache友好,就可以大幅度提高效率。
}
/*
* rotate - Your current working version of rotate
* IMPORTANT: This is the version you will be graded on
*/
char rotate_descr[] = "rotate: Current working version,using pointer rather than computing address";
* driver program, it will test and report the performance of each
* registered test function.
*********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include "defs.h"
team_t team = {
"201308060228", /*队名*/
"201308060228", /*序号*/
"747660816@", /*邮箱*/
"", /* Second member full name (leave blank if none) */
void rotate(int dim, pixel *src, pixel *dst)
{
int i;
int j;
int tmp1=dim*dim;
int tmp2=dim *31;
int tmp3=tmp1-dim;
int tmp4=tmp1+32;
int tmp5=dim+31;
dst+=tmp3;
void smooth_v1(int dim, pixel *src, pixel *dst)
{
int i, j, ii, jj;
pixel_sum sum;
pixel current_pixel, cp;
for (j = 0; j < dim; j++)
{
dst[RIDX(0, j, dim)] = avg(dim, 0, j, src);
一.主体内部,由9点求平均值;
二.4个角,由4点求平均值;
三.4条边界,由6点求平均值。
从图片的顶部开始处理,再处理上边界,然后按顺序向下处理;其中在处理左边界时,用for循环处理一行主体部分。
3)优化代码
char smooth_descr_v1[] = "smooth_v1: with less func call and grossly simplified calculation for central parts";
相关主题