/*
 * PPU-side driver for a batched 4x4 matrix-multiply benchmark.
 *
 * The program fills two input buffers with random floats, multiplies them
 * as COUNT/16 independent 4x4 matrices either on the PPU or by handing a
 * SAMPLE_TASK to the ppu_task runtime, optionally verifies the result
 * against a PPU reference computation, and prints the average time and the
 * resulting bandwidth figure.
 */
#include "pputask.h"

/*
 * NOTE(review): the names of the system headers were missing from the
 * original include directives.  The list below is reconstructed from the
 * identifiers this file uses (printf, exit/rand, memset, getopt,
 * numa_alloc_onnode, spe_program_handle_t) plus <stdint.h> for uintptr_t.
 * Confirm against the build before merging.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include <libspe2.h>
#include <numa.h>

#include "spu/compute_task.h"

extern spe_program_handle_t spu_main;

/* Task descriptors; aligned to 128 bytes like every buffer in this file. */
compute_task_t task __attribute__ ((aligned (128)));
cache_test_task_t cache_test_task __attribute__ ((aligned (128)));

/* Number of floats in each of the four work buffers. */
#define COUNT (1024*1024*8)

/* Size in bytes of one work buffer. */
#define BUF_BYTES (sizeof(float) * COUNT)

/* Buffers must start on a 128-byte boundary. */
#define ALIGN_MASK 0x7f

/* When NUMA is defined the buffers come from libnuma instead of .bss. */
#define NUMA

#ifndef NUMA

float bufa[COUNT] __attribute__ ((aligned (128)));
float bufb[COUNT] __attribute__ ((aligned (128)));
float bufc[COUNT] __attribute__ ((aligned (128)));
float resultc[COUNT] __attribute__ ((aligned (128)));

/* Static buffers need no setup. */
void alloc_mem(void)
{
}

/* Static buffers need no teardown. */
static void free_mem(void)
{
}

#else

float *bufa;
float *bufb;
float *bufc;
float *resultc;

/* Returns non-zero when p is not aligned to a 128-byte boundary. */
static int is_misaligned(const void *p)
{
    /* uintptr_t keeps the full pointer width; unsigned int may truncate. */
    return ((uintptr_t)p & ALIGN_MASK) != 0;
}

/*
 * Allocates the four work buffers on NUMA node 0.
 *
 * Terminates the process with status 1 when libnuma is unavailable, when
 * an allocation fails, or when a returned address is not 128-byte aligned.
 */
void alloc_mem(void)
{
    /* libnuma requires numa_available() to succeed before any other call. */
    if (numa_available() < 0) {
        fprintf(stderr, "NUMA is not available on this system\n");
        exit(1);
    }

    printf("allocating memory from numa node 0\n");
    bufa = (float *)numa_alloc_onnode(BUF_BYTES, 0);
    bufb = (float *)numa_alloc_onnode(BUF_BYTES, 0);
    bufc = (float *)numa_alloc_onnode(BUF_BYTES, 0);
    resultc = (float *)numa_alloc_onnode(BUF_BYTES, 0);

    if (bufa == NULL || bufb == NULL || bufc == NULL || resultc == NULL) {
        printf("malloc failed\n");
        exit(1);
    }

    if (is_misaligned(bufa) || is_misaligned(bufb) ||
        is_misaligned(bufc) || is_misaligned(resultc)) {
        printf("address not aligned to 128 bytes\n");
        exit(1);
    }
}

/* Releases one buffer obtained from numa_alloc_onnode and clears the pointer. */
static void release_buffer(float **buf)
{
    if (*buf != NULL) {
        numa_free(*buf, BUF_BYTES);
        *buf = NULL;
    }
}

/* Releases every buffer allocated by alloc_mem. */
static void free_mem(void)
{
    release_buffer(&bufa);
    release_buffer(&bufb);
    release_buffer(&bufc);
    release_buffer(&resultc);
}

#endif

/* Prints the first 16 floats of a as a 4x4 matrix, one row per line. */
void display_matrix(float *a)
{
    int i;

    for (i = 0; i < 16; i++) {
        printf("\t%f", a[i]);
        if (i % 4 == 3) {
            printf("\n");
        }
    }
    printf("\n");
}

/*
 * Zeroes c and fills a and b (COUNT floats each) with rand()-derived values
 * in [0, 1].  Values are drawn in the order a[0], b[0], a[1], b[1], ...
 * Always returns 0.
 */
int init_value(float *a, float *b, float *c)
{
    int i;

    memset(c, 0, COUNT * sizeof(float));
    for (i = 0; i < COUNT; i++) {
        a[i] = 1.0 * rand() / RAND_MAX;
        b[i] = 1.0 * rand() / RAND_MAX;
    }
    return 0;
}

/*
 * Reference multiply on the PPU: treats a, b and c as COUNT/16 consecutive
 * row-major 4x4 matrices and stores a[m] * b[m] into c[m] for each m.
 */
void matrix_mult_ppu(float *c, float *a, float *b)
{
    int i, j, k, count;
    float *fa = a;
    float *fb = b;
    float *fc = c;

    for (count = 0; count < COUNT / 16; count++) {
        for (i = 0; i < 4; i++) {
            for (j = 0; j < 4; j++) {
                fc[i * 4 + j] = 0;
                for (k = 0; k < 4; k++) {
                    fc[i * 4 + j] += fa[i * 4 + k] * fb[k * 4 + j];
                }
            }
        }
        /* Advance to the next 4x4 matrix in each buffer. */
        fa += 16;
        fb += 16;
        fc += 16;
    }
}

/*
 * Compares COUNT floats of m1 and m2 element by element with an absolute
 * tolerance of 1.0e-4, printing every mismatch.  Prints a pass message when
 * all elements agree; otherwise prints the mismatch count and exits with
 * status 1.
 */
void matrix_compare(float *m1, float *m2)
{
    int errcount = 0;
    int i;
    float diff;

    for (i = 0; i < COUNT; i++) {
        diff = m1[i] - m2[i];
        if (diff < 0) {
            diff = -diff;
        }
        /*
         * Negated "within tolerance" test so that a NaN difference is
         * reported as a mismatch instead of silently passing.
         */
        if (!(diff <= 1.0e-4)) {
            printf("i=%d, m1=%f, m2=%f\n", i, m1[i], m2[i]);
            errcount++;
        }
    }

    if (errcount != 0) {
        printf("Verification failed(%d)\n", errcount);
        exit(1);
    }
    printf("Verification passed\n");
}

/* Runs the benchmark multiply on the PPU into bufc.  Always returns 0. */
int do_work_in_ppu(void)
{
    matrix_mult_ppu((float *)bufc, (float *)bufa, (float *)bufb);
    return 0;
}

extern double gettime(void);

/* Scratch buffer used only by the disabled cache test below. */
char databuf[128*128] __attribute__ ((aligned (128)));

/*
static void
init_databuf()
{
    int i;
    int* idata = (int*)databuf;
    for (i = 0; i < 4*1024; i++){
        idata[i] = i;
    }
    return;
}
*/

extern char *optarg;
extern int optind, opterr, optopt;

/* Number of SPEs requested on the command line (set by processoption). */
int num_spes;

/*
 * Parses the command line.  The only option is "-n <count>", the number of
 * SPUs to use (default 1, must be non-negative).
 *
 * On success stores the count in num_spes, prints it and returns 0.
 * Returns -1 on an unknown option or an invalid count; num_spes is left
 * untouched in that case.
 */
int processoption(int argc, char **argv)
{
    char *cvalue = NULL;
    int n = 1; /* default number of spus */
    int c;     /* getopt returns int, not char */

    opterr = 0;

    while ((c = getopt(argc, argv, "n:")) != -1) {
        switch (c) {
        case 'n':
            cvalue = optarg;
            /* sscanf yields EOF on empty input, so require exactly 1. */
            if (sscanf(cvalue, "%d", &n) != 1 || n < 0) {
                fprintf(stderr, "Invalid spu number(%s)\n", cvalue);
                return -1;
            }
            break;
        case '?':
            fprintf(stderr, "Unknown option `-%c'.\n", optopt);
            return -1;
        default:
            return -1;
        }
    }

    num_spes = n;
    printf("Program running on %d SPUs\n", n);
    return 0;
}

/*
 * Entry point: parses options, allocates buffers, initialises the ppu_task
 * runtime, then runs the multiply repeat_num times (on the PPU when the
 * runtime reports zero SPEs, otherwise via a SAMPLE_TASK), verifying each
 * run against the PPU reference when need_verify is set.
 *
 * Returns 0 on success and -1 when option parsing or runtime
 * initialisation fails.
 */
int main(int argc, char **argv)
{
    int totalcount = COUNT * sizeof(float);
    int n;
    int repeat_num = 1;
    int num_runs = 0;
    int need_verify = 1;
    float totaltime = 0;
    float ave_time = 0;

    /* Do not continue with an unset num_spes after a bad command line. */
    if (processoption(argc, argv) < 0) {
        fprintf(stderr, "usage: %s [-n num_spes]\n", argv[0]);
        return -1;
    }

    alloc_mem();

    ppu_task_physid_as_rank_reset();
    ppu_task_spe_num_set(num_spes);
    ppu_task_debug_level_set(1);
    if (ppu_task_init(0, NULL, spu_main) < 0) {
        printf("Init failed\n");
        free_mem();
        return -1;
    }

    /*
    init_databuf();
    cache_test_task.common.cmd = CACHE_TEST;
    cache_test_task.common.size= sizeof(cache_test_task);
    cache_test_task.start_addr = databuf;
    ppu_task_run((task_t*)&cache_test_task);
    */

    n = ppu_task_spe_num_get();
    init_value(bufa, bufb, bufc);

    do {
        if (need_verify) {
            /* Build the reference result before the timed run. */
            memset(bufc, 0, COUNT * sizeof(float));
            memset(resultc, 0, COUNT * sizeof(float));
            matrix_mult_ppu((float *)resultc, (float *)bufa, (float *)bufb);
        }

        /*
         * NOTE(review): gettime() is defined elsewhere; the call pattern
         * here (one discarded call, then one accumulated) suggests it
         * returns the time elapsed since the previous call -- confirm.
         */
        gettime();
        if (n == 0) {
            do_work_in_ppu();
            totaltime += gettime();
        } else {
            task.common.cmd = SAMPLE_TASK;
            task.common.size = sizeof(task);
            task.dest = (char *)bufc;
            task.srca = (char *)bufa;
            task.srcb = (char *)bufb;
            task.count = totalcount; /* size of one buffer in bytes */
            ppu_task_run((task_t *)&task);
            totaltime += gettime();
        }

        if (need_verify) {
            matrix_compare((float *)resultc, (float *)bufc);
        }
    } while (++num_runs < repeat_num);

    ave_time = totaltime / num_runs;
    printf("ave_time =%f\n", ave_time);
    /* Three buffers (two sources, one destination) are touched per run. */
    printf("Bandwidth=%f GB/s\n", COUNT*sizeof(float)*3/ave_time/1000000000);

    free_mem();
    printf("program exits\n");
    return 0;
}