The perf* routines are a higher level way to access the hw counters. The perf* routines are a layer on top of genperf. This is for the user who wants to just do:
for(each_hw_event_in_PP_EVENTS_DESC)
{
do_my_sub();
}
print_a_report_on_each_hw_event.
Acknowledgement and Disclaimer
The perf* routines basically do this. The user doesn't have
to know anything about hex event IDs, event descriptions, etc.
The user can do:
perfinit();
for(i=0; i<=PP_EVENTS_MAX; i+=2)
{
perfsel();
perfi();
do_my_sub();
perff();
}
perfrep();
perfinit Sets up the perf structures
perfrst Resets the perf structures
perfsel Select the next pair of events to monitor
perfi Zero and start the hw counters
perff Stop the counters and accumulate the change
perfrep print a report on all the values collected
Another set of routines monitors the flops on all the nodes in an application. They are about as easy to use as possible. They also monitor the flops on the 2nd processor (but you have to be using 'cop' to access the 2nd processor). These are global operations, so every node must call the routines. The interface is simple:
beginflopmon();
do_your_work();
endflopmon();
printflopmon();
The fortran interface:
call beginflopmon
call do_your_work
call endflopmon
call printflopmon
Since the routines don't take any arguments and don't return anything, I'll omit the synopsis section.
The following genperf routines take no arguments and return no value. They print the results to stdout.
Here is an example using mflops in fortran:
implicit none
include 'fgenperf.h'
integer*4 nmax,imax
real*8 mega
parameter (imax=1024*124,mega=1024.0*1024.0)
real*8 a(imax),b(imax),x,y
integer*4 i,j ,k,l,m
real*8 dclock,begtime,endtime
external dclock
integer*8 iflop
integer*8 ilng,jlng,ilng0,jlng0
integer*8 ilng2,jlng2, ilng3,jlng3
nmax=imax
x=0.0
do i=1,nmax
x=x+1.1
a(i) = x
b(i) = 1.0
end do
x=1.10
y=0.0
iflop=0
call genbeginmflops()
begtime=dclock()
330 continue
do i=1,1000,1
x = x + a(i)
end do
endtime=dclock()
if(endtime-begtime.lt.5.0)then
x = 0.0
iflop=iflop+i
goto 330
endif
350 continue
call genprintmflops()
call genendmflops()
print *,'Mflop/s=',iflop/(endtime-begtime)/1.0d6
print *,'flop=',i,iflop,x+y
print *,' '
genbeginmflops Start counting mflops
genendmflops End counting mflops
genprintmflops Print the mflops
genrebegindcachehit Restart the L1 cache hit calculation
genbegindcachehit Start the L1 cache hit estimate
genprintdcachehit Print the L1 cache hit estimate
genbeginl2hit Start the L2 cache hit estimate
genendl2hit End the L2 cache hit estimate
genprintl2hit Print the L2 cache hit estimate
genbeginmemspeed Start the memory speed estimate
genendmemspeed End the memory speed estimate
genprintmemspeed Print the memory speed estimate
genbeginbranchpred Start the branch prediction estimate
genendbranchpred End the branch prediction estimate
genprintbranchpred Print the branch prediction estimate
The following routines are lower level utility routines to use the hw counters.
Here is a fortran example which uses most of the routines:
implicit none
include 'fgenperf.h'
integer*4 nmax,imax
real*8 mega
parameter (imax=1024*124,mega=1024.0*1024.0)
real*8 a(imax),b(imax),x,y
integer*4 i,j ,k,l,m
real*8 dclock,begtime,endtime
external dclock
integer*8 ilng,jlng,ilng0,jlng0
integer*8 ilng2,jlng2, ilng3,jlng3
nmax=imax
x=0.0
do i=1,nmax
x=x+1.1
a(i) = x
b(i) = 1.0
end do
c loop over PP_EVENTS_MAX by 2.
c k is index into the PP_EVENTS_LIST table in cgenperf.h
c Should start at 0.
do k = 0,PP_EVENTS_MAX,2
x=0.00
do i=1,nmax
x=x+1.00
a(i) = x
end do
do i=1,nmax
a(i) = b(i)
end do
c monitor counter value k and k+1.
c monitor PP_EVENT_ID[k] and PP_EVENT_ID[k+1]
call gensetsimple(k)
c get initial values of counters , arg '0' is currently ignored.
call gengetperf(0,ilng2,jlng2)
do i=2,nmax
a(i) = b(i)*1.1d0
end do
c get ending values of counters
call gengetperf(0,ilng3,jlng3)
c stop the counters
call genstopperf(0)
c print a description of event k
call genprintperfbyind(k)
c print the change in counter 0
write(6,108)ilng3-ilng2
call genprintperfbyind(k+1)
c print the change in counter 1
write(6,108)jlng3-jlng2
enddo
gengetevent returns the i_th element from the PP_EVENTS_LIST
genstopperf stop the perfmon counters
genstartperf starts the perfmon counter on cpu iproc
gensetperf selects which counters to monitor and more
gengetdescbyhex returns a pointer to a description of event
gengethexbydesc returns the event id given the event description
gengetperf gets the two 64 bit counter values
genlprintperfbyhex prints a long description of 'event id' to stdout
genprintperfbyhex prints a short description of 'event id' to stdout
genprintperfbyind prints the i_th description of PP_EVENT_DESC
int gengetevent(const int which_event)
void genstopperf(const int iproc)
void genstopperf_(const int *iproc)
void genstartperf(const int iproc)
void genstartperf_(const int *iproc)
void gensetperf_(const int *iproc, const int *ievent0, const int *ievent1, const int *cmask0, const int *cmask1)
void gensetperf(const int iproc, const int ievent0, const int ievent1, const int cmask0, const int cmask1)
char * gengetdescbyhex_(const int *event_id)
char * gengetdescbyhex(const int event_id)
int gengethexbydesc_(const char *str)
int gengethexbydesc(const char *str)
void gengetperf_(const int *iproc, long long *cntr_event0, long long *cntr_event1)
void gengetperf(const int iproc, long long *cntr_event0, long long *cntr_event1)
void genlprintperfbyhex_(const int *ihex)
void genlprintperfbyhex(const int ihex)
void genprintperfbyhex_(const int *ihex)
void genprintperfbyhex(const int ihex)
void genprintperfbyind_(const int *iindex)
void genprintperfbyind(const int iindex)
void perfinit_(void)
void perfinit(void)
void perfrst_(void)
void perfrst(void)
void perfsel_(void)
void perfsel(void)
void perfi_(void)
void perfi(void)
void perff_(void)
void perff(void)
void perfrep_(void)
void perfrep(void)
long long ll_arr[PP_EVENTS_MAX+2],lla,llb;
for(i=0; i<= PP_EVENTS_MAX; i+=2)
{
gensetsimple(i);
gengetperf(0, &ll_arr[i], &ll_arr[i+1]);
do_my_sub();
gengetperf(0, &lla, &llb);
ll_arr[i] = lla - ll_arr[i];
ll_arr[i+1] = llb - ll_arr[i+1];
}
So the perf* routines hide all the accumulating of the counters.
void gensetsimple_(const int *k)
void gensetsimple(const int k)
void genbeginmflops_(void)
void genbeginmflops(void)
void genprintmflops_(void)
void genprintmflops(void)
void genendmflops_(void)
void genendmflops(void)
void genrebegindcachehit_(void)
void genrebegindcachehit(void)
void genbegindcachehit_(void)
void genbegindcachehit(void)
void genprintdcachehit_(void)
void genprintdcachehit(void)
void genenddcachehit_(void)
void genenddcachehit(void)
void genbeginl2hit(void)
void genbeginl2hit_(void)
void genrebeginl2hit(void)
void genrebeginl2hit_(void)
void genprintl2hit(void)
void genprintl2hit_(void)
void genendl2hit(void)
void genendl2hit_(void)
void genbeginmemspeed(void)
void genbeginmemspeed_(void)
void genprintmemspeed(void)
void genprintmemspeed_(void)
void genendmemspeed(void)
void genendmemspeed_(void)
void genbeginbranchpred(void)
void genbeginbranchpred_(void)
void genprintbranchpred(void)
void genprintbranchpred_(void)
void genendbranchpred(void)
void genendbranchpred_(void)