20 integer,
parameter:: maxproc=100, maxdepth=20
21 integer,
save:: ntimer=0
28 integer(8):: calls(maxproc)=0
29 real(8),
dimension(maxproc):: start=0d0, total=0d0
30 integer,
dimension(maxdepth):: stack
32 real(8):: bytes_recv=0.0_8
33 real(8):: sec_per_report=10d0
34 integer(8):: n_master(4)=0
35 integer(8):: n_update=0
36 integer(8):: n_recv=0_8
37 integer(8):: mpi_test=0_8, mpi_hit=0_8
38 integer(8):: mem_test=0_8, mem_hit=0_8
39 real(8):: busy_time=0d0, spin_time=0d0
40 integer:: nq_send_max=0
41 integer:: nq_recv_max=0
43 integer:: n_mhd=0, n_solve=0
44 type(latency_t):: latency
45 real(8),
allocatable:: levelcost(:)
46 integer(8),
allocatable:: levelpop(:)
47 integer:: levelmin, levelmax
48 real:: dead_mans_hand=0.0
49 logical:: detailed=.false.
55 procedure,
nopass:: tic
56 procedure,
nopass:: toc_1
57 procedure,
nopass:: toc_i4
58 procedure,
nopass:: toc_i8
59 procedure,
nopass:: toc_r
60 generic:: toc => toc_1, toc_i4, toc_i8, toc_r
62 character(len=64),
dimension(maxproc):: names
63 type(timer_t),
pointer:: timers(:)
64 type(timer_t),
public:: timer
65 integer,
save:: nprint=20
66 integer,
save:: verbose=0
67 real(8),
save:: next_report_sec=0d0
68 real(8),
save:: prev2=0d0
69 real,
save:: min_fraction=0.0005
71 module procedure toc_i4, toc_i8, toc_1
73 real(8),
save:: wt=0d0
80 SUBROUTINE init (self)
82 logical,
save:: first_time=.true., detailed=.false.
84 real(8):: sec_per_report=10.
85 namelist /timer_params/ sec_per_report, min_fraction, verbose, detailed
91 read (io_unit%input, timer_params, iostat=ignore)
92 if (io_unit%master)
write (*, timer_params)
93 self%sec_per_report = sec_per_report
94 next_report_sec = wallclock() + sec_per_report
95 omp%nthreads = max(1,omp%nthreads)
96 timer%detailed = detailed
97 allocate (timers(0:omp%nthreads-1))
101 do i=0,omp%nthreads-1
103 timers(i)%start = 0d0
104 timers(i)%total = 0d0
112 SUBROUTINE begin (self, name, itimer)
113 class(timer_t):: self
114 character(len=*):: name
117 integer:: thread, i, otimer, istack
121 if (io_unit%do_validate)
return 122 thread = omp_get_thread_num()
128 ok = ok .and. trim(names(i)) /= trim(name)
132 if (ntimer > maxproc)
call mpi%abort (
'timer_mod: increase maxproc')
136 write (stdout,*)
'timer_t%begin: WARNING, duplicate name: '//trim(name)
143 if (itimer <= 0)
return 148 istack = timers(thread)%istack
151 otimer = timers(thread)%stack(istack)
152 timers(thread)%total(otimer) = timers(thread)%total(otimer) &
153 + (wc - timers(thread)%start(otimer))
158 timers(thread)%start(itimer) = wc
160 if (istack>maxdepth)
call mpi%abort (
'too many timer levels;'//trim(name))
161 timers(thread)%stack(istack) = itimer
162 timers(thread)%istack = istack
163 if (verbose > 0)
then 164 print
'(a,2i5,3x,a)',
'timer%begin: istack, itimer, name =', istack, itimer, trim(name)
172 SUBROUTINE end (self, itimer)
173 class(timer_t):: self
174 integer,
optional:: itimer
175 integer::thread, istack, otimer
178 if (io_unit%do_validate)
return 182 thread = omp_get_thread_num()
183 istack = timers(thread)%istack
184 if (istack <= 0)
then 185 print *,
'WARNING, timer%end out-of-sync: thread, istack =', omp%thread, istack
186 timers(thread)%istack = 0
189 otimer = timers(thread)%stack(istack)
190 if (verbose > 0)
then 191 print
'(a,2i5,3x,a)',
'timer%end: istack, otimer, name =', istack, otimer, trim(names(otimer))
197 if (
present(itimer))
then 198 if (itimer /= otimer)
then 199 write (stdout,
'(i6,2(3x,a,i4))') &
200 omp%thread,
'WARNING: itimer =',itimer, &
201 ' not equal to expected value =', otimer
204 timers(thread)%calls(otimer) = timers(thread)%calls(otimer) + 1
205 timers(thread)%total(otimer) = timers(thread)%total(otimer) + &
206 wc - timers(thread)%start(otimer)
212 otimer = timers(thread)%stack(istack)
213 timers(thread)%start(otimer) = wc
214 else if (istack<0)
then 215 print *,
'WARNNING: too many timer%end calls' 218 timers(thread)%istack = istack
222 if (wc > next_report_sec)
then 224 if (wc > next_report_sec)
then 225 call real_print (self)
226 next_report_sec = (nint(wc/self%sec_per_report)+1)*self%sec_per_report
236 SUBROUTINE print (self)
237 class(timer_t):: self
238 if (omp%master)
call real_print (self)
240 SUBROUTINE real_print (self)
241 class(timer_t):: self
242 real(8):: total, proc(maxproc), ncalls(maxproc)
243 real(8):: updates, musppt, calls
244 real(8),
save:: n_previous=0
246 integer:: i, n_recv, lntimer
248 if (io_unit%do_validate)
return 259 proc(i) = sum(timers(:)%total(i))
260 ncalls(i) = sum(timers(:)%calls(i))
261 total = total + proc(i)
262 calls = calls + ncalls(i)
271 updates = max(1.0,
real(timer%n_update-n_previous))
272 musppt = sec*1d6*min(omp%nthreads,omp%ncores)/updates
273 n_recv = max(timer%n_recv,1)
275 call write_to (io_unit%output)
276 call write_to (io_unit%mpi)
279 timer%bytes_recv = 0.0_8
281 timer%nq_send_max = 0
282 timer%nq_recv_max = 0
283 n_previous = timer%n_update
285 timer%latency%max = 0.0
286 timer%latency%aver = 0d0
291 timers(:)%total(i) = 0d0
292 timers(:)%calls(i) = 0
296 subroutine write_to (unit)
297 integer:: unit, level, popu
298 real:: cost, per_call
300 write (unit,
'(23x,a,7x,a,10x,a,7x,a,5x,a)')
'procedure',
'calls',
'time',
'percent',
' s/call' 301 if (unit==io_unit%output)
then 305 if (total > 0.0)
then 308 if (proc(i)/total > min_fraction)
then 309 per_call = proc(i)/max(ncalls(i),1d0)
310 warn = per_call < 0.5e-6
311 mesg = mesg .or. warn
312 write (unit,
'(a32,1p,2g15.3,0p,f10.1,0p,f12.6,1x,a1)') &
313 trim(names(i)), ncalls(i), proc(i), proc(i)/total*100., &
314 per_call, merge(
'W',
' ',warn)
318 write (unit,*)
'W: much of the item time may be due to timer calls' 323 if (
allocated(timer%levelcost) .and. timer%levelmax > timer%levelmin)
then 324 do level=timer%levelmin,timer%levelmax
326 cost = timer%levelcost(level)
328 popu = timer%levelpop(level)
329 write(unit,
'(22x,a,i3,g12.3,i7)') &
330 'level, cost, population =', level, cost, popu
332 do level=timer%levelmin,timer%levelmax
334 timer%levelcost(level) = 0d0
340 write (unit,
'(a32,1p,2g15.3,g14.4,2(g12.3,2x,a,5x))') &
341 "TOTAL thread time, calls", calls, total, 100., musppt,
'core-mus/cell-upd', wc,
'wall sec' 343 timer%latency%aver = timer%latency%aver/max(1,timer%latency%n)
344 write (unit,
'(1x,"MPI recv:",f10.1," MB/s",f11.3," MB/mesg",i8," nq_send_max",i8, & 345 " nq_recv_max",2f8.3," max, aver latency",f7.2, " f_unpk",f7.2," f_mem",f8.3," f_q")') &
346 timer%bytes_recv/1024.**2/sec, timer%bytes_recv/1024.**2/n_recv, &
347 timer%nq_send_max, timer%nq_recv_max, timer%latency%max, timer%latency%aver, &
348 timer%mpi_hit/
real(max(1,timer%mpi_test)), &
349 timer%mem_hit/
real(max(1,timer%mem_test)), &
350 timer%busy_time/max(timer%spin_time + timer%busy_time,1d-30)
353 timer%busy_time = 0d0
354 timer%spin_time = 0d0
356 call omp_lock%info (unit)
358 self%n_mhd = 0; self%n_solve=0
359 end subroutine write_to
360 END SUBROUTINE real_print
364 SUBROUTINE tic (time)
366 real(8),
optional:: time
368 if (
present(time))
then 377 SUBROUTINE toc_1 (label, time)
379 character(len=*):: label
380 real(8),
optional:: time
383 call toc_r (label, 1.0, time)
388 SUBROUTINE toc_i4 (label, n, time)
390 character(len=*):: label
391 real(8),
optional:: time
394 call toc_r (label,
real(n), time)
395 END SUBROUTINE toc_i4
399 SUBROUTINE toc_i8 (label, n, time)
401 character(len=*):: label
402 real(8),
optional:: time
405 call toc_r (label,
real(n), time)
406 END SUBROUTINE toc_i8
409 FUNCTION str_f4 (t)
RESULT (out)
411 character(len=8):: out
413 write (out,
'(f8.3)') t
419 FUNCTION time_str (t)
RESULT (out)
421 character(len=8):: out
422 real,
parameter:: ns=1e-9, mu=1e-6, ms=1e-3, s=1e0, mi=6e1, hr=36e2, dy=24*hr
423 real,
parameter:: wk=7*dy, yr=365.*dy, mo=yr/12.
426 out=trim(str_f4(t/ns))//
' ns' 428 else if (t/ms < 1d0)
then 429 out=trim(str_f4(t/mu))//
' mus' 430 else if ( t < 1d0)
then 431 out=trim(str_f4(t/ms))//
' ms' 432 else if ( t < 100)
then 433 out=trim(str_f4( t))//
' s ' 434 else if (t/mi < 100)
then 435 out=trim(str_f4(t/mi))//
' mn' 436 else if (t/dy < 1d0)
then 437 out=trim(str_f4(t/hr))//
' hr' 438 else if (t/wk < 1d0)
then 439 out=trim(str_f4(t/dy))//
' dy' 440 else if (t/mo < 1d0)
then 441 out=trim(str_f4(t/wk))//
' wk' 442 else if (t/yr < 1d0)
then 443 out=trim(str_f4(t/mo))//
' mo' 445 out=trim(str_f4(t/yr))//
' yr' 453 SUBROUTINE toc_r (label, n, time)
455 character(len=*):: label
457 real(8),
optional:: time
459 integer:: n_cores, n_threads
461 if (io_unit%do_validate)
return 463 if (
present(time))
then 470 n_cores = max(omp%ncores,1)
471 n_threads = max(omp%nthreads,1)
472 if (n_cores > n_threads) n_cores = min(n_cores,n_threads)
474 1
format (1x, a,
': ', a,
', ', a, a, 1p, e10.2, 1x, a, 3(i4, 1x, a))
475 2
format (1x, a,
': ', a,
', ', a, a, 1p, e10.2, 1x, a, 4(i4, 1x, a))
476 if (n_cores == n_threads)
then 477 write (io_unit%log,1) label, trim(time_str(dtime)), &
478 trim(time_str(dtime*n_cores/max(n,1.))),
'/upd,', n,
'updates/process,', &
479 mpi%size, merge(
'processes,',
'process, ',mpi%size>1), &
480 n_cores, merge(
'cores/process',
'core/process ',n_cores>1)
482 write (io_unit%log,1) label, trim(time_str(dtime)), &
483 trim(time_str(dtime*n_cores/max(n,1.))),
'/upd,', n,
'updates/process,', &
484 mpi%size, merge(
'processes,',
'process, ',mpi%size>1), &
485 n_cores, merge(
'cores/process',
'core/process ',n_cores>1), &
486 n_threads, merge(
'threads/process',
'thread/process ',n_threads>1)
489 if (mpi%rank==0.and.omp%thread==0)
then 491 if (n_cores == n_threads)
then 492 print 1, label, trim(time_str(dtime)), &
493 trim(time_str(dtime*n_cores/max(n,1.))),
'/upd,', n,
'updates/process,', &
494 mpi%size, merge(
'processes,',
'process, ',mpi%size>1), &
495 n_cores, merge(
'cores/process',
'core/process ',n_cores>1)
497 print 1, label, trim(time_str(dtime)), &
498 trim(time_str(dtime*n_cores/max(n,1.))),
'/upd,', n,
'updates/process,', &
499 mpi%size, merge(
'processes,',
'process, ',mpi%size>1), &
500 n_cores, merge(
'cores/process',
'core/process ',n_cores>1), &
501 n_threads, merge(
'threads/process',
'thread/process ',n_threads>1)
504 print 1, label, trim(time_str(dtime))
508 if (
present(time))
then Each thread uses a private timer data type, with arrays for start time and total time for each regist...
Support tic/toc timing, as in MATLAB, and accurate wallclock() function. The timing is generally much...
The lock module uses nested locks, to allow versatile use of locks, where a procedure may want to mak...