DISPATCH
timer_mod.f90
1 !===============================================================================
2 !> Each thread uses a private timer data type, with arrays for start time and
3 !> total time for each registered procedure. At begin, it stores the start time,
4 !> and at end it increments the call counter and sums the time used into the
5 !> total counter.
6 !>
7 !> We suspend counting of the active procedure if a new begin occurs, and resume
8 !> it again on the next end call. Each subsequent begin call suspends the
9 !> ongoing call, so we need a list of active counters.
10 !===============================================================================
11 MODULE timer_mod
12  USE mpi_mod
13  USE io_unit_mod
14  USE omp_mod
15  USE omp_timer_mod
16  USE omp_lock_mod
17  USE omp_lib
18  implicit none
19  private
20  integer, parameter:: maxproc=100, maxdepth=20
21  integer, save:: ntimer=0
22  type:: latency_t
23  real:: max=0.0
24  real(8):: aver=0d0
25  integer:: n=0
26  end type
27  type:: timer_t
28  integer(8):: calls(maxproc)=0
29  real(8), dimension(maxproc):: start=0d0, total=0d0
30  integer, dimension(maxdepth):: stack
31  integer:: istack=0
32  real(8):: bytes_recv=0.0_8
33  real(8):: sec_per_report=10d0
34  integer(8):: n_master(4)=0
35  integer(8):: n_update=0
36  integer(8):: n_recv=0_8
37  integer(8):: mpi_test=0_8, mpi_hit=0_8
38  integer(8):: mem_test=0_8, mem_hit=0_8
39  real(8):: busy_time=0d0, spin_time=0d0
40  integer:: nq_send_max=0
41  integer:: nq_recv_max=0
42  integer:: n_lines=0
43  integer:: n_mhd=0, n_solve=0
44  type(latency_t):: latency
45  real(8), allocatable:: levelcost(:)
46  integer(8), allocatable:: levelpop(:)
47  integer:: levelmin, levelmax
48  real:: dead_mans_hand=0.0
49  logical:: detailed=.false.
50  contains
51  procedure:: init
52  procedure:: begin
53  procedure:: end
54  procedure:: print
55  procedure, nopass:: tic
56  procedure, nopass:: toc_1
57  procedure, nopass:: toc_i4
58  procedure, nopass:: toc_i8
59  procedure, nopass:: toc_r
60  generic:: toc => toc_1, toc_i4, toc_i8, toc_r
61  end type
62  character(len=64), dimension(maxproc):: names
63  type(timer_t), pointer:: timers(:)
64  type(timer_t), public:: timer
65  integer, save:: nprint=20
66  integer, save:: verbose=0
67  real(8), save:: next_report_sec=0d0
68  real(8), save:: prev2=0d0
69  real, save:: min_fraction=0.0005
70  interface toc
71  module procedure toc_i4, toc_i8, toc_1
72  end interface
73  real(8), save:: wt=0d0
74  !$omp threadprivate (wt)
75  public tic, toc
76 CONTAINS
77 
78 !===============================================================================
79 !===============================================================================
80 SUBROUTINE init (self)
81  class(timer_t):: self
82  logical, save:: first_time=.true., detailed=.false.
83  integer:: ignore, i
84  real(8):: sec_per_report=10.
85  namelist /timer_params/ sec_per_report, min_fraction, verbose, detailed
86  !.............................................................................
87  !$omp critical (timer_cr)
88  if (first_time) then
89  first_time = .false.
90  rewind(io_unit%input)
91  read (io_unit%input, timer_params, iostat=ignore)
92  if (io_unit%master) write (*, timer_params)
93  self%sec_per_report = sec_per_report
94  next_report_sec = wallclock() + sec_per_report
95  omp%nthreads = max(1,omp%nthreads)
96  timer%detailed = detailed
97  allocate (timers(0:omp%nthreads-1))
98  end if
99  !$omp end critical (timer_cr)
100  ntimer = 0
101  do i=0,omp%nthreads-1
102  timers(i)%calls = 0
103  timers(i)%start = 0d0
104  timers(i)%total = 0d0
105  end do
106 END SUBROUTINE init
107 
108 !===============================================================================
109 !> If itimer==0, find a suitable value for it, and store the name with it.
110 !> The name argument is not needed when itimer has already been assigned a value.
111 !===============================================================================
112 SUBROUTINE begin (self, name, itimer)
113  class(timer_t):: self
114  character(len=*):: name
115  integer:: itimer
116  !.............................................................................
117  integer:: thread, i, otimer, istack
118  logical:: ok
119  real(8):: wc
120  !-----------------------------------------------------------------------------
121  if (io_unit%do_validate) return
122  thread = omp_get_thread_num()
123  if (itimer==0) then
124  !$omp critical (timer_cr)
125  if (itimer==0) then
126  ok = .true.
127  do i=1,ntimer
128  ok = ok .and. trim(names(i)) /= trim(name)
129  end do
130  if (ok) then
131  ntimer = ntimer+1
132  if (ntimer > maxproc) call mpi%abort ('timer_mod: increase maxproc')
133  itimer = ntimer
134  names(itimer) = name
135  else
136  write (stdout,*) 'timer_t%begin: WARNING, duplicate name: '//trim(name)
137  end if
138  end if
139  !$omp end critical (timer_cr)
140  !---------------------------------------------------------------------------
141  ! If the name already exists in names then itimer remains =0
142  !---------------------------------------------------------------------------
143  if (itimer <= 0) return
144  end if
145  !-----------------------------------------------------------------------------
146  ! If a procedure is being timed, accumulate to its total
147  !-----------------------------------------------------------------------------
148  istack = timers(thread)%istack
149  wc = wallclock()
150  if (istack>0) then
151  otimer = timers(thread)%stack(istack)
152  timers(thread)%total(otimer) = timers(thread)%total(otimer) &
153  + (wc - timers(thread)%start(otimer))
154  end if
155  !-----------------------------------------------------------------------------
156  ! Start the new timer, and record it on the stack
157  !-----------------------------------------------------------------------------
158  timers(thread)%start(itimer) = wc
159  istack = istack+1
160  if (istack>maxdepth) call mpi%abort ('too many timer levels;'//trim(name))
161  timers(thread)%stack(istack) = itimer
162  timers(thread)%istack = istack
163  if (verbose > 0) then
164  print '(a,2i5,3x,a)', 'timer%begin: istack, itimer, name =', istack, itimer, trim(name)
165  end if
166 END SUBROUTINE begin
167 
168 !===============================================================================
169 !> At end call, increment number of calls and total time for the active timer
170 !> and resume any suspended timer
171 !===============================================================================
172 SUBROUTINE end (self, itimer)
173  class(timer_t):: self
174  integer, optional:: itimer
175  integer::thread, istack, otimer
176  real(8):: wc
177  !.............................................................................
178  if (io_unit%do_validate) return
179  !-----------------------------------------------------------------------------
180  ! Find the active timer for this thread
181  !-----------------------------------------------------------------------------
182  thread = omp_get_thread_num()
183  istack = timers(thread)%istack
184  if (istack <= 0) then
185  print *, 'WARNING, timer%end out-of-sync: thread, istack =', omp%thread, istack
186  timers(thread)%istack = 0
187  return
188  end if
189  otimer = timers(thread)%stack(istack)
190  if (verbose > 0) then
191  print '(a,2i5,3x,a)', 'timer%end: istack, otimer, name =', istack, otimer, trim(names(otimer))
192  end if
193  !-----------------------------------------------------------------------------
194  ! Increment call counter and total time used
195  !-----------------------------------------------------------------------------
196  wc = wallclock()
197  if (present(itimer)) then
198  if (itimer /= otimer) then
199  write (stdout,'(i6,2(3x,a,i4))') &
200  omp%thread, 'WARNING: itimer =',itimer, &
201  ' not equal to expected value =', otimer
202  end if
203  end if
204  timers(thread)%calls(otimer) = timers(thread)%calls(otimer) + 1
205  timers(thread)%total(otimer) = timers(thread)%total(otimer) + &
206  wc - timers(thread)%start(otimer)
207  !-----------------------------------------------------------------------------
208  ! Check if there is a suspended timer present, and if so restart it
209  !-----------------------------------------------------------------------------
210  istack = istack-1
211  if (istack>0) then
212  otimer = timers(thread)%stack(istack)
213  timers(thread)%start(otimer) = wc
214  else if (istack<0) then
215  print *,'WARNNING: too many timer%end calls'
216  istack = 0
217  end if
218  timers(thread)%istack = istack
219  !-----------------------------------------------------------------------------
220  ! Check if it is time to do a printout
221  !-----------------------------------------------------------------------------
222  if (wc > next_report_sec) then
223  !$omp critical (timer_print_cr)
224  if (wc > next_report_sec) then
225  call real_print (self)
226  next_report_sec = (nint(wc/self%sec_per_report)+1)*self%sec_per_report
227  end if
228  !$omp end critical (timer_print_cr)
229  end if
230 END SUBROUTINE end
231 
232 !===============================================================================
233 !> Printout times used in routines using more than 0.05% of the time, and reset
234 !> counters.
235 !===============================================================================
236 SUBROUTINE print (self)
237  class(timer_t):: self
238  if (omp%master) call real_print (self)
239 END SUBROUTINE print
240 SUBROUTINE real_print (self)
241  class(timer_t):: self
242  real(8):: total, proc(maxproc), ncalls(maxproc)
243  real(8):: updates, musppt, calls
244  real(8), save:: n_previous=0
245  real(8):: wc, sec
246  integer:: i, n_recv, lntimer
247  !.............................................................................
248  if (io_unit%do_validate) return
249  !-----------------------------------------------------------------------------
250  ! Prevent several treads from printing at the same time
251  !-----------------------------------------------------------------------------
252  wc = wallclock()
253  sec = wc - prev2
254  total = 0d0
255  calls = 0d0
256  !$omp atomic read
257  lntimer = ntimer
258  do i=1,lntimer
259  proc(i) = sum(timers(:)%total(i))
260  ncalls(i) = sum(timers(:)%calls(i))
261  total = total + proc(i)
262  calls = calls + ncalls(i)
263  end do
264  !-----------------------------------------------------------------------------
265  ! 'sec' measure wall clock time on a single thread, while 'total' is a sum over
266  ! all procedures being measured, over all threads.
267  ! 'updates' is the total number of active-cell updates (i.e. not including
268  ! ghost cells) across *all* threads.
269  ! `musppt` is defined to be the number of microseconds per update per thread.
270  !-----------------------------------------------------------------------------
271  updates = max(1.0,real(timer%n_update-n_previous))
272  musppt = sec*1d6*min(omp%nthreads,omp%ncores)/updates
273  n_recv = max(timer%n_recv,1)
274  if (mpi%master) &
275  call write_to (io_unit%output)
276  call write_to (io_unit%mpi)
277  !$omp atomic write
278  timer%n_lines = 1
279  timer%bytes_recv = 0.0_8
280  timer%n_recv = 0_8
281  timer%nq_send_max = 0
282  timer%nq_recv_max = 0
283  n_previous = timer%n_update
284  prev2 = wc
285  timer%latency%max = 0.0
286  timer%latency%aver = 0d0
287  timer%latency%n = 0
288  timer%mpi_test = 0_8
289  timer%mpi_hit = 0_8
290  do i=1,ntimer
291  timers(:)%total(i) = 0d0
292  timers(:)%calls(i) = 0
293  end do
294  return
295 contains
296  subroutine write_to (unit)
297  integer:: unit, level, popu
298  real:: cost, per_call
299  logical:: warn, mesg
300  write (unit,'(23x,a,7x,a,10x,a,7x,a,5x,a)') 'procedure', 'calls', 'time', 'percent', ' s/call'
301  if (unit==io_unit%output) then
302  !$omp atomic write
303  timer%n_lines = 1
304  end if
305  if (total > 0.0) then
306  mesg = .false.
307  do i=1,ntimer
308  if (proc(i)/total > min_fraction) then
309  per_call = proc(i)/max(ncalls(i),1d0)
310  warn = per_call < 0.5e-6
311  mesg = mesg .or. warn
312  write (unit,'(a32,1p,2g15.3,0p,f10.1,0p,f12.6,1x,a1)') &
313  trim(names(i)), ncalls(i), proc(i), proc(i)/total*100., &
314  per_call, merge('W',' ',warn)
315  end if
316  end do
317  if (mesg) &
318  write (unit,*) 'W: much of the item time may be due to timer calls'
319  end if
320  !---------------------------------------------------------------------------
321  ! Output AMR level costs
322  !---------------------------------------------------------------------------
323  if (allocated(timer%levelcost) .and. timer%levelmax > timer%levelmin) then
324  do level=timer%levelmin,timer%levelmax
325  !$omp atomic read
326  cost = timer%levelcost(level)
327  !$omp atomic read
328  popu = timer%levelpop(level)
329  write(unit,'(22x,a,i3,g12.3,i7)') &
330  'level, cost, population =', level, cost, popu
331  end do
332  do level=timer%levelmin,timer%levelmax
333  !$omp atomic write
334  timer%levelcost(level) = 0d0
335  end do
336  end if
337  !---------------------------------------------------------------------------
338  ! MPI and OMP statistics
339  !---------------------------------------------------------------------------
340  write (unit,'(a32,1p,2g15.3,g14.4,2(g12.3,2x,a,5x))') &
341  "TOTAL thread time, calls", calls, total, 100., musppt, 'core-mus/cell-upd', wc, 'wall sec'
342  if (mpi%size>1) then
343  timer%latency%aver = timer%latency%aver/max(1,timer%latency%n)
344  write (unit,'(1x,"MPI recv:",f10.1," MB/s",f11.3," MB/mesg",i8," nq_send_max",i8, &
345  " nq_recv_max",2f8.3," max, aver latency",f7.2, " f_unpk",f7.2," f_mem",f8.3," f_q")') &
346  timer%bytes_recv/1024.**2/sec, timer%bytes_recv/1024.**2/n_recv, &
347  timer%nq_send_max, timer%nq_recv_max, timer%latency%max, timer%latency%aver, &
348  timer%mpi_hit/real(max(1,timer%mpi_test)), &
349  timer%mem_hit/real(max(1,timer%mem_test)), &
350  timer%busy_time/max(timer%spin_time + timer%busy_time,1d-30)
351  timer%mpi_hit = 0
352  timer%mpi_test = 0
353  timer%busy_time = 0d0
354  timer%spin_time = 0d0
355  end if
356  call omp_lock%info (unit)
357  flush (unit)
358  self%n_mhd = 0; self%n_solve=0
359  end subroutine write_to
360 END SUBROUTINE real_print
361 
362 !===============================================================================
363 !===============================================================================
364 SUBROUTINE tic (time)
365  implicit none
366  real(8), optional:: time
367  !.............................................................................
368  if (present(time)) then
369  time = wallclock()
370  else
371  wt = wallclock()
372  end if
373 END SUBROUTINE tic
374 
375 !===============================================================================
376 !===============================================================================
377 SUBROUTINE toc_1 (label, time)
378  implicit none
379  character(len=*):: label
380  real(8), optional:: time
381  integer:: n
382  !.............................................................................
383  call toc_r (label, 1.0, time)
384 END SUBROUTINE toc_1
385 
386 !===============================================================================
387 !===============================================================================
388 SUBROUTINE toc_i4 (label, n, time)
389  implicit none
390  character(len=*):: label
391  real(8), optional:: time
392  integer:: n
393  !.............................................................................
394  call toc_r (label, real(n), time)
395 END SUBROUTINE toc_i4
396 
397 !===============================================================================
398 !===============================================================================
399 SUBROUTINE toc_i8 (label, n, time)
400  implicit none
401  character(len=*):: label
402  real(8), optional:: time
403  integer(8):: n
404  !.............................................................................
405  call toc_r (label, real(n), time)
406 END SUBROUTINE toc_i8
407 
408 !===============================================================================
409 FUNCTION str_f4 (t) RESULT (out)
410  real(8):: t
411  character(len=8):: out
412  !-----------------------------------------------------------------------------
413  write (out,'(f8.3)') t
414  out = adjustl(out)
415  out = trim(out(1:4))
416 END FUNCTION
417 
418 !===============================================================================
419 FUNCTION time_str (t) RESULT (out)
420  real(8):: t
421  character(len=8):: out
422  real, parameter:: ns=1e-9, mu=1e-6, ms=1e-3, s=1e0, mi=6e1, hr=36e2, dy=24*hr
423  real, parameter:: wk=7*dy, yr=365.*dy, mo=yr/12.
424  !-----------------------------------------------------------------------------
425  if (t/mu < 1d0) then
426  out=trim(str_f4(t/ns))//' ns'
427  1 format(f7.3,1x,a2)
428  else if (t/ms < 1d0) then
429  out=trim(str_f4(t/mu))//' mus'
430  else if ( t < 1d0) then
431  out=trim(str_f4(t/ms))//' ms'
432  else if ( t < 100) then
433  out=trim(str_f4( t))//' s '
434  else if (t/mi < 100) then
435  out=trim(str_f4(t/mi))//' mn'
436  else if (t/dy < 1d0) then
437  out=trim(str_f4(t/hr))//' hr'
438  else if (t/wk < 1d0) then
439  out=trim(str_f4(t/dy))//' dy'
440  else if (t/mo < 1d0) then
441  out=trim(str_f4(t/wk))//' wk'
442  else if (t/yr < 1d0) then
443  out=trim(str_f4(t/mo))//' mo'
444  else
445  out=trim(str_f4(t/yr))//' yr'
446  end if
447 END FUNCTION
448 
449 !===============================================================================
450 !> Print the time used per thread. Overuse is compensated for by the factor f =
451 !> the number of threads per core.
452 !===============================================================================
453 SUBROUTINE toc_r (label, n, time)
454  implicit none
455  character(len=*):: label
456  real:: n
457  real(8), optional:: time
458  real(8):: dtime, now
459  integer:: n_cores, n_threads
460  !.............................................................................
461  if (io_unit%do_validate) return
462  now = wallclock()
463  if (present(time)) then
464  dtime = (now - time)
465  time = now
466  else
467  dtime = (now - wt)
468  wt = now
469  end if
470  n_cores = max(omp%ncores,1)
471  n_threads = max(omp%nthreads,1)
472  if (n_cores > n_threads) n_cores = min(n_cores,n_threads)
473 
474  1 format (1x, a, ': ', a, ', ', a, a, 1p, e10.2, 1x, a, 3(i4, 1x, a))
475  2 format (1x, a, ': ', a, ', ', a, a, 1p, e10.2, 1x, a, 4(i4, 1x, a))
476  if (n_cores == n_threads) then
477  write (io_unit%log,1) label, trim(time_str(dtime)), &
478  trim(time_str(dtime*n_cores/max(n,1.))), '/upd,', n, 'updates/process,', &
479  mpi%size, merge('processes,','process, ',mpi%size>1), &
480  n_cores, merge('cores/process','core/process ',n_cores>1)
481  else
482  write (io_unit%log,1) label, trim(time_str(dtime)), &
483  trim(time_str(dtime*n_cores/max(n,1.))), '/upd,', n, 'updates/process,', &
484  mpi%size, merge('processes,','process, ',mpi%size>1), &
485  n_cores, merge('cores/process','core/process ',n_cores>1), &
486  n_threads, merge('threads/process','thread/process ',n_threads>1)
487  end if
488  flush (io_unit%log)
489  if (mpi%rank==0.and.omp%thread==0) then
490  if (n>1.0) then
491  if (n_cores == n_threads) then
492  print 1, label, trim(time_str(dtime)), &
493  trim(time_str(dtime*n_cores/max(n,1.))), '/upd,', n, 'updates/process,', &
494  mpi%size, merge('processes,','process, ',mpi%size>1), &
495  n_cores, merge('cores/process','core/process ',n_cores>1)
496  else
497  print 1, label, trim(time_str(dtime)), &
498  trim(time_str(dtime*n_cores/max(n,1.))), '/upd,', n, 'updates/process,', &
499  mpi%size, merge('processes,','process, ',mpi%size>1), &
500  n_cores, merge('cores/process','core/process ',n_cores>1), &
501  n_threads, merge('threads/process','thread/process ',n_threads>1)
502  end if
503  else
504  print 1, label, trim(time_str(dtime))
505  end if
506  flush (6)
507  end if
508  if (present(time)) then
509  time = wallclock()
510  else
511  wt = wallclock()
512  end if
513 END SUBROUTINE toc_r
514 
515 END MODULE timer_mod
Each thread uses a private timer data type, with arrays for start time and total time for each regist...
Definition: timer_mod.f90:11
Support tic/toc timing, as in MATLAB, and accurate wallclock() function. The timing is generally much...
The lock module uses nested locks, to allow versatile use of locks, where a procedure may want to mak...