36 real(8):: stall_start=0d0, max_stalled=600d0, retry_stalled=30d0
37 integer,
save:: mpi_only_master=100
38 integer,
save:: verbose=0
39 integer,
save:: stalled=0
40 integer,
save:: n_spin=0
41 integer,
save:: n_update=0
42 integer,
save:: min_nq=2**30
43 logical,
save:: debug=.false.
44 logical,
save:: do_delay=.false.
45 logical,
save:: track_active=.false.
46 logical,
save:: omp_pick=.false.
47 logical,
save:: detailed_timer=.false.
57 SUBROUTINE init (self, name)
59 character(len=*),
optional:: name
61 class(
link_t),
pointer:: link
66 namelist /dispatcher0_params/ verbose, max_stalled, retry_stalled, do_delay, &
67 mpi_only_master, debug, detailed_timer
68 character(len=120):: ids = &
69 '$Id: c68cfcf81b7fe3a0816f8608d703bded382b1f0c $ dispatchers/dispatcher0_mod.f90' 71 call trace%print_id (ids)
72 call trace%begin(
'dispatcher0_t%init')
74 read(io%input, dispatcher0_params, iostat=iostat)
75 write (io%output, dispatcher0_params)
77 call self%lock%init (
'disp')
86 SUBROUTINE execute (self, task_list, test)
93 integer,
save:: itimer=0
95 call trace%begin (
'dispatcher0_t%execute', itimer=itimer)
96 call io%header(
'begin dispatcher0_t%execute:')
97 call task_list%startup
104 do while (task_list%na > 0 .and. wallclock() < io%job_seconds)
105 call self%update (task_list, test)
106 if (task_list%na == task_list%n_tasks)
then 108 min_nq = min(min_nq,task_list%nq)
111 write (io_unit%log,*)
'thread',omp%thread,
' arrived' 116 call mpi_mesg%diagnostics(1)
117 call toc (
'wall time', timer%n_update, time=sec)
118 write (io_unit%mpi,*)
'at mpi%barrier' 120 call mpi%barrier (
'end')
121 write (io%output,*)
"task list finished, min_nq =", min_nq
122 if (validate%mode ==
"write")
then 124 write (io%output,
'(a)') &
125 ' validate file '//trim(io%outputname)//
'/rank_00000.val written' 127 else if (validate%mode ==
"compare")
then 129 write (io%output,*)
"validate%ok =", validate%ok
132 call trace%end (itimer)
133 write (io_unit%mpi,*)
'end dispatcher0_t%execute' 135 END SUBROUTINE execute
157 SUBROUTINE update (self, task_list, test)
162 class(
link_t),
pointer:: head, prev
163 class(
task_t),
pointer:: task, otask
164 class(
mesg_t),
pointer:: mesg
165 logical:: already_busy, was_refined, was_derefined
167 real(8),
save:: time, otime=0d0
168 integer,
save:: oid=0
169 integer:: i, id, nq, n_unpk
171 integer,
save:: itimer(5)=0
174 call trace%begin(
'dispatcher0_t%update(1)', itimer=itimer(i))
177 n_update = n_update+1
181 call task_list%check_mpi (n_unpk)
182 call mpi_io%iwrite_list%check
183 if (omp%nthreads >= mpi_only_master .and. omp%master)
then 184 call trace%end (itimer(i))
191 call data_io%open_and_close ()
192 if (task_list%verbose > 2) &
193 write (io%output,*) wallclock(),
' thread',omp%thread,
' waiting for tasklist(1)' 194 if (detailed_timer)
then 195 call trace%end (itimer(i))
197 call trace%begin(
'dispatcher0_t%update(2)', itimer=itimer(i))
207 call trace%end(itimer(i))
213 call task_list%lock%set (
'dispatch0_t%update 1')
214 head => task_list%queue
215 if (detailed_timer)
then 216 call trace%end (itimer(i))
218 call trace%begin(
'dispatcher0_t%update(3)', itimer=itimer(i))
221 if (
associated(head) .and. .not.task_list%syncing)
then 223 write(io_unit%log,
'(f12.6,2x,a,i6,i7)') wallclock(), &
224 'dispather0_t%update: na, id, time =', &
225 task_list%na, head%task%id, head%task%time
230 do while (
associated(head))
232 if (task%mem_thread==-1)
exit 233 if (omp%thread == task%mem_thread)
then 235 timer%mem_hit = timer%mem_hit+1
239 head => head%next_time
242 timer%mem_test = timer%mem_test+1
246 if (.not.
associated(head))
then 248 head => task_list%queue
254 call task%log (
'dispatcher')
257 write (io_unit%queue,
'(f12.6,2i7,3i5,f12.6,i6,2i9,7i7)') &
258 wallclock(), task%id, task%istep, task%it, task%new, omp%thread, &
259 task%time, task_list%nq, n_spin, timer%n_master
260 timer%n_master(:) = 0
268 if (stalled_l > 0)
then 269 write (io_unit%log,*) wallclock(),
' stall ended', stalled_l
275 task%atime = task%time
276 if (verbose > 1)
then 277 write (io_unit%log,
'(f12.6,2x,a,i4,2x,a,i6,2x,a,1p,g16.6,2x,a,i5)') &
278 wallclock(),
'thread', omp%thread,
'takes', task%id, &
279 'time', task%time,
'nq', task%nq
310 already_busy = task%is_set (bits%busy)
312 if (.not.already_busy)
then 313 call task%set (bits%busy)
314 if (
associated(prev))
then 317 prev%next_time => head%next_time
322 task_list%queue => head%next_time
325 call task%clear (bits%ready)
327 call task_list%queue_active (head)
329 task_list%nq = task_list%nq-1
330 task%nq = task_list%nq
331 if (verbose > 0)
then 332 if (track_active)
then 333 otask => task_list%active%task
336 write (*,
'(a,2(f12.6,i6))')
'TIME ERROR: otime, oid, time, id =', otime, oid, time, otask%id
342 write (io_unit%queue,
'(f12.6,2i7,2f12.6,2i5)') &
343 wallclock(), task%id, task%istep, task%time, &
344 otask%atime, task_list%nq, task_list%nac
345 flush (io_unit%queue)
349 write (io_unit%queue,
'(f12.6,2i7,f12.6,2i5)') &
350 wallclock(), task%id, task%istep, task%time, &
351 task_list%nq, task_list%nac
352 flush (io_unit%queue)
364 if (task%time == task_list%sync_next)
then 365 task_list%syncing = .true.
366 write (io_unit%mpi,*) task%id,omp%thread, &
367 ' is triggering a sync at t =', task_list%sync_next
371 write(io_unit%log,
'(f12.6,i6,2x,a)') wallclock(), task_list%na,
'no queue' 373 call task_list%lock%unset (
'dispatch0_t%update 1')
374 if (task_list%verbose > 2) &
375 write (io%output,*) wallclock(),
' thread',omp%thread,
' unlocked tasklist(1)' 381 if (
associated(head))
then 382 if (detailed_timer)
then 383 call trace%end (itimer(i))
385 call trace%begin(
'dispatcher0_t%update(4)', itimer=itimer(i))
387 if (already_busy)
then 389 write (io_unit%log,*) mpi%rank,
' WARNING: thread', omp%thread, &
390 ' tried to update busy task', task%id
391 write (stderr,*) mpi%rank,
' WARNING: thread', omp%thread, &
392 ' tried to update busy task', task%id
394 call trace%end (itimer(i))
397 call task_list%update (head, test, was_refined, was_derefined)
404 if (.not. was_derefined)
then 405 call task%clear (bits%busy)
406 call task_list%check_nbors (head)
409 timer%busy_time = timer%busy_time + (wallclock()-start)
413 call trace%end (itimer(i))
418 subroutine stall_handler
421 if (detailed_timer)
then 422 call trace%end (itimer(i))
424 call trace%begin(
'dispatcher0_t%update(5)', itimer=itimer(i))
442 if (stalled_l == 0)
then 444 stall_start = wallclock()
446 write (io_unit%log,*) wallclock(),
'queue stalled', stalled_l
450 if (wallclock()-stall_start > max_stalled)
then 451 print *, mpi%rank,
'STALLED diagnostics' 452 call mpi_mesg%diagnostics (1)
453 print *, mpi%rank,
'STALLED bailing out' 454 call mpi%abort (
'exceeded max_stalled')
455 else if (wallclock()-stall_start > retry_stalled)
then 457 if (wallclock()-stall_start > retry_stalled)
then 459 call task_list%check_all
460 if (
associated(task_list%queue))
then 461 write(stderr,1) mpi%rank, omp%thread,
'check_all', wallclock()
462 1
format(
"rank:",i5,2x,
"thread:",i4,3x,
"STALL revived by ",a,
" at",f12.3)
464 call io%abort (
'STALLED revided by check_all -- check thread log')
465 stall_start = wallclock()
467 call task_list%check_oldest
468 if (
associated(task_list%queue))
then 469 write(stderr,1) mpi%rank, omp%thread,
'check_oldest', wallclock()
471 call io%abort (
'STALLED revided by check_oldest -- check thread log')
472 stall_start = wallclock()
474 write (stderr,*) mpi%rank, omp%thread,
'STALLED, revived FAILED' 475 call io%abort (
'STALLED, revived FAILED')
480 else if (do_delay)
then 482 timer%spin_time = timer%spin_time + (wallclock()-start)
483 call mpi_mesg%delay (stalled)
487 timer%spin_time = timer%spin_time + (wallclock()-start)
488 end subroutine stall_handler
489 END SUBROUTINE update
Each thread uses a private timer data type, with arrays for start time and total time for each regist...
Module for handling blocking and non-blocking MPI parallel I/O to a single file. The module is initia...
Support tic/toc timing, as in MATLAB, and accurate wallclock() function. The timing is generally much...
Generic validation module. The general idea is to be able to compare two runs at critical points in t...
Module with list handling for generic class task_t objects.
Interface from gpatch_mod to a choice of binary data I/O methods, controlled by the iomethod text str...
This module handles checking max change between neighboring points. Each instance of it needs an inde...
Task list data type, with methods for startup and updates. Message handling is inherited from the tas...
Module with list handling for generic class task_t objects.
The lock module uses nested locks, to allow versatile use of locks, where a procedure may want to mak...
Dispatcher method that relies on all threads maintaining a "ready queue", with tasks ready for updati...
Template module for tasks.