DISPATCH
load_balance_mod.f90
1 !===============================================================================
2 !> Keep track of neighbor ranks and their loads, by sending and receiving short
3 !> messages, storing the info in a linked list
4 !>
5 !> When nbor lists are generated by init_all_nbors, an nbor_info_t data type
6 !> is added for each neighbor rank. Part of the data type is a mesg_t data type
7 !> with a buffer for receiving load balance messages from neighbor ranks. THe
8 !> first receive request is issued when the data type is first allocated. The
9 !> nbor_info_list%recv procedure checks the list for completed messages,
10 !> unpack them, and issues new receive requests.
11 !>
12 !> With respect to critical regions: The nbor_info list is essentially static,
13 !> except it can change if a new boundary patch is added, which has a neighbor
14 !> rank that was not on the list before. If load balancing is handled by one
15 !> patch at a time, inside a critical region, then no other critical regaion
16 !> shouls be needed.
17 !===============================================================================
19  USE io_mod
20  USE trace_mod
21  USE mpi_mod
22  USE mpi_mesg_mod
23  USE omp_timer_mod
24  USE omp_lock_mod
25  USE task_mod
26  USE patch_mod
27  USE list_mod
28  USE link_mod
29  USE bits_mod
30  USE random_mod
31  implicit none
32  private
33  type:: rank_info_t
34  logical:: ok
35  integer:: rank
36  integer:: nq
37  integer:: n_swap
38  real:: cadence, patch_cost
39  real(8):: cost=0.0
40  real(8):: time, wall, dtime
41  real(8):: otime, owall
42  type(rank_info_t), pointer:: next => null()
43  class(mesg_t), pointer:: mesg
44  type(lock_t):: lock
45  contains
46  procedure:: imbalance
47  procedure:: measure_load
48  end type
49  integer, save:: n_io_rank_info = (4*4 + 6*8)/4
50  type:: io_rank_info_t
51  sequence
52  logical:: ok
53  integer:: rank
54  integer:: nq
55  integer:: n_swap
56  real(8):: cost=0.0
57  real(8):: time, wall, dtime
58  real(8):: otime, owall
59  end type
60  type:: rank_info_list_t
61  type(rank_info_t), pointer:: head => null()
62  type(rank_info_t), pointer:: tail => null()
63  integer:: n=0
64  type(lock_t):: lock
65  contains
66  procedure:: append
67  procedure:: remove
68  procedure:: send
69  procedure:: recv
70  procedure:: find
71  end type
72  type(rank_info_t):: rank_info
73  type(rank_info_list_t):: rank_info_list
74  !
75  logical, save:: only_initial=.false.
76  real, save:: cadence=1., threshold=10., grace=0.3, duration=0.0
77  real, save:: next_info
78  integer, save:: excess=0
79  real:: q_min=10., q_max=40.
80  !
81  type, public:: load_balance_t
82  logical:: on=.false.
83  type(random_t):: random
84  type(lock_t):: lock
85  contains
86  procedure:: init
87  procedure:: active
88  procedure:: add
89  procedure:: pack
90  procedure:: unpack
91  procedure:: check_load
92  procedure:: print => print_lb
93  end type
94  type(load_balance_t), public:: load_balance
95  type(mesg_list_t):: nbor_sent_list
96 CONTAINS
97 
98 !===============================================================================
99 !===============================================================================
100 SUBROUTINE init (self)
101  class(load_balance_t):: self
102  logical, save:: on=.false.
103 #ifndef _CRAYFTN
104  type(io_rank_info_t):: io_rank_info
105 #endif
106  namelist /load_balance_params/ on, cadence, threshold, grace, &
107  duration, only_initial, q_min, q_max
108  integer:: iostat
109  !.............................................................................
110  call self%lock%init ('load')
111  rewind(io%input)
112  read(io%input, load_balance_params, iostat=iostat)
113  write (io%output, load_balance_params)
114  self%on = on
115  next_info = cadence
116 #ifndef _CRAYFTN
117  if (n_io_rank_info*4 /= storage_size(io_rank_info)/8) then
118  print *, n_io_rank_info*4, storage_size(io_rank_info)/8
119  error stop 'The hardwired loadbalance_mod::n_io_rank_info is incorrect'
120  end if
121  n_io_rank_info = storage_size(io_rank_info)/32
122 #endif
123 END SUBROUTINE init
124 
125 !===============================================================================
126 !===============================================================================
127 SUBROUTINE active (self, flag)
128  class(load_balance_t):: self
129  logical:: flag
130  !$omp atomic write
131  self%on = flag
132 END SUBROUTINE active
133 
134 !===============================================================================
135 !> Pack the rank_info into an mpi_mesg_t data type
136 !===============================================================================
137 SUBROUTINE pack (self, info, mesg)
138  class(load_balance_t):: self
139  type(rank_info_t):: info
140  class(mesg_t), pointer:: mesg
141  type(io_rank_info_t):: io_rank_info
142  integer:: n
143  !.............................................................................
144  call trace%begin ('rank_info%pack')
145  n = n_io_rank_info
146  allocate (mesg)
147  allocate (mesg%buffer(n))
148  allocate (mesg%reqs(rank_info_list%n))
149  mesg%id = 1
150  mesg%nbuf = n
151  io_rank_info%ok = info%ok
152  io_rank_info%rank = info%rank
153  io_rank_info%cost = info%cost
154  io_rank_info%nq = info%nq
155  io_rank_info%n_swap= info%n_swap
156  io_rank_info%time = info%time
157  io_rank_info%otime = info%otime
158  io_rank_info%dtime = info%dtime
159  io_rank_info%wall = info%wall
160  io_rank_info%owall = info%owall
161  call anonymous_copy (n, io_rank_info, mesg%buffer)
162  if (io%verbose>0) then
163  write (io_unit%log,'(a,i6,1p,g12.3,2i6,g16.6)') 'rank_info_mod::pack rank,cost,nq,n,time =', &
164  info%rank, info%cost, info%nq, n, info%time
165  flush (io_unit%log)
166  end if
167  write (io_unit%log,*) 'send buffer =', mesg%buffer(1:5)
168  call trace%end()
169 END SUBROUTINE pack
170 
171 !===============================================================================
172 !> Unpack an mpi_mesg into a rank_info data type
173 !===============================================================================
174 SUBROUTINE unpack (self, buffer)
175  class(load_balance_t):: self
176  integer, dimension(:), pointer:: buffer
177  class(rank_info_t), pointer:: nbor_info
178  type(io_rank_info_t):: io_rank_info
179  integer:: n
180  real(8):: wc
181  !.............................................................................
182  call trace%begin ('rank_info%unpack')
183  n = n_io_rank_info
184  write (io_unit%log,*) 'recv buffer =', buffer(1:5)
185  call anonymous_copy (n, buffer, io_rank_info)
186  if (io%verbose>1) then
187  write (io_unit%log,*) 'load_balance%unpack: rank', io_rank_info%rank
188  flush (io_unit%log)
189  end if
190  if (io_rank_info%rank==mpi%rank) return
191  !-----------------------------------------------------------------------------
192  ! Search for the relevant nbor_info
193  !-----------------------------------------------------------------------------
194  nbor_info => rank_info_list%head
195  do while (associated(nbor_info))
196  if (nbor_info%rank == io_rank_info%rank) exit
197  nbor_info => nbor_info%next
198  end do
199  !-----------------------------------------------------------------------------
200  ! If no nbor_info was found, allocate a new one and append
201  !-----------------------------------------------------------------------------
202  if (associated(nbor_info)) then
203  write (io_unit%log,*) 'unpack: old', io_rank_info%rank
204  else
205  write (io_unit%log,*) 'unpack: new', io_rank_info%rank
206  allocate (nbor_info)
207  call rank_info_list%append (nbor_info)
208  end if
209  !-----------------------------------------------------------------------------
210  ! Copy over info
211  !-----------------------------------------------------------------------------
212  nbor_info%ok = io_rank_info%ok
213  nbor_info%rank = io_rank_info%rank
214  nbor_info%cost = io_rank_info%cost
215  nbor_info%nq = io_rank_info%nq
216  nbor_info%n_swap= io_rank_info%n_swap
217  nbor_info%time = io_rank_info%time
218  nbor_info%otime = io_rank_info%otime
219  nbor_info%dtime = io_rank_info%dtime
220  nbor_info%wall = io_rank_info%wall
221  nbor_info%owall = io_rank_info%owall
222  wc = wallclock()-io_rank_info%wall
223  if (io%verbose>1) then
224  write (io_unit%log,'(a,i6,1p,g12.3,i6,2g16.6)') 'rank_info%unpack: rank,load,nq,time,latency =', &
225  io_rank_info%rank, io_rank_info%cost, io_rank_info%nq, io_rank_info%time, wc
226  end if
227  call trace%end()
228 END SUBROUTINE unpack
229 
230 !===============================================================================
231 !> Append to the rank_info_list -- called from init_nbors to initialize list
232 !===============================================================================
233 SUBROUTINE add (self, rank)
234  class(load_balance_t):: self
235  class(rank_info_t), pointer:: nbor_info
236  type(mesg_t), pointer:: mesg
237  integer:: rank
238  logical:: found
239  !.............................................................................
240  if (rank==mpi%rank) then
241  print *,mpi%rank,'WARNING: trying to add same rank to nbor_info'
242  return
243  end if
244  found = .false.
245  call nbor_info%lock%set
246  nbor_info => rank_info_list%head
247  do while (associated(nbor_info))
248  if (nbor_info%rank == rank) then
249  found = .true.
250  exit
251  end if
252  nbor_info => nbor_info%next
253  end do
254  !-----------------------------------------------------------------------------
255  ! For each new rank, allocate an nbor_info data type and the corresponding
256  ! message and message buffer, and then issue the first recv request into that
257  ! buffer.
258  !-----------------------------------------------------------------------------
259  if (found) then
260  if (io%verbose>2) then
261  write (io_unit%log,*) 'rank_info%add: already on list', rank
262  end if
263  else
264  allocate (nbor_info)
265  nbor_info%rank = rank
266  allocate (mesg)
267  nbor_info%mesg => mesg
268  mesg%nbuf = n_io_rank_info
269  allocate (mesg%buffer(mesg%nbuf))
270  call mesg%recv (rank, mesg%tag)
271  call rank_info_list%append (nbor_info)
272  if (io%verbose>1) then
273  write (io_unit%log,*) 'rank_info%add', rank, rank_info_list%n
274  end if
275  end if
276  call nbor_info%lock%unset
277 END SUBROUTINE add
278 
279 !===============================================================================
280 !> Append to the rank_info_list
281 !===============================================================================
282 SUBROUTINE append (self, rank_info)
283  class(rank_info_list_t):: self
284  class(rank_info_t), pointer:: rank_info
285  !.............................................................................
286  call trace%begin ('rank_info%append')
287  if (associated(self%head)) then
288  self%tail%next => rank_info
289  else
290  self%head => rank_info
291  end if
292  self%tail => rank_info
293  self%n = self%n+1
294  call trace%end()
295 END SUBROUTINE append
296 
297 !===============================================================================
298 !> Remove an item from the rank_info_list.
299 !===============================================================================
300 SUBROUTINE remove (self, this)
301  class(rank_info_list_t):: self
302  class(rank_info_t), pointer:: this, rank_info, prev
303  !.............................................................................
304  call trace%begin ('rank_info%remove')
305  call self%lock%set
306  nullify (prev)
307  rank_info => self%head
308  do while (associated(rank_info))
309  if (associated(rank_info,this)) then
310  !-------------------------------------------------------------------------
311  ! This item is not head, so jump over it from prev
312  !-------------------------------------------------------------------------
313  if (associated(prev)) then
314  prev%next => this%next
315  !-------------------------------------------------------------------------
316  ! This item is head, and if this is the last item, head becomes null
317  !-------------------------------------------------------------------------
318  else
319  self%head => this%next
320  endif
321  !-------------------------------------------------------------------------
322  ! This item is tail, and if this is the last item, tail becomes null
323  !-------------------------------------------------------------------------
324  if (associated(this,self%tail)) then
325  self%tail => prev
326  end if
327  self%n = self%n-1
328  exit
329  end if
330  prev => rank_info
331  rank_info => rank_info%next
332  end do
333  !-----------------------------------------------------------------------------
334  ! If this item was not in the list, it is nevertheless quietly deallocated
335  !-----------------------------------------------------------------------------
336  deallocate (this)
337  call self%lock%unset
338  call trace%end()
339 END SUBROUTINE remove
340 
341 !===============================================================================
342 !> Send a small package with load information to all nbor ranks, adding the
343 !> message to a list of sent messages, and checking the list for completed
344 !> messages
345 !===============================================================================
346 SUBROUTINE send (self)
347  class(rank_info_list_t):: self
348  class(rank_info_t), pointer:: nbor_info
349  class(mesg_t), pointer:: mesg
350  !.............................................................................
351  if (.not. associated(self%head)) return
352  call trace%begin('rank_info%send')
353  !-----------------------------------------------------------------------------
354  ! Send to all ranks in the rank_info list (which is sorted by rank)
355  !-----------------------------------------------------------------------------
356  allocate (mesg)
357  mesg%nbuf = n_io_rank_info
358  allocate (mesg%buffer(mesg%nbuf))
359  write (io_unit%log,*) 'mk3',rank_info%rank
360  call load_balance%pack (rank_info, mesg) ! pack into mesg
361  write (io_unit%log,*) 'mk4',rank_info%rank
362  write (io_unit%log,*) 'mesg%buffer =', mesg%buffer(1:5)
363  call self%lock%set
364  nbor_info => self%head ! first rank_info
365  do while (associated(nbor_info)) ! until end
366  if (io%verbose>1) then
367  write (io_unit%log,'(f12.6,2x,a,2i5,1p,e12.3)') &
368  wallclock(), 'rank_info%send', nbor_info%rank, &
369  rank_info%rank, rank_info%cost
370  flush (io_unit%log)
371  end if
372  call mesg%send (nbor_info%rank, mesg%tag) ! send it
373  nbor_info => nbor_info%next ! next rank_info
374  end do
375  call nbor_sent_list%add (mesg) ! add to list
376  call nbor_sent_list%remove_completed ! remove completed
377  call self%lock%unset
378  call trace%end()
379 END SUBROUTINE send
380 
381 !===============================================================================
382 !> Send a small package with load information to all nbor ranks
383 !===============================================================================
384 SUBROUTINE recv (self)
385  class(rank_info_list_t):: self
386  class(rank_info_t), pointer:: nbor_info
387  class(mesg_t), pointer:: mesg
388  !.............................................................................
389  if (.not. associated(self%head)) return
390  call trace%begin('rank_info%recv')
391  !-----------------------------------------------------------------------------
392  ! Send to all ranks in the rank_info list (which is sorted by rank)
393  !-----------------------------------------------------------------------------
394  call self%lock%set
395  nbor_info => self%head ! first rank_info
396  do while (associated(nbor_info)) ! until end
397  mesg => nbor_info%mesg
398  if (mesg%completed()) then ! previous mesg?
399  if (io%verbose>1) then
400  write(io_unit%log,*) 'rank_info_list%recv: from', nbor_info%rank
401  flush(io_unit%log)
402  end if
403  call load_balance%unpack (mesg%buffer) ! if recvd, unpack
404  call mesg%recv (nbor_info%rank, tag=mesg%tag) ! start a new receive
405  end if
406  nbor_info => nbor_info%next ! next rank_info
407  end do
408  call self%lock%unset
409  call trace%end()
410 END SUBROUTINE recv
411 
412 !===============================================================================
413 !> Interpolate the local code time to the wall clock time nbor%wall (the latest
414 !> known for nbor), and return the difference between that and nbor%time. A
415 !> positive value means that nbor is ahead of the local rank.
416 !> Evaluating on queue size, we want a formula that only turns positive when
417 !> two conditions are fulfilled: 1) The nbor queue is getting short, AND 2)
418 !> local queue is long enough. We want the formula to become agressive when
419 !> the load imbalance starts to get serious.
420 !===============================================================================
421 FUNCTION imbalance (self, nbor) RESULT (diff)
422  class(rank_info_t):: self
423  class(rank_info_t), pointer:: nbor
424  real(8):: p, time, diff
425  !.............................................................................
426  !p = (nbor%wall-self%owall)/(self%wall-self%owall)
427  !time = self%otime + p*(self%time-self%otime)
428  !diff = (nbor%time-time)/self%dtime
429  !-----------------------------------------------------------------------------
430  ! This turns on (positive) when nbor%nq < self%nq*q_min/q_max, but we also
431  ! do not want the formulate to become extremely sensisitve to values below
432  ! q_min and q_max
433  !-----------------------------------------------------------------------------
434  diff = 2.0*(q_min/(q_min+nbor%nq)-q_max/(q_max+self%nq))
435 END FUNCTION imbalance
436 
437 !===============================================================================
438 !> Compute a measure of the task load, which for patches is the number of cells
439 !> divided by the time step.
440 !===============================================================================
441 SUBROUTINE measure_load (self, head)
442  class(rank_info_t):: self
443  class(link_t), pointer:: head
444  class(link_t), pointer:: link
445  class(task_t), pointer:: task
446  real:: load, sum, sum_cost, cells, sum_cells, ready
447  real(8):: dt, sum_dt, wc
448  integer, save:: itimer=0
449  logical:: ok, active
450  !-----------------------------------------------------------------------------
451  call trace%begin ('rank_info%measure_load', itimer=itimer)
452  !
453  ok = .true.
454  link => head
455  do while (associated(link))
456  ok = merge(.false., ok, link%task%rank==mpi%rank .and. link%task%dtime==0d0)
457  link => link%next
458  end do
459  rank_info%ok = ok ! active patches have non-zero dtime
460  if (.not. ok) then
461  rank_info%dtime = 1.0
462  call trace_end (itimer)
463  return
464  end if
465  !
466  sum_dt = 0.0
467  sum_cost = 0.0
468  sum_cells = 0.0
469  link => head
470  do while (associated(link))
471  task => link%task
472  select type (task)
473  class is (patch_t)
474  if (task%rank==mpi%rank) then
475  cells = product(task%mesh%n)
476  sum_cells = sum_cells + cells
477  sum_cost = sum_cost + cells/task%dtime
478  end if
479  end select
480  link => link%next
481  end do
482  !-----------------------------------------------------------------------------
483  ! cost = number of cell updates per code time unit
484  !-----------------------------------------------------------------------------
485  rank_info%cost = sum_cost ! total rank update cost
486  rank_info%rank = mpi%rank ! local rank
487  write (io_unit%log,*) 'mk0', rank_info%rank, mpi%rank
488  rank_info%dtime = sum_cells/sum_cost ! cell-weighted dtime
489  wc = wallclock()
490  if (wc > rank_info%wall) then
491  rank_info%owall = rank_info%wall ! previous wall time
492  rank_info%wall = wc ! wall time
493  rank_info%otime = rank_info%time ! previsou queue time
494  rank_info%time = 0.9*rank_info%time & ! smoothed ..
495  + 0.1*head%task%time ! .. queue time
496  end if
497  if (io%verbose > 1) &
498  write (io_unit%log,'(a,i5,1p,3e12.3)') 'measure_load: nq,load,cost,dt =', &
499  rank_info%nq, rank_info%cost, rank_info%dtime
500  call trace%end (itimer)
501 END SUBROUTINE measure_load
502 
503 !===============================================================================
504 !> Look up the load for a neighbor rank
505 !===============================================================================
506 FUNCTION find (self, rank, debug) RESULT (nbor_info)
507  class(rank_info_list_t):: self
508  integer:: rank
509  logical, optional:: debug
510  class(rank_info_t), pointer:: nbor_info
511  !.............................................................................
512  call trace%begin('rank_info_list%find')
513  nbor_info => self%head
514  if (present(debug)) then
515  write (io_unit%log,*) 'debug: associated =', associated(nbor_info)
516  do while (associated(nbor_info))
517  write (io_unit%log,*) 'debug: rank =', nbor_info%rank
518  if (nbor_info%rank == rank) then
519  return
520  end if
521  nbor_info => nbor_info%next
522  end do
523  flush (io_unit%log)
524  else
525  do while (associated(nbor_info))
526  if (nbor_info%rank == rank) then
527  return
528  end if
529  nbor_info => nbor_info%next
530  end do
531  end if
532  call trace%end()
533 END FUNCTION find
534 
535 !===============================================================================
536 !> Print a rank_info list
537 !===============================================================================
538 SUBROUTINE print_lb (self, time)
539  class(load_balance_t):: self
540  real(8):: time
541  class(rank_info_t), pointer:: nbor_info
542  real:: load_diff, imbalance
543  !.............................................................................
544  if (.not.rank_info%ok) return
545  call self%lock%set
546  nbor_info => rank_info_list%head
547  do while (associated(nbor_info))
548  load_diff = (rank_info%cost-nbor_info%cost)/rank_info%patch_cost
549  !imbalance = (nbor_info%time-rank_info%time)/rank_info%dtime
550  imbalance = rank_info%imbalance (nbor_info)
551  write (io_unit%log,'(2f12.6,2x,a,i6,1p,4g12.3,3i6)') &
552  wallclock(), &
553  time, &
554  'rank_info_list: rnk,load,time,cost[12],wall,nq =', &
555  nbor_info%rank,load_diff,imbalance, &
556  nbor_info%cost, rank_info%cost, &
557  nbor_info%nq, rank_info%nq, nbor_info%n_swap
558  nbor_info => nbor_info%next
559  end do
560  call self%lock%unset
561 END SUBROUTINE print_lb
562 
563 !===============================================================================
564 !> Decide whether to give up ownership of a patch, to increase the load of an
565 !> nbor task that needs more load. After a rank passes the critical threshold
566 !> below, there is a delay before it's boundary patches are sent to its nbor
567 !> ranks, are unpacked, and end up in a load_balance comparison. This will
568 !> lead to a burst of patch transfers, and potentially oscillatory behavior.
569 !> It is better to transfer the LB info separately, in small packages that do
570 !> not take long to transfer; this is implemented.
571 !>
572 !> The measure of imbalance used below is defined in rank_info_mod::imbalance.
573 !> Ideally, it should contain both a measure of the actual load, the time skew,
574 !> and the size of the ready queue on the two ranks being compared.
575 !===============================================================================
576 FUNCTION check_load (self, head) RESULT (sell)
577  class(load_balance_t):: self
578  class(link_t), pointer:: head, nbor
579  logical:: sell
580  class(task_t), pointer:: task
581  class(patch_t), pointer:: patch
582  class(rank_info_t), pointer:: nbor_info
583  integer,save:: delay=0
584  logical:: load_condition, time_condition
585  real(8):: wc
586  real:: load_diff, cost, imbalance, randomu
587  logical, save:: make_estimate=.true.
588  integer, save:: itimer=0
589  !-----------------------------------------------------------------------------
590  ! Only one thread should do load balancing, and that thread should make sure
591  ! to set the task list omp lock whenever it needs to access the task list.
592  !-----------------------------------------------------------------------------
593  sell = .false.
594  if (.not.self%on) return
595  call trace%begin ('load_balance%check_load', itimer=itimer)
596  call self%lock%set
597  wc = wallclock()
598  patch => task2patch(head%task)
599  rank_info%nq = head%task%nq
600  !-----------------------------------------------------------------------------
601  ! Update the load balance calculation for the local rank, and enter into the
602  ! rank_info_list, which is also updated via MPI
603  !-----------------------------------------------------------------------------
604  if (wc > next_info) then
605  next_info = next_info + cadence
606  rank_info%cadence = cadence
607  rank_info%rank = mpi%rank
608  rank_info%patch_cost = product(patch%mesh%gn)/patch%dtime
609  call rank_info%measure_load (head)
610  call rank_info_list%send
611  call rank_info_list%recv
612  call load_balance%print (head%task%time)
613  end if
614  !-----------------------------------------------------------------------------
615  ! If load balancing is on, and we have information, and it's time, or if an
616  ! earlier case has left an excess, go ahead
617  !-----------------------------------------------------------------------------
618  if (rank_info%ok .and. wc<duration) then
619  !---------------------------------------------------------------------------
620  ! Among the virtual patches that surround the head boundary patch, find one
621  ! that needs more load and give the head patch to it
622  !---------------------------------------------------------------------------
623  nbor => head%nbor
624  do while (associated(nbor))
625  if (io%verbose>1) &
626  write (io_unit%log,'(a,2i9,3i6,2f12.6,l3)') 'LB: ids,nqs,rank,times', &
627  head%task%id, nbor%task%id, rank_info%nq, nbor%task%nq, nbor%task%rank, &
628  nbor%task%time, head%task%time, nbor%task%is_set(bits%virtual)
629  if (nbor%task%is_set (bits%virtual)) then
630  nbor_info => rank_info_list%find (nbor%task%rank)
631  if (nbor%task%rank == mpi%rank) then
632  write (io_unit%log,*) &
633  'rank_info_mod::check_load virtual bit on my rank', nbor%task%id
634  flush (io_unit%log)
635  else if (associated(nbor_info)) then
636  imbalance = rank_info%imbalance (nbor_info)/grace
637  imbalance = max(imbalance, -1.)
638  !---------------------------------------------------------------------
639  ! If the imbalance measure is posive then give a patch to the remote
640  ! rank, with a probability essentially proportional to the imbalance,
641  ! but saturating at 5%, to limit the number of swaps per unit time.
642  ! A swap can cause a fair number of patch transfers, since new virtual
643  ! patches on the remote need at least two time slices.
644  !---------------------------------------------------------------------
645  randomu = self%random%ran1()
646  if (io%verbose>0) &
647  write (io_unit%log,'("LB:",2i4,2x,2l1,2f6.2,l4)') &
648  mpi%rank,nbor%task%rank,rank_info%ok,nbor_info%ok,imbalance, &
649  randomu,randomu < 0.05*(1.-exp(-imbalance))
650  if (rank_info%ok.and.nbor_info%ok .and. &
651  imbalance > 0.0 .and. &
652  randomu < 0.05*(1.-exp(-imbalance))) then
653  if (io%verbose>0) then
654  print 1, &
655  wc,'LB: rank',mpi%rank,' gives patch',head%task%id,' to',nbor%task%rank, &
656  excess, nbor_info%n_swap, &
657  nbor_info%cost, rank_info%cost
658  1 format(f12.6,2x,a,i6,a,i9,a,i6,2i5,1p,2g12.3)
659  write (io_unit%log,2) &
660  wc,'load_balance: giving patch',head%task%id, ' to',nbor%task%rank, &
661  excess, nbor_info%n_swap, &
662  nbor_info%cost, rank_info%cost
663  2 format(f12.6,2x,a,i9,a,i6,2i5,1p,2g12.3)
664  flush (io_unit%log)
665  end if
666  call list%give_to (head, nbor%task%rank)
667  sell = .true.
668  if (io%verbose>0) &
669  write (io_unit%log,*) 'swapped boundary to virtual:', head%task%id
670  exit
671  end if
672  else
673  write (io_unit%log,*) &
674  'check_load ERROR: nbor_info not associated, rank', nbor%task%rank
675  nbor_info => rank_info_list%find (nbor%task%rank, debug=.true.)
676  flush (io_unit%log)
677  end if
678  else if (nbor%task%is_set (bits%external)) then
679  write (io_unit%log,*) nbor%task%id, 'LB: external status on nbor from rank', nbor%task%rank
680  else if (nbor%task%rank /= mpi%rank) then
681  write (io_unit%log,*) nbor%task%id, 'LB: inconsistent status on nbor from rank', nbor%task%rank
682  end if
683  nbor => nbor%next
684  end do
685  end if ! (rank_info%ok .and. wc < duration)
686  call self%lock%unset
687  call trace%end(itimer)
688 END FUNCTION check_load
689 
690 END MODULE load_balance_mod
Support tic/toc timing, as in MATLAB, and accurate wallclock() function. The timing is generally much...
Module with list handling for generic class task_t objects.
Definition: list_mod.f90:4
Template module for patches, which adds pointers to memory and mesh, and number of dimensions and var...
Definition: patch_mod.f90:6
Definition: io_mod.f90:4
Keep track of neighbor ranks and their loads, by sending and receiving short messages, storing the info in a linked list.
The lock module uses nested locks, to allow versatile use of locks, where a procedure may want to mak...
Template module for tasks.
Definition: task_mod.f90:4