!> @file init_pegrid.f90
!--------------------------------------------------------------------------------------------------!
! This file is part of the PALM model system.
!
! PALM is free software: you can redistribute it and/or modify it under the terms of the GNU General
! Public License as published by the Free Software Foundation, either version 3 of the License, or
! (at your option) any later version.
!
! PALM is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the
! implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
! Public License for more details.
!
! You should have received a copy of the GNU General Public License along with PALM. If not, see
! <http://www.gnu.org/licenses/>.
!
! Description:
! ------------
!> Determination of the virtual processor topology (if not prescribed by the user) and computation
!> of the grid point number and array bounds of the local domains.
!> @todo: remove MPI-data types for 2D exchange on coarse multigrid level (not used any more)
!--------------------------------------------------------------------------------------------------!
 SUBROUTINE init_pegrid

#if defined( __parallel )
    USE MPI
#endif

    USE control_parameters,                                                                        &
        ONLY:  bc_dirichlet_l, bc_dirichlet_n, bc_dirichlet_r, bc_dirichlet_s, bc_lr, bc_ns,       &
               bc_radiation_l, bc_radiation_n, bc_radiation_r, bc_radiation_s, grid_level,         &
               grid_level_count, maximum_grid_level, message_string, mg_switch_to_pe0_level,       &
               pe_grid_prescribed, psolver, serial_run

#if defined( __parallel )
    USE control_parameters,                                                                        &
        ONLY:  coupling_mode, coupling_topology, gathered_size, momentum_advec,                    &
               outflow_source_plane, recycling_width, scalar_advec, subdomain_size, syn_turb_gen,  &
               turbulent_inflow, turbulent_outflow, y_shift

    USE grid_variables,                                                                            &
        ONLY:  dx
#endif

    USE indices,                                                                                   &
        ONLY:  nnx, nny, nnz, nx, nxl, nxl_mg, nxlu, nxr, nxr_mg, ny, nyn, nyn_mg, nys, nys_mg,    &
               nysv, nz, nzb, nzt, nzt_mg, wall_flags_1, wall_flags_2, wall_flags_3, wall_flags_4, &
               wall_flags_5, wall_flags_6, wall_flags_7, wall_flags_8, wall_flags_9, wall_flags_10

#if defined( __parallel )
    USE indices,                                                                                   &
        ONLY:  mg_loc_ind, nbgp, nnx_pe, nny_pe, nxl_pe, nxr_pe, nx_a, nx_o, nyn_pe, nys_pe, ny_a, &
               ny_o
#endif

    USE kinds

    USE pegrid

#if defined( __parallel )
    USE pmc_interface,                                                                             &
        ONLY:  nested_run

    USE spectra_mod,                                                                               &
        ONLY:  calculate_spectra

    USE synthetic_turbulence_generator_mod,                                                        &
        ONLY:  id_stg_left, id_stg_north, id_stg_right, id_stg_south
#endif

    USE transpose_mod,                                                                             &
        ONLY:  nnx_x_max, nxl_y, nxl_z, nxr_x_max, nxr_y, nxr_z, nx_y_max, nyn_x, nyn_x_max,       &
               nyn_z, nys_x, nys_z, ny_z_max, nzb_x, nzb_y, nzt_x, nzt_y, nzt_y_max, nz_x_max
#if defined( __parallel )
    USE transpose_mod,                                                                             &
        ONLY:  nnx_y_max, nny_yd_max, nny_z_max, nnz_x_max, nnz_yd_max, nnz_z_max, nxl_y_pe,       &
               nxl_yd, nxr_y_max, nxr_y_pe, nxr_yd, nyn_yd_max, nyn_z_max, nyn_z_pe, nys_z_pe,     &
               nzb_x_pe, nzb_yd, nzb_yd_pe, nzt_x_max, nzt_x_pe, nzt_yd, nzt_yd_max, nzt_yd_pe,    &
               nz_yd_max
#endif

    IMPLICIT NONE

    INTEGER(iwp) ::  i                        !< running index over number of processors or number of multigrid level
#if defined( __parallel )
    INTEGER(iwp) ::  id_inflow_l              !< ID indicating processors located at the left inflow boundary
    INTEGER(iwp) ::  id_outflow_l             !< local value of id_outflow
    INTEGER(iwp) ::  id_outflow_source_l      !< local value of id_outflow_source
    INTEGER(iwp) ::  id_recycling_l           !< ID indicating processors located at the recycling plane
    INTEGER(iwp) ::  id_stg_left_l            !< left lateral boundary local core id in case of turbulence generator
    INTEGER(iwp) ::  id_stg_north_l           !< north lateral boundary local core id in case of turbulence generator
    INTEGER(iwp) ::  id_stg_right_l           !< right lateral boundary local core id in case of turbulence generator
    INTEGER(iwp) ::  id_stg_south_l           !< south lateral boundary local core id in case of turbulence generator
    INTEGER(iwp) ::  ind(5)                   !< array containing the subdomain bounds
#endif
    INTEGER(iwp) ::  j                        !< running index, used for various loops
    INTEGER(iwp) ::  k                        !< number of vertical grid points in different multigrid level
    INTEGER(iwp) ::  maximum_grid_level_l     !< maximum number of grid level without switching to PE 0
    INTEGER(iwp) ::  mg_levels_x              !< maximum number of grid level allowed along x-direction
    INTEGER(iwp) ::  mg_levels_y              !< maximum number of grid level allowed along y-direction
    INTEGER(iwp) ::  mg_levels_z              !< maximum number of grid level allowed along z-direction
    INTEGER(iwp) ::  mg_switch_to_pe0_level_l !< maximum number of grid level with switching to PE 0
#if defined( __parallel )
    INTEGER(iwp) ::  nnx_y                    !< quotient of number of grid points along x-direction and number of PEs used along y-direction
    INTEGER(iwp) ::  nny_x                    !< quotient of number of grid points along y-direction and number of PEs used along x-direction
    INTEGER(iwp) ::  nny_z                    !< quotient of number of grid points along y-direction and number of PEs used along x-direction
    INTEGER(iwp) ::  nnz_x                    !< quotient of number of grid points along z-direction and number of PEs used along x-direction
    INTEGER(iwp) ::  nnz_y                    !< quotient of number of grid points along z-direction and number of PEs used along x-direction
    INTEGER(iwp) ::  numproc_sqr              !< square root of the number of processors
#endif
    INTEGER(iwp) ::  nxl_l                    !< lower index bound along x-direction on subdomain and different multigrid level
    INTEGER(iwp) ::  nxr_l                    !< upper index bound along x-direction on subdomain and different multigrid level
    INTEGER(iwp) ::  nyn_l                    !< lower index bound along y-direction on subdomain and different multigrid level
    INTEGER(iwp) ::  nys_l                    !< upper index bound along y-direction on subdomain and different multigrid level
#if defined( __parallel )
    INTEGER(iwp) ::  nzb_l                    !< lower index bound along z-direction on subdomain and different multigrid level
#endif
    INTEGER(iwp) ::  nzt_l                    !< upper index bound along z-direction on subdomain and different multigrid level
!$  INTEGER(iwp) ::  omp_get_num_threads      !< number of OpenMP threads

#if defined( __parallel )
    INTEGER(iwp), DIMENSION(:), ALLOCATABLE ::  ind_all !< dummy array containing index bounds on subdomain, used for gathering
    INTEGER(iwp), DIMENSION(2) ::  npe_xy = 1     !< number of processors along x-y dimension
    INTEGER(iwp), DIMENSION(2) ::  npe_xy_remote  !< number of PEs used for coupled model (only in atmospher-ocean coupling)
    INTEGER(iwp)               ::  lcoord(2)      !< PE coordinates of left neighbor along x and y
    INTEGER(iwp)               ::  rcoord(2)      !< PE coordinates of right neighbor along x and y
    INTEGER(iwp)               ::  irest_x        !< remaining grid points along x in case of non-uniform subdomains
    INTEGER(iwp)               ::  irest_y        !< remaining grid points along y in case of non-uniform subdomains
#endif

!
!-- Get the number of OpenMP threads
    !$OMP PARALLEL
!$  threads_per_task = omp_get_num_threads()
    !$OMP END PARALLEL


#if defined( __parallel )

    CALL location_message( 'creating virtual PE grids + MPI derived data types', 'start' )

!
!-- Determine the processor topology or check it, if prescribed by the user
    IF ( npex == -1  .AND.  npey == -1 )  THEN

!
!--    Automatic determination of the topology
       numproc_sqr = SQRT( REAL( numprocs, KIND=wp ) )
       npex        = MAX( numproc_sqr , 1 )
       DO  WHILE ( MOD( numprocs , npex ) /= 0 )
          npex = npex - 1
       ENDDO
       npey = numprocs / npex

    ELSEIF ( npex /= -1  .AND.  npey /= -1 )  THEN

!
!--    Prescribed by user. Number of processors on the prescribed topology must be equal to the
!--    number of PEs available to the job
       IF ( ( npex * npey ) /= numprocs )  THEN
          WRITE( message_string, * ) 'number of PEs of the prescribed ', 'topology (', npex*npey,  &
                                     ') does not match & the number of ',                          &
                                     'PEs available to the job (', numprocs, ')'
          CALL message( 'init_pegrid', 'PA0221', 1, 2, 0, 6, 0 )
       ENDIF
       pe_grid_prescribed = .TRUE.

    ELSE
!
!--    If the processor topology is prescribed by the user, the number of
!--    PEs must be given in both directions
       message_string = 'if the processor topology is prescribed by th' //                         &
                        'e user & both values of "npex" and "npey" must be given' //               &
                        ' in the &NAMELIST-parameter file'
       CALL message( 'init_pegrid', 'PA0222', 1, 2, 0, 6, 0 )

    ENDIF

!
!-- Create four default MPI communicators for the 2d virtual PE grid. One of them will be used as
!-- the main communicator for this run, while others might be used for specific quantities like
!-- aerosol, chemical species, or passive scalars), if their horizontal boundary conditions shall
!-- be different from those of the other quantities (e.g. non-cyclic conditions for aerosols, and
!-- cyclic conditions for all others).
    npe_xy(1) = npex
    npe_xy(2) = npey
    DO  i = 1, 4

       IF ( i == 1 )  cyclic = (/  .TRUE., .TRUE.  /)   ! cyclic along x and y
       IF ( i == 2 )  cyclic = (/  .TRUE., .FALSE. /)   ! cyclic along x
       IF ( i == 3 )  cyclic = (/ .FALSE., .TRUE.  /)   ! cyllic along y
       IF ( i == 4 )  cyclic = (/ .FALSE., .FALSE. /)   ! non-cyclic

       CALL MPI_CART_CREATE( comm_palm, ndim, npe_xy, cyclic, reorder,                             &
                             communicator_configurations(i)%mpi_communicator, ierr )

       CALL MPI_CART_SHIFT( communicator_configurations(i)%mpi_communicator, 0, 1,                 &
                            communicator_configurations(i)%pleft,                                  &
                            communicator_configurations(i)%pright, ierr )

       CALL MPI_CART_SHIFT( communicator_configurations(i)%mpi_communicator, 1, 1,                 &
                            communicator_configurations(i)%psouth,                                 &
                            communicator_configurations(i)%pnorth, ierr )

    ENDDO

!
!-- If necessary, set horizontal boundary conditions to non-cyclic
    IF ( bc_lr /= 'cyclic' )  cyclic(1) = .FALSE.
    IF ( bc_ns /= 'cyclic' )  cyclic(2) = .FALSE.


!
!-- Set the main communicator (virtual pe grid) for this run
    IF ( bc_lr == 'cyclic'  .AND.  bc_ns == 'cyclic' )  i = 1
    IF ( bc_lr == 'cyclic'  .AND.  bc_ns /= 'cyclic' )  i = 2
    IF ( bc_lr /= 'cyclic'  .AND.  bc_ns == 'cyclic' )  i = 3
    IF ( bc_lr /= 'cyclic'  .AND.  bc_ns /= 'cyclic' )  i = 4

    comm2d = communicator_configurations(i)%mpi_communicator
    pleft  = communicator_configurations(i)%pleft
    pright = communicator_configurations(i)%pright
    psouth = communicator_configurations(i)%psouth
    pnorth = communicator_configurations(i)%pnorth

!
!-- Set rank and coordinates of the main communicator
    CALL MPI_COMM_RANK( comm2d, myid, ierr )
    WRITE (myid_char,'(''_'',I6.6)')  myid

    CALL MPI_CART_COORDS( comm2d, myid, ndim, pcoord, ierr )

!
!-- In case of cyclic boundary conditions, a y-shift at the boundaries in x-direction can be
!-- introduced via parameter y_shift. The shift is done by modifying the processor grid in such a
!-- way that processors located at the x-boundary communicate across it to processors with
!-- y-coordinate shifted by y_shift relative to their own. This feature can not be used in
!-- combination with an fft pressure solver. It has been implemented to counter the effect of streak
!-- structures in case of cyclic boundary conditions. For a description of these see Munters
!-- (2016; dx.doi.org/10.1063/1.4941912)
!--
!-- Get coordinates of left and right neighbor on PE grid
    IF ( y_shift /= 0 ) THEN
       IF ( bc_lr == 'cyclic' ) THEN
          IF ( TRIM( psolver ) /= 'multigrid' .AND.  TRIM( psolver ) /= 'multigrid_noopt')  THEN
             message_string = 'y_shift /= 0 requires a multigrid pressure solver '
             CALL message( 'check_parameters', 'PA0468', 1, 2, 0, 6, 0 )
          ENDIF

          CALL MPI_CART_COORDS( comm2d, pright, ndim, rcoord, ierr )
          CALL MPI_CART_COORDS( comm2d, pleft, ndim, lcoord, ierr )

!
!--       If the x(y)-coordinate of the right (left) neighbor is smaller (greater) than that of the
!--       calling process, then the calling process is located on the right (left) boundary of the
!--       processor grid. In that case, the y-coordinate of that neighbor is increased (decreased)
!--       by y_shift.
!--       The rank of the process with that coordinate is then inquired and the neighbor rank for
!--       MPI_SENDRECV, pright (pleft) is set to it.
!--       In this way, the calling process receives a new right (left) neighbor for all future
!--       MPI_SENDRECV calls. That neighbor has a y-coordinate of y+(-)y_shift, where y is the
!--       original right (left) neighbor's y-coordinate. The modulo-operation ensures that if the
!--       neighbor's y-coordinate exceeds the grid-boundary, it will be relocated to the opposite
!--       part of the grid cyclicly.
          IF ( rcoord(1) < pcoord(1) )  THEN
             rcoord(2) = MODULO( rcoord(2) + y_shift, npey )
             CALL MPI_CART_RANK( comm2d, rcoord, pright, ierr )
          ENDIF

          IF ( lcoord(1) > pcoord(1) )  THEN
             lcoord(2) = MODULO( lcoord(2) - y_shift, npey )
             CALL MPI_CART_RANK( comm2d, lcoord, pleft, ierr )
          ENDIF

       ELSE
!
!--       y-shift for non-cyclic boundary conditions is only implemented
!--       for the turbulence recycling method in inflow_turbulence.f90
          IF ( .NOT. turbulent_inflow )  THEN
             message_string = 'y_shift /= 0 is only allowed for cyclic ' //                        &
                              'boundary conditions in both directions '  //                        &
                              'or with turbulent_inflow == .TRUE.'
             CALL message( 'check_parameters', 'PA0467', 1, 2, 0, 6, 0 )
          ENDIF
       ENDIF
    ENDIF

!
!-- Determine sub-topologies for transpositions
!-- Transposition from z to x:
    remain_dims(1) = .TRUE.
    remain_dims(2) = .FALSE.
    CALL MPI_CART_SUB( comm2d, remain_dims, comm1dx, ierr )
    CALL MPI_COMM_RANK( comm1dx, myidx, ierr )
!
!-- Transposition from x to y
    remain_dims(1) = .FALSE.
    remain_dims(2) = .TRUE.
    CALL MPI_CART_SUB( comm2d, remain_dims, comm1dy, ierr )
    CALL MPI_COMM_RANK( comm1dy, myidy, ierr )

!
!-- Check if subdomains are uniform or non-uniform
    CALL check_subdomain_uniformity

!
!-- Calculate array bounds along x-direction for every PE.
    ALLOCATE( nnx_pe(0:npex-1), nny_pe(0:npey-1), nxl_pe(0:npex-1),                    &
              nxr_pe(0:npex-1), nyn_pe(0:npey-1), nys_pe(0:npey-1) )

!
!-- Compute the local index bounds (nxl, nyr, nys, nyn) on each of the subdomains.
!-- Start with array bounds along x. Begin with calculating the minimum grid point
!-- number and the possible rest.
    IF ( non_uniform_subdomain )  THEN
       nnx     = ( nx + 1 ) / npex
       irest_x = MOD( nx+1, npex )
    ELSE
       irest_x = 0
       IF ( MOD( nx+1, npex ) /= 0 )  THEN
          WRITE( message_string, * ) 'x-direction: gridpoint number (' ,nx+1, ') ',                &
                                     'is not an& integral multiple of the number',                 &
                                     ' of processors (', npex, ')'
          CALL message( 'init_pegrid', 'PA0225', 1, 2, 0, 6, 0 )
       ELSE
          nnx  = ( nx + 1 ) / npex
       ENDIF
    ENDIF

!
!-- Left and right array bounds on each PE.
    DO  i = 0, npex-1
       IF ( i < irest_x )  THEN
!
!--       PEs with low id will get one additional grid point (nnx+1), i.e. the surplus points
!--       irest_x will be distributed among these PEs.
          nxl_pe(i) = i         * ( nnx + 1 )
          nxr_pe(i) = ( i + 1 ) * ( nnx + 1 ) - 1
       ELSE
!
!--       The remaining PEs with higher id get nnx points. Be aware of the larger size
!--       of the low id subdomains when calculating the left index bound.
          nxl_pe(i) = irest_x * ( nnx + 1 ) + ( i - irest_x ) * nnx
          nxr_pe(i) = nxl_pe(i) + nnx - 1
       ENDIF
!
!--    Grid points along x.
       nnx_pe(i) = nxr_pe(i) - nxl_pe(i) + 1
    ENDDO

!
!-- Now calculate array bounds in y-direction. Begin with calculating the minimum grid point
!-- number and the possible rest.
    IF ( non_uniform_subdomain )  THEN
       nny     = ( ny + 1 ) / npey
       irest_y = MOD( ny+1, npey )
    ELSE
       irest_y = 0
       IF ( MOD( ny+1, npey ) /= 0 )  THEN
          WRITE( message_string, * ) 'y-direction: gridpoint number (', ny+1, ') ',                &
                                     'is not an& integral multiple of the number',                 &
                                     ' of processors (', npey, ')'
          CALL message( 'init_pegrid', 'PA0227', 1, 2, 0, 6, 0 )
       ELSE
          nny  = ( ny + 1 ) / npey
       ENDIF
   ENDIF

!
!-- South and north array bounds on each PE.
    DO  j = 0, npey-1
       IF ( j < irest_y )  THEN
!
!--       PEs with low id will get one additional grid point (nny+1), i.e. the surplus points
!--       irest_y will be distributed among these PEs.
          nys_pe(j) = j         * ( nny + 1 )
          nyn_pe(j) = ( j + 1 ) * ( nny + 1 ) - 1
       ELSE
!
!--       The remaining PEs with higher id get nny points. Be aware of the larger size
!--       of the low id subdomains when calculating the left index bound.
          nys_pe(j) = irest_y * ( nny + 1 ) + ( j - irest_y ) * nny
          nyn_pe(j) = nys_pe(j) + nny - 1
       ENDIF
!
!--    Grid points along y.
       nny_pe(j) = nyn_pe(j) - nys_pe(j) + 1
    ENDDO

!
!-- Local array bounds and grid points on this PE.
    nxl = nxl_pe(pcoord(1))
    nxr = nxr_pe(pcoord(1))
    nys = nys_pe(pcoord(2))
    nyn = nyn_pe(pcoord(2))
    nzb = 0
    nzt = nz
    nnz = nz

    nnx = nxr-nxl+1
    nny = nyn-nys+1

!
!-- Set switches to define if the PE is situated at the border of the virtual processor grid
    IF ( nxl == 0 )   left_border_pe  = .TRUE.
    IF ( nxr == nx )  right_border_pe = .TRUE.
    IF ( nys == 0 )   south_border_pe = .TRUE.
    IF ( nyn == ny )  north_border_pe = .TRUE.

!
!-- Calculate array bounds and gridpoint numbers for the transposed arrays (needed in the pressure
!-- solver).
!-- For the transposed arrays, cyclic boundaries as well as top and bottom boundaries are omitted,
!-- because they are obstructive to the transposition.
    IF ( non_uniform_subdomain )  THEN

       CALL setup_transpose_indices

    ELSE
!
!--    1. transposition  z --> x
!--    This transposition is not neccessary in case of a 1d-decomposition along x
       IF ( psolver(1:7) == 'poisfft'  .OR.  calculate_spectra )  THEN
          IF ( npey /= 1 )  THEN
             IF ( MOD( nz , npex ) /= 0 )  THEN
                WRITE( message_string, * ) 'transposition z --> x:& ',                             &
                                           'nz=', nz, ' is not an integral multiple ',             &
                                           'of npex=', npex
                CALL message( 'init_pegrid', 'PA0230', 1, 2, 0, 6, 0 )
             ENDIF
          ENDIF
       ENDIF

       nys_x = nys
       nyn_x = nyn
       nny_x = nny
       nnz_x = nz / npex
       nzb_x = 1 + myidx * nnz_x
       nzt_x = ( myidx + 1 ) * nnz_x
       sendrecvcount_zx = nnx * nny * nnz_x

       IF ( psolver(1:7) == 'poisfft' )  THEN
!
!--       2. transposition  x --> y
          IF ( MOD( nx+1 , npey ) /= 0 )  THEN
             WRITE( message_string, * ) 'transposition x --> y:& ',                                &
                                        'nx+1=', nx+1, ' is not an integral ',                     &
                                        'multiple of npey=', npey
             CALL message( 'init_pegrid', 'PA0231', 1, 2, 0, 6, 0 )
          ENDIF

          nnz_y = nnz_x
          nzb_y = nzb_x
          nzt_y = nzt_x
          nnx_y = (nx+1) / npey
          nxl_y = myidy * nnx_y
          nxr_y = ( myidy + 1 ) * nnx_y - 1
          sendrecvcount_xy = nnx_y * nny_x * nnz_y

!
!--       3. transposition  y --> z
!--       (ELSE:  x --> y  in case of 1D-decomposition along x)
          nxl_z = nxl_y
          nxr_z = nxr_y
          nny_z = (ny+1) / npex
          nys_z = myidx * nny_z
          nyn_z = ( myidx + 1 ) * nny_z - 1
          sendrecvcount_yz = nnx_y * nny_z * nnz_y

          IF ( npey /= 1 )  THEN
!
!--          y --> z
!--          This transposition is not neccessary in case of a 1d-decomposition
!--          along x, except that the uptream-spline method is switched on
             IF ( MOD( ny+1 , npex ) /= 0 )  THEN
                WRITE( message_string, * ) 'transposition y --> z:& ',                             &
                                           'ny+1=', ny+1, ' is not an integral ',                  &
                                           'multiple of npex=', npex
                CALL message( 'init_pegrid', 'PA0232', 1, 2, 0, 6, 0 )
             ENDIF

          ELSE
!
!--          x --> y
!--          This condition must be fulfilled for a 1D-decomposition along x
             IF ( MOD( ny+1 , npex ) /= 0 )  THEN
                WRITE( message_string, * ) 'transposition x --> y:& ',                             &
                                           'ny+1=', ny+1, ' is not an integral ',                  &
                                           'multiple of npex=', npex
                CALL message( 'init_pegrid', 'PA0233', 1, 2, 0, 6, 0 )
             ENDIF

          ENDIF

       ENDIF

!
!--    Indices for direct transpositions z --> y (used for calculating spectra)
       IF ( calculate_spectra )  THEN
          IF ( MOD( nz, npey ) /= 0 )  THEN
             WRITE( message_string, * ) 'direct transposition z --> y (needed ',                   &
                                        'for spectra):& nz=', nz, ' is not an ',                   &
                                        'integral multiple of npey=', npey
             CALL message( 'init_pegrid', 'PA0234', 1, 2, 0, 6, 0 )
          ELSE
             nxl_yd = nxl
             nxr_yd = nxr
             nzb_yd = 1 + myidy * ( nz / npey )
             nzt_yd = ( myidy + 1 ) * ( nz / npey )
             sendrecvcount_zyd = nnx * nny * ( nz / npey )
          ENDIF
       ENDIF

       IF ( psolver(1:7) == 'poisfft'  .OR.  calculate_spectra )  THEN
!
!--       Indices for direct transpositions y --> x
!--       (they are only possible in case of a 1d-decomposition along x)
          IF ( npey == 1 )  THEN
             nny_x = nny / npex
             nys_x = myid * nny_x
             nyn_x = ( myid + 1 ) * nny_x - 1
             nzb_x = 1
             nzt_x = nz
             sendrecvcount_xy = nnx * nny_x * nz
          ENDIF

       ENDIF

       IF ( psolver == 'poisfft(1:7)' )  THEN
!
!--       Indices for direct transpositions x --> y
!--       (they are only possible in case of a 1d-decomposition along y)
          IF ( npex == 1 )  THEN
             nnx_y = nnx / npey
             nxl_y = myid * nnx_y
             nxr_y = ( myid + 1 ) * nnx_y - 1
             nzb_y = 1
             nzt_y = nz
             sendrecvcount_xy = nnx_y * nny * nz
          ENDIF

       ENDIF
!
!--    Set dimension variables here to be compatible with non-uniform grid.
       nxr_x_max = nxr
       nzt_x_max = nzt_x
       nnx_x_max = nnx
       nnz_x_max = nnz_x
       nyn_x_max = nyn_x
       nx_y_max  = nx
       nnx_y_max = nnx_y
       nxr_y_max = nxr_y
       nnz_z_max = nzt_y-nzb_y+1
       nz_x_max  = nz
       nzt_y_max = nzt_y
       nyn_z_max = nyn_z
       ny_z_max  = ny

       IF ( calculate_spectra )  THEN
          nyn_yd_max = nyn
          nnz_yd_max = nnz
          nz_yd_max  = nz
          nny_yd_max = nny
          nzt_yd_max = nzt_yd
       ENDIF

    ENDIF

!
!-- Collect index bounds from other PEs (to be written to Fortran I/O restart file later)
    ALLOCATE( hor_index_bounds(4,0:numprocs-1) )

    IF ( myid == 0 )  THEN

       hor_index_bounds(1,0) = nxl
       hor_index_bounds(2,0) = nxr
       hor_index_bounds(3,0) = nys
       hor_index_bounds(4,0) = nyn

!
!--    Receive data from all other PEs
       DO  i = 1, numprocs-1
          CALL MPI_RECV( ibuf, 4, MPI_INTEGER, i, MPI_ANY_TAG, comm2d, status, ierr )
          hor_index_bounds(:,i) = ibuf(1:4)
       ENDDO

    ELSE
!
!--    Send index bounds to PE0
       ibuf(1) = nxl
       ibuf(2) = nxr
       ibuf(3) = nys
       ibuf(4) = nyn
       CALL MPI_SEND( ibuf, 4, MPI_INTEGER, 0, myid, comm2d, ierr )

    ENDIF

!
!-- Determine the number of ghost point layers
    IF ( scalar_advec == 'ws-scheme'  .OR.  momentum_advec == 'ws-scheme'  .OR.  nested_run )  THEN
       nbgp = 3
    ELSE
       nbgp = 1
    ENDIF

!
!-- Check that the number of computational grid points is not smaller than the number of ghost
!-- points.
    IF ( nnx < nbgp )  THEN
       WRITE( message_string, * ) 'number of subdomain grid points along x (', nnx, ') is smaller',&
                                  'than the number of ghost points (', nbgp, ')'
       CALL message( 'init_pegrid', 'PA0682', 1, 2, 0, 6, 0 )
    ENDIF
    IF ( nny < nbgp )  THEN
       WRITE( message_string, * ) 'number of subdomain grid points along y (', nny, ') is smaller',&
                                  'than the number of ghost points (', nbgp, ')'
       CALL message( 'init_pegrid', 'PA0683', 1, 2, 0, 6, 0 )
    ENDIF

!
!-- Create a new MPI derived datatype for the exchange of surface (xy) data, which is needed for
!-- coupled atmosphere-ocean runs.
!-- First, calculate number of grid points of an xy-plane.
    ngp_xy  = ( nxr - nxl + 1 + 2 * nbgp ) * ( nyn - nys + 1 + 2 * nbgp )
    CALL MPI_TYPE_VECTOR( ngp_xy, 1, nzt-nzb+2, MPI_REAL, type_xy, ierr )
    CALL MPI_TYPE_COMMIT( type_xy, ierr )

    IF ( TRIM( coupling_mode ) /= 'uncoupled' )  THEN

!
!--    Pass the number of grid points of the atmosphere model to
!--    the ocean model and vice versa
       IF ( coupling_mode == 'atmosphere_to_ocean' )  THEN

          nx_a = nx
          ny_a = ny

          IF ( myid == 0 )  THEN

             CALL MPI_SEND( nx_a, 1, MPI_INTEGER, numprocs, 1, comm_inter, ierr )
             CALL MPI_SEND( ny_a, 1, MPI_INTEGER, numprocs, 2, comm_inter,  ierr )
             CALL MPI_SEND( npe_xy, 2, MPI_INTEGER, numprocs, 3, comm_inter, ierr )
             CALL MPI_RECV( nx_o, 1, MPI_INTEGER, numprocs, 4, comm_inter, status, ierr )
             CALL MPI_RECV( ny_o, 1, MPI_INTEGER, numprocs, 5, comm_inter, status, ierr )
             CALL MPI_RECV( npe_xy_remote, 2, MPI_INTEGER, numprocs, 6, comm_inter, status, ierr )
          ENDIF

          CALL MPI_BCAST( nx_o, 1, MPI_INTEGER, 0, comm2d, ierr )
          CALL MPI_BCAST( ny_o, 1, MPI_INTEGER, 0, comm2d, ierr )
          CALL MPI_BCAST( npe_xy_remote, 2, MPI_INTEGER, 0, comm2d, ierr )

       ELSEIF ( coupling_mode == 'ocean_to_atmosphere' )  THEN

          nx_o = nx
          ny_o = ny

          IF ( myid == 0 )  THEN

             CALL MPI_RECV( nx_a, 1, MPI_INTEGER, 0, 1, comm_inter, status, ierr )
             CALL MPI_RECV( ny_a, 1, MPI_INTEGER, 0, 2, comm_inter, status, ierr )
             CALL MPI_RECV( npe_xy_remote, 2, MPI_INTEGER, 0, 3, comm_inter, status, ierr )
             CALL MPI_SEND( nx_o, 1, MPI_INTEGER, 0, 4, comm_inter, ierr )
             CALL MPI_SEND( ny_o, 1, MPI_INTEGER, 0, 5, comm_inter, ierr )
             CALL MPI_SEND( npe_xy, 2, MPI_INTEGER, 0, 6, comm_inter, ierr )
          ENDIF

          CALL MPI_BCAST( nx_a, 1, MPI_INTEGER, 0, comm2d, ierr)
          CALL MPI_BCAST( ny_a, 1, MPI_INTEGER, 0, comm2d, ierr)
          CALL MPI_BCAST( npe_xy_remote, 2, MPI_INTEGER, 0, comm2d, ierr)

       ENDIF

       ngp_a = ( nx_a+1 + 2 * nbgp ) * ( ny_a+1 + 2 * nbgp )
       ngp_o = ( nx_o+1 + 2 * nbgp ) * ( ny_o+1 + 2 * nbgp )

!
!--    Determine if the horizontal grid and the number of PEs in ocean and atmosphere is same or not.
       IF ( nx_o == nx_a  .AND.  ny_o == ny_a  .AND.                                               &
            npex == npe_xy_remote(1)  .AND.  npey == npe_xy_remote(2) )  THEN
          coupling_topology = 0
       ELSE
          coupling_topology = 1
       ENDIF

!
!--    Determine the target PEs for the exchange between ocean and atmosphere (comm2d)
       IF ( coupling_topology == 0 )  THEN
!
!--       In case of identical topologies, every atmosphere PE has exactly one ocean PE counterpart
!--       and vice versa
          IF ( TRIM( coupling_mode ) == 'atmosphere_to_ocean' )  THEN
             target_id = myid + numprocs
          ELSE
             target_id = myid
          ENDIF

       ELSE
!
!--       In case of nonequivalent topology in ocean and atmosphere only for PE0 in ocean and PE0 in
!--       atmosphere a target_id is needed, since data echxchange between ocean and atmosphere will
!--       be done only between these PEs.
          IF ( myid == 0 )  THEN

             IF ( TRIM( coupling_mode ) == 'atmosphere_to_ocean' )  THEN
                target_id = numprocs
             ELSE
                target_id = 0
             ENDIF

          ENDIF

       ENDIF

    ENDIF

    serial_run = .FALSE.
#else

!
!-- Else branch contains settings for serial run.
    serial_run = .TRUE.
    pe_grid_prescribed = .FALSE.
    npex = 1
    npey = 1

!
!-- Array bounds when running on a single PE (respectively a non-parallel machine)
    nxl = 0
    nxr = nx
    nnx = nxr - nxl + 1
    nys = 0
    nyn = ny
    nny = nyn - nys + 1
    nzb = 0
    nzt = nz
    nnz = nz

    ALLOCATE( hor_index_bounds(4,0:0) )
    hor_index_bounds(1,0) = nxl
    hor_index_bounds(2,0) = nxr
    hor_index_bounds(3,0) = nys
    hor_index_bounds(4,0) = nyn

!
!-- Array bounds for the pressure solver (in the parallel code, these bounds are the ones for the
!-- transposed arrays)
    nys_x = nys
    nyn_x = nyn
    nzb_x = nzb + 1
    nzt_x = nzt

    nxl_y = nxl
    nxr_y = nxr
    nzb_y = nzb + 1
    nzt_y = nzt

    nxl_z = nxl
    nxr_z = nxr
    nys_z = nys
    nyn_z = nyn

!
!-- Set dimension variables here to be compatible with non-uniform grid.
    nxr_x_max = nxr
    nnx_x_max = nnx
    nyn_x_max = nyn_x
    nx_y_max  = nx
    nz_x_max  = nz
    nzt_y_max = nzt_y
    ny_z_max  = ny

#endif

!
!-- Calculate number of grid levels necessary for the multigrid poisson solver as well as the
!-- gridpoint indices on each level
    IF ( psolver(1:9) == 'multigrid' )  THEN

!
!--    First calculate number of possible grid levels for the subdomains
       mg_levels_x = 1
       mg_levels_y = 1
       mg_levels_z = 1

       i = nnx
       DO WHILE ( MOD( i, 2 ) == 0  .AND.  i /= 2 )
          i = i / 2
          mg_levels_x = mg_levels_x + 1
       ENDDO

       j = nny
       DO WHILE ( MOD( j, 2 ) == 0  .AND.  j /= 2 )
          j = j / 2
          mg_levels_y = mg_levels_y + 1
       ENDDO

       k = nz    ! do not use nnz because it might be > nz due to transposition
                 ! requirements
       DO WHILE ( MOD( k, 2 ) == 0  .AND.  k /= 2 )
          k = k / 2
          mg_levels_z = mg_levels_z + 1
       ENDDO
!
!--    The optimized MG-solver does not allow odd values for nz at the coarsest grid level
       IF ( TRIM( psolver ) /= 'multigrid_noopt' )  THEN
          IF ( MOD( k, 2 ) /= 0 )  mg_levels_z = mg_levels_z - 1
!
!--       An odd value of nz does not work. The finest level must have an even value.
          IF (  mg_levels_z == 0 )  THEN
             message_string = 'optimized multigrid method requires nz to be even'
             CALL message( 'init_pegrid', 'PA0495', 1, 2, 0, 6, 0 )
          ENDIF
       ENDIF

       maximum_grid_level = MIN( mg_levels_x, mg_levels_y, mg_levels_z )
!
!--    Check if subdomain sizes prevents any coarsening.
!--    This case, the maximum number of grid levels is 1, i.e. effectively a Gauss-Seidel scheme is
!--    applied rather than a multigrid approach.
!--    Give a warning in this case.
       IF ( maximum_grid_level == 1  .AND.  mg_switch_to_pe0_level == -1 )  THEN
          message_string = 'No grid coarsening possible, multigrid ' //                            &
                           'approach effectively reduces to a Gauss-Seidel ' //                    &
                           'scheme.'

          CALL message( 'poismg', 'PA0648', 0, 1, 0, 6, 0 )
       ENDIF

!
!--    Find out, if the total domain allows more levels. These additional levels are identically
!--    processed on all PEs.
       IF ( numprocs > 1  .AND.  mg_switch_to_pe0_level /= -1 )  THEN

          IF ( mg_levels_z > MIN( mg_levels_x, mg_levels_y ) )  THEN

             mg_switch_to_pe0_level_l = maximum_grid_level

             mg_levels_x = 1
             mg_levels_y = 1

             i = nx+1
             DO WHILE ( MOD( i, 2 ) == 0  .AND.  i /= 2 )
                i = i / 2
                mg_levels_x = mg_levels_x + 1
             ENDDO

             j = ny+1
             DO WHILE ( MOD( j, 2 ) == 0  .AND.  j /= 2 )
                j = j / 2
                mg_levels_y = mg_levels_y + 1
             ENDDO

             maximum_grid_level_l = MIN( mg_levels_x, mg_levels_y, mg_levels_z )

             IF ( maximum_grid_level_l > mg_switch_to_pe0_level_l )  THEN
                mg_switch_to_pe0_level_l = maximum_grid_level_l - mg_switch_to_pe0_level_l + 1
             ELSE
                mg_switch_to_pe0_level_l = 0
             ENDIF

          ELSE

             mg_switch_to_pe0_level_l = 0
             maximum_grid_level_l = maximum_grid_level

          ENDIF

!
!--       Use switch level calculated above only if it is not pre-defined by user
          IF ( mg_switch_to_pe0_level == 0 )  THEN
             IF ( mg_switch_to_pe0_level_l /= 0 )  THEN
                mg_switch_to_pe0_level = mg_switch_to_pe0_level_l
                maximum_grid_level     = maximum_grid_level_l
             ENDIF

          ELSE
!
!--          Check pre-defined value and reset to default, if neccessary
             IF ( mg_switch_to_pe0_level < mg_switch_to_pe0_level_l  .OR.                          &
                  mg_switch_to_pe0_level >= maximum_grid_level_l )  THEN
                message_string = 'mg_switch_to_pe0_level ' //                                      &
                                 'out of range and reset to 0'
                CALL message( 'init_pegrid', 'PA0235', 0, 1, 0, 6, 0 )
                mg_switch_to_pe0_level = 0
             ELSE
!
!--             Use the largest number of possible levels anyway and recalculate the switch level to
!--             this largest number of possible values
                maximum_grid_level = maximum_grid_level_l

             ENDIF

          ENDIF

       ENDIF

       ALLOCATE( grid_level_count(maximum_grid_level),                                             &
                 nxl_mg(0:maximum_grid_level), nxr_mg(0:maximum_grid_level),                       &
                 nyn_mg(0:maximum_grid_level), nys_mg(0:maximum_grid_level),                       &
                 nzt_mg(0:maximum_grid_level) )

       grid_level_count = 0
!
!--    Index zero required as dummy due to definition of arrays f2 and p2 in recursive subroutine
!--    next_mg_level
       nxl_mg(0) = 0; nxr_mg(0) = 0; nyn_mg(0) = 0; nys_mg(0) = 0; nzt_mg(0) = 0

       nxl_l = nxl; nxr_l = nxr; nys_l = nys; nyn_l = nyn; nzt_l = nzt

       DO  i = maximum_grid_level, 1 , -1

          IF ( i == mg_switch_to_pe0_level )  THEN
#if defined( __parallel )
!
!--          Save the grid size of the subdomain at the switch level, because it is needed in poismg.
             ind(1) = nxl_l; ind(2) = nxr_l
             ind(3) = nys_l; ind(4) = nyn_l
             ind(5) = nzt_l
             ALLOCATE( ind_all(5*numprocs), mg_loc_ind(5,0:numprocs-1) )
             CALL MPI_ALLGATHER( ind, 5, MPI_INTEGER, ind_all, 5, &
                                 MPI_INTEGER, comm2d, ierr )
             DO  j = 0, numprocs-1
                DO  k = 1, 5
                   mg_loc_ind(k,j) = ind_all(k+j*5)
                ENDDO
             ENDDO
             DEALLOCATE( ind_all )
!
!--          Calculate the grid size of the total domain
             nxr_l = ( nxr_l-nxl_l+1 ) * npex - 1
             nxl_l = 0
             nyn_l = ( nyn_l-nys_l+1 ) * npey - 1
             nys_l = 0
!
!--          The size of this gathered array must not be larger than the array tend, which is used
!--          in the multigrid scheme as a temporary array. Therefore the subdomain size of an PE is
!--          calculated and the size of the gathered grid. These values are used in routines pres
!--          and poismg.
             subdomain_size = ( nxr - nxl + 2 * nbgp + 1 ) *                                       &
                              ( nyn - nys + 2 * nbgp + 1 ) * ( nzt - nzb + 2 )
             gathered_size  = ( nxr_l - nxl_l + 3 ) * ( nyn_l - nys_l + 3 ) * ( nzt_l - nzb + 2 )

#else
             message_string = 'multigrid gather/scatter impossible ' //                            &
                          'in non parallel mode'
             CALL message( 'init_pegrid', 'PA0237', 1, 2, 0, 6, 0 )
#endif
          ENDIF

          nxl_mg(i) = nxl_l
          nxr_mg(i) = nxr_l
          nys_mg(i) = nys_l
          nyn_mg(i) = nyn_l
          nzt_mg(i) = nzt_l

          nxl_l = nxl_l / 2
          nxr_l = nxr_l / 2
          nys_l = nys_l / 2
          nyn_l = nyn_l / 2
          nzt_l = nzt_l / 2

       ENDDO

!
!--    Temporary problem: Currently calculation of maxerror in routine poismg crashes if grid data
!--    are collected on PE0 already on the finest grid level.
!--    To be solved later.
       IF ( maximum_grid_level == mg_switch_to_pe0_level )  THEN
          message_string = 'grid coarsening on subdomain level cannot be performed'
          CALL message( 'poismg', 'PA0236', 1, 2, 0, 6, 0 )
       ENDIF

    ELSE

       maximum_grid_level = 0

    ENDIF

!
!-- Default level 0 tells exchange_horiz that all ghost planes have to be exchanged. grid_level is
!-- adjusted in poismg, where only one ghost plane is required.
    grid_level = 0

#if defined( __parallel )
!
!-- Gridpoint number for the exchange of ghost points (y-line for 2D-arrays)
    ngp_y  = nyn - nys + 1 + 2 * nbgp

!
!-- Define new MPI derived datatypes for the exchange of ghost points in x- and y-direction for
!-- 2D-arrays (line)
    CALL MPI_TYPE_VECTOR( nxr-nxl+1+2*nbgp, nbgp, ngp_y, MPI_REAL, type_x, ierr )
    CALL MPI_TYPE_COMMIT( type_x, ierr )

    CALL MPI_TYPE_VECTOR( nbgp, ngp_y, ngp_y, MPI_REAL, type_y, ierr )
    CALL MPI_TYPE_COMMIT( type_y, ierr )
!
!-- Define new MPI derived datatypes for the exchange of ghost points in x- and y-direction for
!-- 2D-INTEGER arrays (line) - on normal grid.
!-- Define types for 32-bit and 8-bit Integer. The 8-bit Integer are only required on normal grid,
!-- while 32-bit Integer may be also required on coarser grid level in case of multigrid solver.
!
!-- 8-bit Integer
    CALL MPI_TYPE_VECTOR( nxr-nxl+1+2*nbgp, nbgp, ngp_y, MPI_BYTE, type_x_byte, ierr )
    CALL MPI_TYPE_COMMIT( type_x_byte, ierr )

    CALL MPI_TYPE_VECTOR( nbgp, ngp_y, ngp_y, MPI_BYTE, type_y_byte, ierr )
    CALL MPI_TYPE_COMMIT( type_y_byte, ierr )
!
!-- 32-bit Integer
    ALLOCATE( type_x_int(0:maximum_grid_level), type_y_int(0:maximum_grid_level) )

    CALL MPI_TYPE_VECTOR( nxr-nxl+1+2*nbgp, nbgp, ngp_y, MPI_INTEGER, type_x_int(0), ierr )
    CALL MPI_TYPE_COMMIT( type_x_int(0), ierr )

    CALL MPI_TYPE_VECTOR( nbgp, ngp_y, ngp_y, MPI_INTEGER, type_y_int(0), ierr )
    CALL MPI_TYPE_COMMIT( type_y_int(0), ierr )
!
!-- Calculate gridpoint numbers for the exchange of ghost points along x (yz-plane for 3D-arrays)
!-- and define MPI derived data type(s) for the exchange of ghost points in y-direction (xz-plane).
!-- Do these calculations for the model grid and (if necessary) also for the coarser grid levels
!-- used in the multigrid method
    ALLOCATE ( ngp_xz(0:maximum_grid_level),                                                       &
               ngp_xz_int(0:maximum_grid_level),                                                   &
               ngp_yz(0:maximum_grid_level),                                                       &
               ngp_yz_int(0:maximum_grid_level),                                                   &
               type_xz(0:maximum_grid_level),                                                      &
               type_xz_int(0:maximum_grid_level),                                                  &
               type_yz(0:maximum_grid_level),                                                      &
               type_yz_int(0:maximum_grid_level) )

    nxl_l = nxl; nxr_l = nxr; nys_l = nys; nyn_l = nyn; nzb_l = nzb; nzt_l = nzt

!
!-- Discern between the model grid, which needs nbgp ghost points and grid levels for the multigrid
!-- scheme. In the latter case only one ghost point is necessary.
!-- First definition of MPI-datatypes for exchange of ghost layers on normal grid. The following
!-- loop is needed for data exchange in poismg.f90.
!
!-- Determine number of grid points of yz-layer for exchange
    ngp_yz(0) = (nzt - nzb + 2) * (nyn - nys + 1 + 2 * nbgp)

!
!-- Define an MPI-datatype for the exchange of left/right boundaries.
!-- Although data are contiguous in physical memory (which does not necessarily require an
!-- MPI-derived datatype), the data exchange between left and right PE's using the MPI-derived type
!-- is 10% faster than without.
    CALL MPI_TYPE_VECTOR( nxr-nxl+1+2*nbgp, nbgp*(nzt-nzb+2), ngp_yz(0), MPI_REAL, type_xz(0),     &
                          ierr )
    CALL MPI_TYPE_COMMIT( type_xz(0), ierr )

    CALL MPI_TYPE_VECTOR( nbgp, ngp_yz(0), ngp_yz(0), MPI_REAL, type_yz(0), ierr )
    CALL MPI_TYPE_COMMIT( type_yz(0), ierr )

!
!-- Define data types for exchange of 3D Integer arrays.
    ngp_yz_int(0) = (nzt - nzb + 2) * (nyn - nys + 1 + 2 * nbgp)

    CALL MPI_TYPE_VECTOR( nxr-nxl+1+2*nbgp, nbgp*(nzt-nzb+2), ngp_yz_int(0), MPI_INTEGER,          &
                          type_xz_int(0), ierr )
    CALL MPI_TYPE_COMMIT( type_xz_int(0), ierr )

    CALL MPI_TYPE_VECTOR( nbgp, ngp_yz_int(0), ngp_yz_int(0), MPI_INTEGER, type_yz_int(0), ierr )
    CALL MPI_TYPE_COMMIT( type_yz_int(0), ierr )

!
!-- Definition of MPI-datatypes for multigrid method (coarser level grids)
    IF ( psolver(1:9) == 'multigrid' )  THEN
!
!--    Definition of MPI-datatyoe as above, but only 1 ghost level is used
       DO  i = maximum_grid_level, 1 , -1
!
!--       For 3D-exchange on different multigrid level, one ghost point for REAL arrays, two ghost
!--       points for INTEGER arrays
          ngp_xz(i) = (nzt_l - nzb_l + 2) * (nxr_l - nxl_l + 3)
          ngp_yz(i) = (nzt_l - nzb_l + 2) * (nyn_l - nys_l + 3)

          ngp_xz_int(i) = (nzt_l - nzb_l + 2) * (nxr_l - nxl_l + 3)
          ngp_yz_int(i) = (nzt_l - nzb_l + 2) * (nyn_l - nys_l + 3)
!
!--       MPI data type for REAL arrays, for xz-layers
          CALL MPI_TYPE_VECTOR( nxr_l-nxl_l+3, nzt_l-nzb_l+2, ngp_yz(i), MPI_REAL, type_xz(i),     &
                                ierr )
          CALL MPI_TYPE_COMMIT( type_xz(i), ierr )

!
!--       MPI data type for INTEGER arrays, for xz-layers
          CALL MPI_TYPE_VECTOR( nxr_l-nxl_l+3, nzt_l-nzb_l+2, ngp_yz_int(i), MPI_INTEGER,          &
                                type_xz_int(i), ierr )
          CALL MPI_TYPE_COMMIT( type_xz_int(i), ierr )

!
!--       MPI data type for REAL arrays, for yz-layers
          CALL MPI_TYPE_VECTOR( 1, ngp_yz(i), ngp_yz(i), MPI_REAL, type_yz(i), ierr )
          CALL MPI_TYPE_COMMIT( type_yz(i), ierr )
!
!--       MPI data type for INTEGER arrays, for yz-layers
          CALL MPI_TYPE_VECTOR( 1, ngp_yz_int(i), ngp_yz_int(i), MPI_INTEGER, type_yz_int(i), ierr )
          CALL MPI_TYPE_COMMIT( type_yz_int(i), ierr )


!--       For 2D-exchange of INTEGER arrays on coarser grid level, where 2 ghost points need to be
!--       exchanged. Only required for 32-bit Integer arrays.
          CALL MPI_TYPE_VECTOR( nxr_l-nxl_l+5, 2, nyn_l-nys_l+5, MPI_INTEGER, type_x_int(i), ierr )
          CALL MPI_TYPE_COMMIT( type_x_int(i), ierr )


          CALL MPI_TYPE_VECTOR( 2, nyn_l-nys_l+5, nyn_l-nys_l+5, MPI_INTEGER, type_y_int(i), ierr )
          CALL MPI_TYPE_COMMIT( type_y_int(i), ierr )

          nxl_l = nxl_l / 2
          nxr_l = nxr_l / 2
          nys_l = nys_l / 2
          nyn_l = nyn_l / 2
          nzt_l = nzt_l / 2

       ENDDO

    ENDIF

#endif

#if defined( __parallel )
!
!-- Setting of flags for inflow/outflow/nesting conditions.
    IF ( pleft == MPI_PROC_NULL )  THEN
       IF ( bc_lr == 'dirichlet/radiation'  .OR.  bc_lr == 'nested'  .OR.                          &
            bc_lr == 'nesting_offline' )  THEN
          bc_dirichlet_l  = .TRUE.
       ELSEIF ( bc_lr == 'radiation/dirichlet' )  THEN
          bc_radiation_l = .TRUE.
       ENDIF
    ENDIF

    IF ( pright == MPI_PROC_NULL )  THEN
       IF ( bc_lr == 'dirichlet/radiation' )  THEN
          bc_radiation_r = .TRUE.
       ELSEIF ( bc_lr == 'radiation/dirichlet'  .OR.  bc_lr == 'nested'  .OR.                      &
                bc_lr == 'nesting_offline' )  THEN
          bc_dirichlet_r  = .TRUE.
       ENDIF
    ENDIF

    IF ( psouth == MPI_PROC_NULL )  THEN
       IF ( bc_ns == 'dirichlet/radiation' )  THEN
          bc_radiation_s = .TRUE.
       ELSEIF ( bc_ns == 'radiation/dirichlet'  .OR.  bc_ns == 'nested'  .OR.                      &
                bc_ns == 'nesting_offline' )  THEN
          bc_dirichlet_s  = .TRUE.
       ENDIF
    ENDIF

    IF ( pnorth == MPI_PROC_NULL )  THEN
       IF ( bc_ns == 'dirichlet/radiation'  .OR.  bc_ns == 'nested'  .OR.                          &
            bc_ns == 'nesting_offline' )  THEN
          bc_dirichlet_n  = .TRUE.
       ELSEIF ( bc_ns == 'radiation/dirichlet' )  THEN
          bc_radiation_n = .TRUE.
       ENDIF
    ENDIF
!
!-- In case of synthetic turbulence geneartor determine ids.
!-- Please note, if no forcing or nesting is applied, the generator is applied only at the left
!-- lateral boundary.
    IF ( syn_turb_gen )  THEN
       IF ( bc_dirichlet_l )  THEN
          id_stg_left_l = myidx
       ELSE
          id_stg_left_l = 0
       ENDIF
       IF ( bc_dirichlet_r )  THEN
          id_stg_right_l = myidx
       ELSE
          id_stg_right_l = 0
       ENDIF
       IF ( bc_dirichlet_s )  THEN
          id_stg_south_l = myidy
       ELSE
          id_stg_south_l = 0
       ENDIF
       IF ( bc_dirichlet_n )  THEN
          id_stg_north_l = myidy
       ELSE
          id_stg_north_l = 0
       ENDIF

       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
       CALL MPI_ALLREDUCE( id_stg_left_l, id_stg_left,   1, MPI_INTEGER, MPI_SUM, comm1dx, ierr )

       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
       CALL MPI_ALLREDUCE( id_stg_right_l, id_stg_right, 1, MPI_INTEGER, MPI_SUM, comm1dx, ierr )

       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
       CALL MPI_ALLREDUCE( id_stg_south_l, id_stg_south, 1, MPI_INTEGER, MPI_SUM, comm1dy, ierr )

       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
       CALL MPI_ALLREDUCE( id_stg_north_l, id_stg_north, 1, MPI_INTEGER, MPI_SUM, comm1dy, ierr )

    ENDIF

!
!-- Broadcast the id of the inflow PE
    IF ( bc_dirichlet_l )  THEN
       id_inflow_l = myidx
    ELSE
       id_inflow_l = 0
    ENDIF
    IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
    CALL MPI_ALLREDUCE( id_inflow_l, id_inflow, 1, MPI_INTEGER, MPI_SUM, comm1dx, ierr )

!
!-- Broadcast the id of the recycling plane
!-- WARNING: needs to be adjusted in case of inflows other than from left side!
    IF ( turbulent_inflow ) THEN

       IF ( NINT( recycling_width / dx, KIND=idp ) >= nxl  .AND.                                   &
            NINT( recycling_width / dx, KIND=idp ) <= nxr )  THEN
          id_recycling_l = myidx
       ELSE
          id_recycling_l = 0
       ENDIF
       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
       CALL MPI_ALLREDUCE( id_recycling_l, id_recycling, 1, MPI_INTEGER, MPI_SUM, comm1dx, ierr )

    ENDIF

!
!-- Broadcast the id of the outflow PE and outflow-source plane
    IF ( turbulent_outflow )  THEN

       IF ( bc_radiation_r )  THEN
          id_outflow_l = myidx
       ELSE
          id_outflow_l = 0
       ENDIF
       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
       CALL MPI_ALLREDUCE( id_outflow_l, id_outflow, 1, MPI_INTEGER, MPI_SUM, &
                           comm1dx, ierr )

       IF ( NINT( outflow_source_plane / dx ) >= nxl  .AND.                                        &
            NINT( outflow_source_plane / dx ) <= nxr )  THEN
          id_outflow_source_l = myidx
       ELSE
          id_outflow_source_l = 0
       ENDIF
       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
       CALL MPI_ALLREDUCE( id_outflow_source_l, id_outflow_source, 1, MPI_INTEGER, MPI_SUM,        &
                           comm1dx, ierr )

    ENDIF

    CALL location_message( 'creating virtual PE grids + MPI derived data types', 'finished' )

#else
    IF ( bc_lr == 'dirichlet/radiation' )  THEN
       bc_dirichlet_l = .TRUE.
       bc_radiation_r = .TRUE.
    ELSEIF ( bc_lr == 'radiation/dirichlet' )  THEN
       bc_radiation_l = .TRUE.
       bc_dirichlet_r = .TRUE.
    ENDIF

    IF ( bc_ns == 'dirichlet/radiation' )  THEN
       bc_dirichlet_n = .TRUE.
       bc_radiation_s = .TRUE.
    ELSEIF ( bc_ns == 'radiation/dirichlet' )  THEN
       bc_radiation_n = .TRUE.
       bc_dirichlet_s = .TRUE.
    ENDIF
#endif

!
!-- At the inflow or outflow, u or v, respectively, have to be calculated for one more grid point.
    IF ( bc_dirichlet_l  .OR.  bc_radiation_l )  THEN
       nxlu = nxl + 1
    ELSE
       nxlu = nxl
    ENDIF
    IF ( bc_dirichlet_s  .OR.  bc_radiation_s )  THEN
       nysv = nys + 1
    ELSE
       nysv = nys
    ENDIF

!
!-- Allocate wall flag arrays used in the multigrid solver
    IF ( psolver(1:9) == 'multigrid' )  THEN

       DO  i = maximum_grid_level, 1, -1

           SELECT CASE ( i )

              CASE ( 1 )
                 ALLOCATE( wall_flags_1(nzb:nzt_mg(i)+1,                                           &
                                        nys_mg(i)-1:nyn_mg(i)+1,                                   &
                                        nxl_mg(i)-1:nxr_mg(i)+1) )

              CASE ( 2 )
                 ALLOCATE( wall_flags_2(nzb:nzt_mg(i)+1,                                           &
                                        nys_mg(i)-1:nyn_mg(i)+1,                                   &
                                        nxl_mg(i)-1:nxr_mg(i)+1) )

              CASE ( 3 )
                 ALLOCATE( wall_flags_3(nzb:nzt_mg(i)+1,                                           &
                                        nys_mg(i)-1:nyn_mg(i)+1,                                   &
                                        nxl_mg(i)-1:nxr_mg(i)+1) )

              CASE ( 4 )
                 ALLOCATE( wall_flags_4(nzb:nzt_mg(i)+1,                                           &
                                        nys_mg(i)-1:nyn_mg(i)+1,                                   &
                                        nxl_mg(i)-1:nxr_mg(i)+1) )

              CASE ( 5 )
                 ALLOCATE( wall_flags_5(nzb:nzt_mg(i)+1,                                           &
                                        nys_mg(i)-1:nyn_mg(i)+1,                                   &
                                        nxl_mg(i)-1:nxr_mg(i)+1) )

              CASE ( 6 )
                 ALLOCATE( wall_flags_6(nzb:nzt_mg(i)+1,                                           &
                                        nys_mg(i)-1:nyn_mg(i)+1,                                   &
                                        nxl_mg(i)-1:nxr_mg(i)+1) )

              CASE ( 7 )
                 ALLOCATE( wall_flags_7(nzb:nzt_mg(i)+1,                                           &
                                        nys_mg(i)-1:nyn_mg(i)+1,                                   &
                                        nxl_mg(i)-1:nxr_mg(i)+1) )

              CASE ( 8 )
                 ALLOCATE( wall_flags_8(nzb:nzt_mg(i)+1,                                           &
                                        nys_mg(i)-1:nyn_mg(i)+1,                                   &
                                        nxl_mg(i)-1:nxr_mg(i)+1) )

              CASE ( 9 )
                 ALLOCATE( wall_flags_9(nzb:nzt_mg(i)+1,                                           &
                                        nys_mg(i)-1:nyn_mg(i)+1,                                   &
                                        nxl_mg(i)-1:nxr_mg(i)+1) )

              CASE ( 10 )
                 ALLOCATE( wall_flags_10(nzb:nzt_mg(i)+1,                                          &
                                        nys_mg(i)-1:nyn_mg(i)+1,                                   &
                                        nxl_mg(i)-1:nxr_mg(i)+1) )

              CASE DEFAULT
                 message_string = 'more than 10 multigrid levels'
                 CALL message( 'init_pegrid', 'PA0238', 1, 2, 0, 6, 0 )

          END SELECT

       ENDDO

    ENDIF

#if defined( __parallel )
 CONTAINS

!--------------------------------------------------------------------------------------------------!
! Description:
! ------------
!> Checks, if the total domain can be split up to uniform subdomains, so that each PE gets the
!> same number of gridpoints.
!--------------------------------------------------------------------------------------------------!
 SUBROUTINE check_subdomain_uniformity

    non_uniform_subdomain = .TRUE.
!
!-- Multigrid solver and 1D-decompositions don't allow non-uniform subdomains
    IF ( psolver(1:9) == 'multigrid'  .OR.  npex == 1  .OR.  npey == 1 )  THEN
       non_uniform_subdomain = .FALSE.
       RETURN
    ENDIF
!
!-- Check if current grid and PE combination requires non-uniform subdomains.
    IF ( MOD( nx+1, npex ) == 0  .AND.  MOD( ny+1, npey ) == 0 )  THEN
!
!--    Since PE numbers are integral divisors of total gird point numbers,
!--    the subdomains are uniform.
       non_uniform_subdomain = .FALSE.
!
!--    However, we need to switch back to non-uniform subdomains in case that at least one of
!--    the following conditions is true.
       IF ( MOD(   nz , npex ) /= 0  .OR.  MOD( nx+1 , npey ) /= 0  .OR.                   &
            MOD( ny+1 , npex ) /= 0 )                                                          &
       THEN
          non_uniform_subdomain = .TRUE.
       ENDIF
    ENDIF

 END SUBROUTINE check_subdomain_uniformity


!--------------------------------------------------------------------------------------------------!
! Description:
! ------------
!> Calculates indices and gridpoint numbers required for transposition of 3d-arrays in case
!> of a domain decomposition which has non-uniform subdomains.
!--------------------------------------------------------------------------------------------------!
 SUBROUTINE setup_transpose_indices

    INTEGER(iwp) ::  i
    INTEGER(iwp) ::  irest_xy
    INTEGER(iwp) ::  irest_yd
    INTEGER(iwp) ::  irest_yz
    INTEGER(iwp) ::  irest_z
    INTEGER(iwp) ::  j
    INTEGER(iwp) ::  k
    INTEGER(iwp) ::  nnz_yd


    ALLOCATE( nxl_y_pe(0:npey-1) )
    ALLOCATE( nxr_y_pe(0:npey-1) )
    ALLOCATE( nyn_z_pe(0:npex-1) )
    ALLOCATE( nys_z_pe(0:npex-1) )
    ALLOCATE( nzb_x_pe(0:npex-1) )
    ALLOCATE( nzt_x_pe(0:npex-1) )

!
!-- 1. transposition  z --> x
    nys_x   = nys
    nyn_x   = nyn
    nny_x   = nny
    nnz_x   = nz / npex
    irest_z = MOD( nz, npex )

    DO  k = 0, npex-1
       IF ( k < irest_z )  THEN
          nzb_x_pe(k) = k * ( nnz_x + 1 ) + 1
          nzt_x_pe(k) = nzb_x_pe(k) + ( nnz_x + 1 ) - 1
       ELSE
          nzb_x_pe(k) = irest_z * ( nnz_x + 1 ) + ( k - irest_z ) * nnz_x + 1
          nzt_x_pe(k) = nzb_x_pe(k) + nnz_x - 1
       ENDIF
    ENDDO

    nzb_x = nzb_x_pe(myidx)
    nzt_x = nzt_x_pe(myidx)
    nnz_x = nzt_x - nzb_x + 1
!
!-- Calculate index bounds and number of grid points that are required for allocating the
!-- 3d-arrays used by the transpose routine. Arrays on all PEs need to have identical size.
!-- Therefore, the dimensions defined on PE0 are used, because in case of non-uniform
!-- subdomains, at least on PE0 the respective arrays contain one more grid point than on the
!-- other PEs.
    nxr_x_max = nxl + nnx_pe(0) - 1
    nnx_x_max = nnx_pe(0)
    nnz_x_max = nzt_x_pe(0) - nzb_x_pe(0) + 1
    nz_x_max  = npex * nnz_x_max
    nzt_x_max = nzb_x + nnz_x_max - 1
    sendrecvcount_zx = nnx_x_max * nny * nnz_x_max

!
!-- 2. transposition  x --> y
    nnz_y    = nnz_x
    nzb_y    = nzb_x
    nzt_y    = nzt_x
    nnx_y    = (nx+1) / npey
    irest_xy = MOD( (nx+1), npey )

    DO  j = 0, npey-1
       IF ( j < irest_xy )  THEN
          nxl_y_pe(j) = j * ( nnx_y + 1 )
          nxr_y_pe(j) = nxl_y_pe(j) + ( nnx_y + 1 ) - 1
       ELSE
          nxl_y_pe(j) = irest_xy * ( nnx_y + 1 ) + ( j - irest_xy ) * nnx_y
          nxr_y_pe(j) = nxl_y_pe(j) + nnx_y - 1
       ENDIF
    ENDDO

    nxl_y = nxl_y_pe(myidy)
    nxr_y = nxr_y_pe(myidy)
    nnx_y = nxr_y - nxl_y + 1
!
!-- Calculate index bounds and number of grid points that are required for allocating the
!-- 3d-arrays used by the transpose routine. Arrays on all PEs need to have identical size.
!-- Therefore, the dimensions defined on PE0 are used, because in case of non-uniform
!-- subdomains, at least on PE0 the respective arrays contain one more grid point than on the
!-- other PEs.
    nyn_x_max = nys_x + nny_pe(0) - 1
    nx_y_max  = npey * ( nxr_y_pe(0) - nxl_y_pe(0) + 1 ) - 1
    nnx_y_max = nxr_y_pe(0) - nxl_y_pe(0) + 1
    nxr_y_max = nxl_y + nxr_y_pe(0) - nxl_y_pe(0)
    sendrecvcount_xy = ( nyn_x_max-nys_x + 1 ) * ( nzt_y-nzb_y + 1 ) * ( nxr_y_max-nxl_y + 1 )

!
!-- 3. transposition  y --> z
!-- (ELSE:  x --> y  in case of 1D-decomposition along x)
    nxl_z    = nxl_y
    nxr_z    = nxr_y
    nny_z    = ( ny+1 ) / npex
    irest_yz = MOD( (ny+1), npex )

    DO  i = 0, npex-1
       IF ( i < irest_yz )  THEN
          nys_z_pe(i) = i * ( nny_z + 1 )
          nyn_z_pe(i) = nys_z_pe(i) + ( nny_z + 1 ) - 1
       ELSE
          nys_z_pe(i) = irest_yz * ( nny_z + 1 ) + ( i - irest_yz ) * nny_z
          nyn_z_pe(i) = nys_z_pe(i) + nny_z - 1
       ENDIF
    ENDDO

    nys_z = nys_z_pe(myidx)
    nyn_z = nyn_z_pe(myidx)

!
!-- Calculate index bounds and number of grid points that are required for allocating the
!-- 3d-arrays used by the transpose routine. Arrays on all PEs need to have identical size.
!-- Therefore, the dimensions defined on PE0 are used, because in case of non-uniform
!-- subdomains, at least on PE0 the respective arrays contain one more grid point than on the
!-- other PEs.
    nzt_y_max = nzt_x_max
    ny_z_max  = npex * ( nyn_z_pe(0) - nys_z_pe(0) + 1 ) - 1
    nnz_z_max = ( nz - 1 ) / npex + 1
    nny_z_max = nyn_z_pe(0) - nys_z_pe(0) + 1
    nyn_z_max = nys_z + nyn_z_pe(0) - nys_z_pe(0)
    sendrecvcount_yz = ( nxr_z-nxl_z + 1 ) * nnz_z_max * ( nyn_z_max-nys_z + 1 )

!
!-- Indices for direct transpositions z --> y (used for calculating spectra)
    ALLOCATE( nzb_yd_pe(0:npey-1) )
    ALLOCATE( nzt_yd_pe(0:npey-1) )

    nxl_yd   = nxl
    nxr_yd   = nxr
    nnz_yd   = nz / npey
    irest_yd = MOD( nz, npey )

    DO  k = 0, npey-1
       IF ( k < irest_yd )  THEN
          nzb_yd_pe(k) = k * ( nnz_yd + 1 ) + 1
          nzt_yd_pe(k) = nzb_yd_pe(k) + ( nnz_yd + 1 ) - 1
       ELSE
          nzb_yd_pe(k) = irest_yd * ( nnz_yd + 1 ) + ( k - irest_yd ) * nnz_yd + 1
          nzt_yd_pe(k) = nzb_yd_pe(k) + nnz_yd - 1
       ENDIF
    ENDDO

    nzb_yd = nzb_yd_pe(myidy)
    nzt_yd = nzt_yd_pe(myidy)
    nnz_yd = nzt_yd - nzb_yd + 1

    nyn_yd_max = nys + nny_pe(0) - 1
    nnz_yd_max = nzt_yd_pe(0) - nzb_yd_pe(0) + 1
    nz_yd_max  = npey * nnz_yd_max
    nny_yd_max = nny_pe(0)
    nzt_yd_max = nzb_yd + nnz_yd_max - 1
    sendrecvcount_zyd = nnx * nnz_yd_max * nny_yd_max

 END SUBROUTINE setup_transpose_indices
#endif

 END SUBROUTINE init_pegrid