!> @file transpose.f90
!--------------------------------------------------------------------------------------------------!
! This file is part of the PALM model system.
!
! PALM is free software: you can redistribute it and/or modify it under the terms of the GNU General
! Public License as published by the Free Software Foundation, either version 3 of the License, or
! (at your option) any later version.
!
! PALM is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the
! implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
! Public License for more details.
!
! You should have received a copy of the GNU General Public License along with PALM. If not, see
! <http://www.gnu.org/licenses/>.
!
! Copyright 1997-2021 Leibniz Universitaet Hannover
!--------------------------------------------------------------------------------------------------!
!
!
! Description:
! ------------
!> Resorting data for the transposition from x to y. The transposition itself is carried out in
!> transpose_xy.
!--------------------------------------------------------------------------------------------------!

#define __acc_fft_device ( defined( _OPENACC ) && ( defined ( __cuda_fft ) ) )

 MODULE transpose_mod

#if defined( __parallel )
    USE MPI
#endif

    USE kinds

    USE pegrid,                                                                                    &
        ONLY:  collective_wait,                                                                    &
               comm2d,                                                                             &
               comm1dx,                                                                            &
               comm1dy,                                                                            &
               ierr,                                                                               &
               non_uniform_subdomain,                                                              &
               npex,                                                                               &
               npey,                                                                               &
               numprocs,                                                                           &
               sendrecvcount_xy,                                                                   &
               sendrecvcount_yz,                                                                   &
               sendrecvcount_zx,                                                                   &
               sendrecvcount_zyd

    IMPLICIT NONE

    INTEGER(iwp) ::  nxl_y   !< internal index bound for transpositions
    INTEGER(iwp) ::  nxl_yd  !< internal index bound for transpositions
    INTEGER(iwp) ::  nxl_z   !< internal index bound for transpositions
    INTEGER(iwp) ::  nxr_y   !< internal index bound for transpositions
    INTEGER(iwp) ::  nxr_yd  !< internal index bound for transpositions
    INTEGER(iwp) ::  nxr_z   !< internal index bound for transpositions
    INTEGER(iwp) ::  nyn_x   !< internal index bound for transpositions
    INTEGER(iwp) ::  nyn_z   !< internal index bound for transpositions
    INTEGER(iwp) ::  nys_x   !< internal index bound for transpositions
    INTEGER(iwp) ::  nys_z   !< internal index bound for transpositions
    INTEGER(iwp) ::  nzb_x   !< internal index bound for transpositions
    INTEGER(iwp) ::  nzb_y   !< internal index bound for transpositions
    INTEGER(iwp) ::  nzb_yd  !< internal index bound for transpositions
    INTEGER(iwp) ::  nzt_x   !< internal index bound for transpositions
    INTEGER(iwp) ::  nzt_y   !< internal index bound for transpositions
    INTEGER(iwp) ::  nzt_yd  !< internal index bound for transpositions
!
!-- Variables to handle non-uniform subdomains
    INTEGER(iwp) ::  nnx_x_max   !< internal index bound for allocating transpose arrays
    INTEGER(iwp) ::  nnx_y_max   !< internal index bound for allocating transpose arrays
    INTEGER(iwp) ::  nny_yd_max  !< internal index bound for allocating transpose arrays
    INTEGER(iwp) ::  nny_z_max   !< internal index bound for allocating transpose arrays
    INTEGER(iwp) ::  nnz_x_max   !< internal index bound for allocating transpose arrays
    INTEGER(iwp) ::  nnz_yd_max  !< internal index bound for allocating transpose arrays
    INTEGER(iwp) ::  nnz_z_max   !< internal index bound for allocating transpose arrays
    INTEGER(iwp) ::  nxr_x_max   !< internal index bound for allocating transpose arrays
    INTEGER(iwp) ::  nxr_y_max   !< internal index bound for allocating transpose arrays
    INTEGER(iwp) ::  nx_y_max    !< internal index bound for allocating transpose arrays
    INTEGER(iwp) ::  nyn_x_max   !< internal index bound for allocating transpose arrays
    INTEGER(iwp) ::  nyn_yd_max  !< internal index bound for allocating transpose arrays
    INTEGER(iwp) ::  nyn_z_max   !< internal index bound for allocating transpose arrays
    INTEGER(iwp) ::  ny_z_max    !< internal index bound for allocating transpose arrays
    INTEGER(iwp) ::  nzt_x_max   !< internal index bound for allocating transpose arrays
    INTEGER(iwp) ::  nzt_y_max   !< internal index bound for allocating transpose arrays
    INTEGER(iwp) ::  nzt_yd_max  !< internal index bound for allocating transpose arrays
    INTEGER(iwp) ::  nz_x_max    !< internal index bound for allocating transpose arrays
    INTEGER(iwp) ::  nz_yd_max   !< internal index bound for allocating transpose arrays

    INTEGER, DIMENSION(:), ALLOCATABLE ::  nxl_y_pe   !< index bounds for all PEs for transposition x --> y
    INTEGER, DIMENSION(:), ALLOCATABLE ::  nxr_y_pe   !< index bounds for all PEs for transposition x --> y
    INTEGER, DIMENSION(:), ALLOCATABLE ::  nyn_z_pe   !< index bounds for all PEs for transposition y --> z
    INTEGER, DIMENSION(:), ALLOCATABLE ::  nys_z_pe   !< index bounds for all PEs for transposition y --> z
    INTEGER, DIMENSION(:), ALLOCATABLE ::  nzb_x_pe   !< index bounds for all PEs for transposition z --> x
    INTEGER, DIMENSION(:), ALLOCATABLE ::  nzb_yd_pe  !< index bounds for all PEs for direct transposition z --> y
    INTEGER, DIMENSION(:), ALLOCATABLE ::  nzt_x_pe   !< index bounds for all PEs for transposition z --> x
    INTEGER, DIMENSION(:), ALLOCATABLE ::  nzt_yd_pe  !< index bounds for all PEs for direct transposition z --> y

    REAL(wp), DIMENSION(:,:), ALLOCATABLE ::  f_vec_x  !< array used for vectorized Temperton FFT, allocated in fft_xy

    SAVE

    INTERFACE resort_for_xy
       MODULE PROCEDURE resort_for_xy
    END INTERFACE resort_for_xy

    INTERFACE transpose_xy
       MODULE PROCEDURE transpose_xy
    END INTERFACE transpose_xy

    INTERFACE resort_for_xz
       MODULE PROCEDURE resort_for_xz
    END INTERFACE resort_for_xz

    INTERFACE transpose_xz
       MODULE PROCEDURE transpose_xz
    END INTERFACE transpose_xz

    INTERFACE resort_for_yx
       MODULE PROCEDURE resort_for_yx
    END INTERFACE resort_for_yx

    INTERFACE transpose_yx
       MODULE PROCEDURE transpose_yx
    END INTERFACE transpose_yx

#if defined( __parallel )
    INTERFACE transpose_yxd
       MODULE PROCEDURE transpose_yxd
    END INTERFACE transpose_yxd
#endif

    INTERFACE resort_for_yz
       MODULE PROCEDURE resort_for_yz
    END INTERFACE resort_for_yz

    INTERFACE transpose_yz
       MODULE PROCEDURE transpose_yz
    END INTERFACE transpose_yz

    INTERFACE resort_for_zx
       MODULE PROCEDURE resort_for_zx
    END INTERFACE resort_for_zx

    INTERFACE transpose_zx
       MODULE PROCEDURE transpose_zx
    END INTERFACE transpose_zx

    INTERFACE resort_for_zy
       MODULE PROCEDURE resort_for_zy
    END INTERFACE resort_for_zy

    INTERFACE transpose_zy
       MODULE PROCEDURE transpose_zy
    END INTERFACE transpose_zy

#if defined( __parallel )
    INTERFACE transpose_zyd
       MODULE PROCEDURE transpose_zyd
    END INTERFACE transpose_zyd
#endif

    PUBLIC

 CONTAINS


 SUBROUTINE resort_for_xy( f_in, f_inv )

    USE indices,                                                                                   &
        ONLY:  nx

    IMPLICIT NONE

    INTEGER(iwp) ::  i        !<
    INTEGER(iwp) ::  ii       !<
    INTEGER(iwp) ::  j        !<
    INTEGER(iwp) ::  k        !<
    INTEGER(iwp) ::  l        !<

    REAL(wp) ::  f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x)   !<
    REAL(wp) ::  f_inv(nys_x:nyn_x_max,nzb_x:nzt_x,0:nx_y_max)  !<

!
!-- Rearrange indices of input array in order to make data to be send by MPI contiguous.
    IF ( non_uniform_subdomain )  THEN

       !$OMP  PARALLEL PRIVATE ( i, j, k, ii, l )
       !$OMP  DO
#if __acc_fft_device
       !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k,ii,l) &
       !$ACC PRESENT(f_inv, f_in)
#endif
       DO  k = nzb_x, nzt_x
          DO  j = nys_x, nyn_x
             ii = 0
             DO  l = 0, npey - 1
                DO  i = l*nnx_y_max, l*nnx_y_max+nxr_y_pe(l)-nxl_y_pe(l)
                   f_inv(j,k,i) = f_in(ii,j,k)
                   ii = ii+1
                ENDDO
             ENDDO
          ENDDO
       ENDDO
       !$OMP  END PARALLEL

    ELSE

       !$OMP  PARALLEL PRIVATE ( i, j, k )
       !$OMP  DO
#if __acc_fft_device
       !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
       !$ACC PRESENT(f_inv, f_in)
#endif
       DO  k = nzb_x, nzt_x
          DO  j = nys_x, nyn_x
             DO  i = 0, nx
                f_inv(j,k,i) = f_in(i,j,k)
             ENDDO
          ENDDO
       ENDDO
       !$OMP  END PARALLEL
    ENDIF

 END SUBROUTINE resort_for_xy


!--------------------------------------------------------------------------------------------------!
! Description:
! ------------
!> Transposition of input array (f_in) from x to y. For the input array, all elements along x reside
!> on the same PE, while after transposition, all elements along y reside on the same PE.
!--------------------------------------------------------------------------------------------------!
 SUBROUTINE transpose_xy( f_inv, f_out )

#if defined( __parallel )
    USE cpulog,                                                                                    &
        ONLY:  cpu_log,                                                                            &
               cpu_log_nowait,                                                                     &
               log_point_s
#endif
    USE indices,                                                                                   &
        ONLY:  ny
#if defined( __parallel )
    USE indices,                                                                                   &
        ONLY:  nny_pe
#endif

    IMPLICIT NONE

    INTEGER(iwp) ::  i  !<
    INTEGER(iwp) ::  j  !<
    INTEGER(iwp) ::  k  !<

#if defined( __parallel )
    INTEGER(iwp) ::  jj !<
    INTEGER(iwp) ::  l   !<
    INTEGER(iwp) ::  ys  !<
#endif

    REAL(wp) ::  f_inv(nys_x:nyn_x_max,nzb_x:nzt_x,0:nx_y_max)  !<
    REAL(wp) ::  f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)  !<

#if defined( __parallel )
    REAL(wp), DIMENSION(nyn_x_max-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y_max,0:npey-1) ::  work  !<
#if __acc_fft_device
    !$ACC DECLARE CREATE(work)
#endif
#endif


    IF ( numprocs /= 1 )  THEN

#if defined( __parallel )
!
!--    Transpose array
       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )

#if __acc_fft_device
#ifndef __cuda_aware_mpi
       !$ACC UPDATE HOST(f_inv)
#else
       !$ACC HOST_DATA USE_DEVICE(work, f_inv)
#endif
#endif

       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
       CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0),  sendrecvcount_xy, MPI_REAL,                       &
                          work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, comm1dy, ierr )

#if __acc_fft_device
#ifndef __cuda_aware_mpi
       !$ACC UPDATE DEVICE(work)
#else
       !$ACC END HOST_DATA
#endif
#endif

       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )

!
!--    Reorder transposed array.
       IF ( non_uniform_subdomain )  THEN

          !$OMP  PARALLEL PRIVATE ( i, j, k, l, jj )
#if __acc_fft_device
          !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k,jj) &
          !$ACC PRESENT(f_out, work)
#endif
          !$OMP DO
          DO  i = nxl_y, nxr_y
             DO  k = nzb_y, nzt_y
                jj = 0
                DO  l = 0, npey - 1
                   DO  j = 1, nny_pe(l)
                      f_out(jj,i,k) = work(j,k,i,l)
                      jj = jj+1
                   ENDDO
                ENDDO
             ENDDO
          ENDDO
          !$OMP END PARALLEL

       ELSE

          !$OMP  PARALLEL PRIVATE ( i, j, k, l, ys )
          DO  l = 0, npey - 1
             ys = 0 + l * ( nyn_x - nys_x + 1 )
#if __acc_fft_device
             !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
             !$ACC PRESENT(f_out, work)
#endif
             !$OMP DO
             DO  i = nxl_y, nxr_y
                DO  k = nzb_y, nzt_y
                   DO  j = ys, ys + nyn_x - nys_x
                      f_out(j,i,k) = work(j-ys+1,k,i,l)
                   ENDDO
                ENDDO
             ENDDO
             !$OMP END DO NOWAIT
          ENDDO
          !$OMP END PARALLEL

       ENDIF
#endif

    ELSE

!
!--    Reorder transposed array
       !$OMP  PARALLEL PRIVATE ( i, j, k )
       !$OMP  DO
#if __acc_fft_device
       !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
       !$ACC PRESENT(f_out, f_inv)
#endif
       DO  k = nzb_y, nzt_y
          DO  i = nxl_y, nxr_y
             DO  j = 0, ny
                f_out(j,i,k) = f_inv(j,k,i)
             ENDDO
          ENDDO
       ENDDO
       !$OMP  END PARALLEL

    ENDIF

 END SUBROUTINE transpose_xy


!--------------------------------------------------------------------------------------------------!
! Description:
! ------------
!> Resorting data after the transposition from x to z. The transposition itself is carried out in
!> transpose_xz.
!--------------------------------------------------------------------------------------------------!
 SUBROUTINE resort_for_xz( f_inv, f_out )

    USE indices,                                                                                   &
        ONLY:  nxl,                                                                                &
               nxr,                                                                                &
               nyn,                                                                                &
               nys,                                                                                &
               nz

    IMPLICIT NONE

    INTEGER(iwp) ::  i  !<
    INTEGER(iwp) ::  j  !<
    INTEGER(iwp) ::  k  !<
    INTEGER(iwp) ::  kk !<
    INTEGER(iwp) ::  l  !<

    REAL(wp) ::  f_inv(nys:nyn,nxl:nxr_x_max,1:nz_x_max)  !<
    REAL(wp) ::  f_out(1:nz,nys:nyn,nxl:nxr)  !<


!
!-- Rearrange indices of input array in order to make data to be send by MPI contiguous.
!-- In case of parallel fft/transposition, scattered store is faster in backward direction.
    IF ( non_uniform_subdomain )  THEN

       !$OMP PARALLEL PRIVATE ( i, j, k, kk, l )
       !$OMP DO
#if __acc_fft_device
       !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
       !$ACC PRESENT(f_in, f_inv)
#endif
       DO  i = nxl, nxr
          DO  j = nys, nyn
             kk = 1
             DO  l = 0, npex - 1
                DO  k = l*nnz_x_max+1,l*nnz_x_max+nzt_x_pe(l)-nzb_x_pe(l)+1
                   f_out(kk,j,i) = f_inv(j,i,k)
                   kk = kk+1
                END DO
             ENDDO
          ENDDO
       ENDDO
       !$OMP END PARALLEL

    ELSE

       !$OMP  PARALLEL PRIVATE ( i, j, k )
       !$OMP  DO
#if __acc_fft_device
       !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
       !$ACC PRESENT(f_out, f_inv)
#endif
       DO  i = nxl, nxr
          DO  j = nys, nyn
             DO  k = 1, nz
                f_out(k,j,i) = f_inv(j,i,k)
             ENDDO
          ENDDO
       ENDDO
       !$OMP  END PARALLEL

    ENDIF

 END SUBROUTINE resort_for_xz


!--------------------------------------------------------------------------------------------------!
! Description:
! ------------
!> Transposition of input array (f_in) from x to z. For the input array, all elements along x reside
!> on the same PE, while after transposition, all elements along z reside on the same PE.
!--------------------------------------------------------------------------------------------------!
 SUBROUTINE transpose_xz( f_in, f_inv )

#if defined( __parallel )
    USE control_parameters,                                                                        &
        ONLY:  temperton_fft_vec

    USE cpulog,                                                                                    &
        ONLY:  cpu_log,                                                                            &
               cpu_log_nowait,                                                                     &
               log_point_s
#endif

    USE indices,                                                                                   &
        ONLY:  nx,                                                                                 &
               nxl,                                                                                &
               nxr,                                                                                &
               nyn,                                                                                &
               nys,                                                                                &
               nz

#if defined( __parallel )
    USE indices,                                                                                   &
        ONLY:  nnx, nxl_pe, nxr_pe
#endif

    IMPLICIT NONE

    INTEGER(iwp) ::  i  !<
    INTEGER(iwp) ::  j  !<
    INTEGER(iwp) ::  k  !<

#if defined( __parallel )
    INTEGER(iwp) ::  ii  !<
    INTEGER(iwp) ::  l   !<
    INTEGER(iwp) ::  mm  !<‚
    INTEGER(iwp) ::  xs  !<
#endif

    REAL(wp) ::  f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x)  !<
    REAL(wp) ::  f_inv(nys:nyn,nxl:nxr_x_max,1:nz_x_max)          !<


#if defined( __parallel )
    REAL(wp), DIMENSION(nys_x:nyn_x,nnx_x_max,nzb_x:nzt_x_max,0:npex-1) ::  work  !<
#if __acc_fft_device
    !$ACC DECLARE CREATE(work)
#endif
#endif


!
!-- If the PE grid is one-dimensional along y, the array has only to be reordered locally and
!-- therefore no transposition has to be done.
    IF ( npex /= 1 )  THEN

#if defined( __parallel )
!
!--    Reorder input array for transposition. Data from the vectorized Temperton-fft is stored in
!--    different array format (f_vec_x).
       IF ( temperton_fft_vec )  THEN

          DO  l = 0, npex - 1
             xs = 0 + l * nnx
             DO  k = nzb_x, nzt_x
                DO  i = xs, xs + nnx - 1
                   DO  j = nys_x, nyn_x
                      mm = j-nys_x+1+(k-nzb_x)*(nyn_x-nys_x+1)
                      work(j,i-xs+1,k,l) = f_vec_x(mm,i)
                   ENDDO
                ENDDO
             ENDDO
          ENDDO

       ELSE

          !$OMP  PARALLEL PRIVATE ( i, j, k, l, xs, ii )
          DO  l = 0, npex - 1
             xs = 0 + l * nnx
#if __acc_fft_device
             !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k,ii) &
             !$ACC PRESENT(work, f_in)
#endif
             !$OMP DO
             DO  k = nzb_x, nzt_x
                IF(non_uniform_subdomain)   THEN
                   ii = 1
                   DO  i = nxl_pe(l), nxr_pe(l)
                      DO  j = nys_x, nyn_x
                         work(j,ii,k,l) = f_in(i,j,k)
                      ENDDO
                      ii = ii+1
                   ENDDO
                ELSE
                   DO  i = xs, xs + nnx - 1
                      DO  j = nys_x, nyn_x
                         work(j,i-xs+1,k,l) = f_in(i,j,k)
                      ENDDO
                   ENDDO
                ENDIF
             ENDDO
             !$OMP END DO NOWAIT
          ENDDO
          !$OMP  END PARALLEL

       ENDIF
!
!--    Transpose array
       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )

#if __acc_fft_device
#ifndef __cuda_aware_mpi
       !$ACC UPDATE HOST(work)
#else
       !$ACC HOST_DATA USE_DEVICE(work, f_inv)
#endif
#endif

       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
       CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL,                       &
                          f_inv(nys,nxl,1),      sendrecvcount_zx, MPI_REAL, comm1dx, ierr )

#if __acc_fft_device
#ifndef __cuda_aware_mpi
       !$ACC UPDATE DEVICE(f_inv)
#else
       !$ACC END HOST_DATA
#endif
#endif
       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
#endif

    ELSE

!
!--    Reorder the array in a way that the z index is in first position
       !$OMP PARALLEL PRIVATE ( i, j, k )
       !$OMP DO
#if __acc_fft_device
       !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
       !$ACC PRESENT(f_inv, f_in)
#endif
       DO  i = nxl, nxr
          DO  j = nys, nyn
             DO  k = 1, nz
                f_inv(j,i,k) = f_in(i,j,k)
             ENDDO
          ENDDO
       ENDDO
       !$OMP END PARALLEL

    ENDIF

 END SUBROUTINE transpose_xz


!--------------------------------------------------------------------------------------------------!
! Description:
! ------------
!> Resorting data after the transposition from y to x. The transposition itself is carried out in
!> transpose_yx.
!--------------------------------------------------------------------------------------------------!
 SUBROUTINE resort_for_yx( f_inv, f_out )

    USE indices,                                                                                   &
        ONLY:  nx

    IMPLICIT NONE

    INTEGER(iwp) ::  i  !<
    INTEGER(iwp) ::  ii !<
    INTEGER(iwp) ::  j  !<
    INTEGER(iwp) ::  k  !<
    INTEGER(iwp) ::  l  !<

    REAL(wp) ::  f_inv(nys_x:nyn_x_max,nzb_x:nzt_x,0:nx_y_max)  !<
    REAL(wp) ::  f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x)                    !<


!
!-- Rearrange indices of input array in order to make data to be send by MPI contiguous.
    IF ( non_uniform_subdomain )  THEN

       !$OMP  PARALLEL PRIVATE ( i, j, k, ii, l )
       !$OMP  DO
#if __acc_fft_device
       !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k,ii,l) &
       !$ACC PRESENT(f_inv, f_in)
#endif
       DO  k = nzb_x, nzt_x
          DO  j = nys_x, nyn_x
             ii = 0
             DO  l = 0, npey - 1
                DO  i = l*nnx_y_max, l*nnx_y_max+nxr_y_pe(l)-nxl_y_pe(l)
                   f_out(ii,j,k) = f_inv(j,k,i)
                   ii = ii+1
                ENDDO
             ENDDO
          ENDDO
       ENDDO
       !$OMP  END PARALLEL

    ELSE

       !$OMP PARALLEL PRIVATE ( i, j, k )
       !$OMP DO
#if __acc_fft_device
       !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
       !$ACC PRESENT(f_out, f_inv)
#endif
       DO  k = nzb_x, nzt_x
          DO  j = nys_x, nyn_x
             DO  i = 0, nx
                f_out(i,j,k) = f_inv(j,k,i)
             ENDDO
          ENDDO
       ENDDO
       !$OMP END PARALLEL

    ENDIF

 END SUBROUTINE resort_for_yx


!--------------------------------------------------------------------------------------------------!
! Description:
! ------------
!> Transposition of input array (f_in) from y to x. For the input array, all  elements along y
!> reside on the same PE, while after transposition, all elements along x reside on the same PE.
!--------------------------------------------------------------------------------------------------!
 SUBROUTINE transpose_yx( f_in, f_inv )

#if defined( __parallel )
    USE cpulog,                                                                                    &
        ONLY:  cpu_log,                                                                            &
               cpu_log_nowait,                                                                     &
               log_point_s
#endif
    USE indices,                                                                                   &
        ONLY:  ny
#if defined( __parallel )
    USE indices,                                                                                   &
        ONLY:  nny_pe
#endif

    IMPLICIT NONE

    INTEGER(iwp) ::  i  !<
    INTEGER(iwp) ::  j  !<
    INTEGER(iwp) ::  k  !<

#if defined( __parallel )
    INTEGER(iwp) ::  jj !<
    INTEGER(iwp) ::  l  !<
    INTEGER(iwp) ::  ys  !<
#endif

    REAL(wp) ::  f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)             !<
    REAL(wp) ::  f_inv(nys_x:nyn_x_max,nzb_x:nzt_x,0:nx_y_max)  !<


#if defined( __parallel )
    REAL(wp), DIMENSION(nyn_x_max-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y_max,0:npey-1) ::  work  !<
#if __acc_fft_device
    !$ACC DECLARE CREATE(work)
#endif
#endif


    IF ( numprocs /= 1 )  THEN

#if defined( __parallel )
!
!--    Reorder input array for transposition
       IF ( non_uniform_subdomain )  THEN

          !$OMP PARALLEL PRIVATE ( i, j, k, l, jj)
#if __acc_fft_device
          !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k,l,jj) &
          !$ACC PRESENT(work, f_in)
#endif
          !$OMP DO
          DO  i = nxl_y, nxr_y
             DO  k = nzb_y, nzt_y
                jj = 0
                DO  l = 0, npey - 1
                   ys = 0 + l * ( nyn_x - nys_x + 1 )
                   DO  j = 1, nny_pe(l)
                      work(j,k,i,l )= f_in(jj,i,k)
                      jj = jj+1
                   ENDDO
                ENDDO
             ENDDO
          ENDDO
          !$OMP END PARALLEL

       ELSE

          !$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
          DO  l = 0, npey - 1
             ys = 0 + l * ( nyn_x - nys_x + 1 )
#if __acc_fft_device
             !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
             !$ACC PRESENT(work, f_in)
#endif
             !$OMP DO
             DO  i = nxl_y, nxr_y
                DO  k = nzb_y, nzt_y
                   DO  j = ys, ys + nyn_x - nys_x
                      work(j-ys+1,k,i,l) = f_in(j,i,k)
                   ENDDO
                ENDDO
             ENDDO
             !$OMP END DO NOWAIT
          ENDDO
          !$OMP END PARALLEL

       ENDIF

!
!--    Transpose array
       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )

#if __acc_fft_device
#ifndef __cuda_aware_mpi
       !$ACC UPDATE HOST(work)
#else
       !$ACC HOST_DATA USE_DEVICE(work, f_inv)
#endif
#endif

       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
       CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL,                       &
                          f_inv(nys_x,nzb_x,0),  sendrecvcount_xy, MPI_REAL, comm1dy, ierr )

#if __acc_fft_device
#ifndef __cuda_aware_mpi
       !$ACC UPDATE DEVICE(f_inv)
#else
       !$ACC END HOST_DATA
#endif
#endif
       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
#endif

    ELSE

!
!--    Reorder array f_in the same way as ALLTOALL did it.
       !$OMP PARALLEL PRIVATE ( i, j, k )
       !$OMP DO
#if __acc_fft_device
       !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
       !$ACC PRESENT(f_inv, f_in)
#endif
       DO  i = nxl_y, nxr_y
          DO  k = nzb_y, nzt_y
             DO  j = 0, ny
                f_inv(j,k,i) = f_in(j,i,k)
             ENDDO
          ENDDO
       ENDDO
       !$OMP END PARALLEL

    ENDIF

 END SUBROUTINE transpose_yx


!--------------------------------------------------------------------------------------------------!
! Description:
! ------------
!> Resorting data for the transposition from y to z. The transposition itself is carried out in
!> transpose_yz.
!--------------------------------------------------------------------------------------------------!
 SUBROUTINE resort_for_yz( f_in, f_inv )

    USE indices,                                                                                   &
        ONLY:  ny

    IMPLICIT NONE

    INTEGER(iwp) ::  i  !<
    INTEGER(iwp) ::  j  !<
    INTEGER(iwp) ::  jj !<
    INTEGER(iwp) ::  k  !<
    INTEGER(iwp) ::  l  !<

    REAL(wp) ::  f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)                     !<
    REAL(wp) ::  f_inv(nxl_y:nxr_y,nzb_y:nzt_y_max,0:ny_z_max)  !<


!
!-- Rearrange indices of input array in order to make data to be send by MPI contiguous.
    !$OMP PARALLEL PRIVATE ( i, j, k, l, jj )
    !$OMP DO
#if __acc_fft_device
    !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k,l,jj) &
    !$ACC PRESENT(f_inv, f_in)
#endif
    DO  k = nzb_y, nzt_y
       DO  i = nxl_y, nxr_y
          IF ( non_uniform_subdomain )  THEN
             jj = 0
             DO  l = 0, npex - 1
                DO  j = l*nny_z_max, l*nny_z_max+nyn_z_pe(l)-nys_z_pe(l)
                   f_inv(i,k,j) = f_in(jj,i,k)
                   jj = jj+1
                ENDDO
             ENDDO
          ELSE
             DO  j = 0, ny
                f_inv(i,k,j) = f_in(j,i,k)
             ENDDO
          ENDIF
       ENDDO
    ENDDO
    !$OMP END PARALLEL

 END SUBROUTINE resort_for_yz


!--------------------------------------------------------------------------------------------------!
! Description:
! ------------
!> Transposition of input array (f_in) from y to z. For the input array, all elements along y reside
!> on the same PE, while after transposition, all elements along z reside on the same PE.
!--------------------------------------------------------------------------------------------------!
 SUBROUTINE transpose_yz( f_inv, f_out )

#if defined( __parallel )
    USE cpulog,                                                                                    &
        ONLY:  cpu_log,                                                                            &
               cpu_log_nowait,                                                                     &
               log_point_s
#endif

    USE indices,                                                                                   &
        ONLY:  ny,                                                                                 &
               nz

    IMPLICIT NONE

    INTEGER(iwp) ::  i  !<
    INTEGER(iwp) ::  j  !<
    INTEGER(iwp) ::  k  !<

#if defined( __parallel )
    INTEGER(iwp) ::  l   !<
    INTEGER(iwp) ::  kk !<
    INTEGER(iwp) ::  zs  !<
#endif

    REAL(wp) ::  f_inv(nxl_y:nxr_y,nzb_y:nzt_y_max,0:ny_z_max)  !<
    REAL(wp) ::  f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz)                    !<

#if defined( __parallel )
    REAL(wp), DIMENSION(nxl_z:nxr_z,nnz_z_max,nys_z:nyn_z_max,0:npex-1) ::  work  !<
#if __acc_fft_device
    !$ACC DECLARE CREATE(work)
#endif
#endif


!
!-- If the PE grid is one-dimensional along y, only local reordering of the data is necessary and no
!-- transposition has to be done.
    IF ( npex == 1 )  THEN

       !$OMP PARALLEL PRIVATE ( i, j, k )
       !$OMP DO
#if __acc_fft_device
       !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
       !$ACC PRESENT(f_out, f_inv)
#endif
       DO  j = 0, ny
          DO  k = nzb_y, nzt_y
             DO  i = nxl_y, nxr_y
                f_out(i,j,k) = f_inv(i,k,j)
             ENDDO
          ENDDO
       ENDDO
       !$OMP END PARALLEL

    ELSE

#if defined( __parallel )
!
!--    Transpose array
       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )

#if __acc_fft_device
#ifndef __cuda_aware_mpi
       !$ACC UPDATE HOST(f_inv)
#else
       !$ACC HOST_DATA USE_DEVICE(work, f_inv)
#endif
#endif
       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
       CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0),  sendrecvcount_yz, MPI_REAL,                       &
                          work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, comm1dx, ierr )

#if __acc_fft_device
#ifndef __cuda_aware_mpi
       !$ACC UPDATE DEVICE(work)
#else
       !$ACC END HOST_DATA
#endif
#endif

       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )

!
!--    Reorder transposed array
       IF ( non_uniform_subdomain )  THEN

          !$OMP PARALLEL PRIVATE ( i, j, k, l, kk )
#if __acc_fft_device
          !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k,l,kk) &
          !$ACC PRESENT(f_out, work)
#endif
          !$OMP DO
          DO  j = nys_z, nyn_z
             kk = 1
             DO  l = 0, npex - 1
                DO  k = 1, nzt_x_pe(l)-nzb_x_pe(l)+1
                   DO  i = nxl_z, nxr_z
                      f_out(i,j,kk) = work(i,k,j,l)
                   ENDDO
                   kk = kk+1
                ENDDO
             ENDDO
          ENDDO
          !$OMP END PARALLEL

       ELSE

          !$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
          DO  l = 0, npex - 1
             zs = 1 + l * ( nzt_y - nzb_y + 1 )
#if __acc_fft_device
             !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
             !$ACC PRESENT(f_out, work)
#endif
             !$OMP DO
             DO  j = nys_z, nyn_z
                DO  k = zs, zs + nzt_y - nzb_y
                   DO  i = nxl_z, nxr_z
                      f_out(i,j,k) = work(i,k-zs+1,j,l)
                   ENDDO
                ENDDO
             ENDDO
             !$OMP END DO NOWAIT
          ENDDO
          !$OMP END PARALLEL

       ENDIF
#endif

   ENDIF

 END SUBROUTINE transpose_yz


!--------------------------------------------------------------------------------------------------!
! Description:
! ------------
!> Resorting data for the transposition from z to x. The transposition itself is carried out in
!> transpose_zx.
!--------------------------------------------------------------------------------------------------!
 SUBROUTINE resort_for_zx( f_in, f_inv )

    USE indices,                                                                                   &
        ONLY:  nxl,                                                                                &
               nxr,                                                                                &
               nyn,                                                                                &
               nys,                                                                                &
               nz

    IMPLICIT NONE

    INTEGER(iwp) ::  i  !<
    INTEGER(iwp) ::  j  !<
    INTEGER(iwp) ::  k  !<
    INTEGER(iwp) ::  kk !<
    INTEGER(iwp) ::  l  !<

    REAL(wp) ::  f_in(1:nz,nys:nyn,nxl:nxr)   !<
    REAL(wp) ::  f_inv(nys:nyn,nxl:nxr_x_max,1:nz_x_max)  !<


!
!-- Rearrange indices of input array in order to make data to be send by MPI contiguous.
    IF ( non_uniform_subdomain )  THEN

       !$OMP PARALLEL PRIVATE ( i, j, k, kk, l )
       !$OMP DO
#if __acc_fft_device
       !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k,kk,l)   &
       !$ACC PRESENT(f_in, f_inv)
#endif
       DO  i = nxl, nxr
          DO  j = nys, nyn
             kk = 1
             DO  l = 0, npex - 1
                DO  k = l*nnz_x_max+1, l*nnz_x_max+nzt_x_pe(l)-nzb_x_pe(l)+1
                   f_inv(j,i,k) = f_in(kk,j,i)
                   kk = kk+1
                ENDDO
             ENDDO
          ENDDO
       ENDDO
       !$OMP END PARALLEL

    ELSE

       !$OMP PARALLEL PRIVATE ( i, j, k )
       !$OMP DO
#if __acc_fft_device
       !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
       !$ACC PRESENT(f_in, f_inv)
#endif
       DO  i = nxl, nxr
          DO  j = nys, nyn
             DO  k = 1,nz
                f_inv(j,i,k) = f_in(k,j,i)
             ENDDO
          ENDDO
       ENDDO
       !$OMP END PARALLEL

    ENDIF

 END SUBROUTINE resort_for_zx


!--------------------------------------------------------------------------------------------------!
! Description:
! ------------
!> Transposition of input array (f_in) from z to x. For the input array, all elements along z reside
!> on the same PE, while after transposition, all elements along x reside on the same PE.
!--------------------------------------------------------------------------------------------------!
 SUBROUTINE transpose_zx( f_inv, f_out )

#if defined( __parallel )
    USE control_parameters,                                                                        &
        ONLY:  temperton_fft_vec

    USE cpulog,                                                                                    &
        ONLY:  cpu_log,                                                                            &
               cpu_log_nowait,                                                                     &
               log_point_s

#endif

    USE indices,                                                                                   &
        ONLY:  nx,                                                                                 &
               nxl,                                                                                &
               nxr,                                                                                &
               nyn,                                                                                &
               nys,                                                                                &
               nz

#if defined( __parallel )
    USE indices,                                                                                   &
        ONLY:  nnx, nxl_pe, nxr_pe
#endif

    IMPLICIT NONE

    INTEGER(iwp) ::  i  !<
    INTEGER(iwp) ::  j  !<
    INTEGER(iwp) ::  k  !<

#if defined( __parallel )
    INTEGER(iwp) ::  ii  !<
    INTEGER(iwp) ::  l   !<
    INTEGER(iwp) ::  mm  !<
    INTEGER(iwp) ::  xs  !<
#endif

    REAL(wp) ::  f_inv(nys:nyn,nxl:nxr_x_max,1:nz_x_max)          !<
    REAL(wp) ::  f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x)  !<

#if defined( __parallel )
    REAL(wp), DIMENSION(nys_x:nyn_x,nnx_x_max,nzb_x:nzt_x_max,0:npex-1) ::  work  !<
#if __acc_fft_device
    !$ACC DECLARE CREATE(work)
#endif
#endif


!
!-- If the PE grid is one-dimensional along y, only local reordering of the data is necessary and no
!-- transposition has to be done.
    IF ( npex == 1 )  THEN

       !$OMP PARALLEL PRIVATE ( i, j, k )
       !$OMP DO
#if __acc_fft_device
       !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
       !$ACC PRESENT(f_out, f_inv)
#endif
       DO  k = 1, nz
          DO  i = nxl, nxr
             DO  j = nys, nyn
                f_out(i,j,k) = f_inv(j,i,k)
             ENDDO
          ENDDO
       ENDDO
       !$OMP END PARALLEL

    ELSE

#if defined( __parallel )
!
!--    Transpose array
       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )

#if __acc_fft_device
#ifndef __cuda_aware_mpi
       !$ACC UPDATE HOST(f_inv)
#else
       !$ACC HOST_DATA USE_DEVICE(work, f_inv)
#endif
#endif

       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
       CALL MPI_ALLTOALL( f_inv(nys,nxl,1),      sendrecvcount_zx, MPI_REAL,                       &
                          work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, comm1dx, ierr )

#if __acc_fft_device
#ifndef __cuda_aware_mpi
       !$ACC UPDATE DEVICE(work)
#else
       !$ACC END HOST_DATA
#endif
#endif

       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )

!
!--    Reorder transposed array.
!--    Data for the vectorized Temperton-fft is stored in different array format (f_vec_x) which
!--    saves additional data copy in fft_x.
       IF ( temperton_fft_vec )  THEN

          DO  l = 0, npex - 1
             xs = 0 + l * nnx
             DO  k = nzb_x, nzt_x
                DO  i = xs, xs + nnx - 1
                   DO  j = nys_x, nyn_x
                      mm = j-nys_x+1+(k-nzb_x)*(nyn_x-nys_x+1)
                      f_vec_x(mm,i) = work(j,i-xs+1,k,l)
                   ENDDO
                ENDDO
             ENDDO
          ENDDO

       ELSE

          !$OMP PARALLEL PRIVATE ( i, j, k, l, xs, ii )
          DO  l = 0, npex - 1
             xs = 0 + l * nnx
#if __acc_fft_device
             !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k,ii) &
             !$ACC PRESENT(f_out, work)
#endif
             !$OMP DO
             DO  k = nzb_x, nzt_x
                IF(non_uniform_subdomain)   THEN
                   ii = 1
                   DO  i = nxl_pe(l), nxr_pe(l)
                      DO  j = nys_x, nyn_x
                         f_out(i,j,k) = work(j,ii,k,l)
                      ENDDO
                      ii = ii+1
                   ENDDO
                ELSE
                   DO  i = xs, xs + nnx - 1
                      DO  j = nys_x, nyn_x
                         f_out(i,j,k) = work(j,i-xs+1,k,l)
                      ENDDO
                   ENDDO
                ENDIF
             ENDDO
             !$OMP END DO NOWAIT
          ENDDO
          !$OMP END PARALLEL

       ENDIF
#endif

    ENDIF

 END SUBROUTINE transpose_zx


!--------------------------------------------------------------------------------------------------!
! Description:
! ------------
!> Resorting data after the transposition from z to y. The transposition itself is carried out in
!> transpose_zy.
!--------------------------------------------------------------------------------------------------!
 SUBROUTINE resort_for_zy( f_inv, f_out )


    USE indices,                                                                                   &
        ONLY:  ny

    IMPLICIT NONE

    INTEGER(iwp) ::  i  !<
    INTEGER(iwp) ::  j  !<
    INTEGER(iwp) ::  jj !<
    INTEGER(iwp) ::  k  !<
    INTEGER(iwp) ::  l  !<

    REAL(wp) ::  f_inv(nxl_y:nxr_y,nzb_y:nzt_y_max,0:ny_z_max)  !<
    REAL(wp) ::  f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)  !<


!
!-- Rearrange indices of input array in order to make data to be send by MPI contiguous.
    !$OMP PARALLEL PRIVATE ( i, j, k, l, jj )
    !$OMP DO
#if __acc_fft_device
    !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k,l,jj) &
    !$ACC PRESENT(f_out, f_inv)
#endif
    DO  k = nzb_y, nzt_y
       DO  i = nxl_y, nxr_y
          IF ( non_uniform_subdomain )  THEN
             jj = 0
             DO  l = 0, npex - 1
                DO  j = l*nny_z_max, l*nny_z_max+nyn_z_pe(l)-nys_z_pe(l)
                   f_out(jj,i,k) = f_inv(i,k,j)
                   jj = jj+1
                ENDDO
             ENDDO
          ELSE
             DO  j = 0, ny
                f_out(j,i,k) = f_inv(i,k,j)
             ENDDO
          ENDIF
       ENDDO
    ENDDO
    !$OMP END PARALLEL

 END SUBROUTINE resort_for_zy


!--------------------------------------------------------------------------------------------------!
! Description:cpu_log_nowait
! ------------
!> Transposition of input array (f_in) from z to y. For the input array, all elements along z reside
!> on the same PE, while after transposition, all elements along y reside on the same PE.
!--------------------------------------------------------------------------------------------------!
 SUBROUTINE transpose_zy( f_in, f_inv )

#if defined( __parallel )
    USE cpulog,                                                                                    &
        ONLY:  cpu_log,                                                                            &
               cpu_log_nowait,                                                                     &
               log_point_s
#endif

    USE indices,                                                                                   &
        ONLY:  ny,                                                                                 &
               nz

    IMPLICIT NONE

    INTEGER(iwp) ::  i  !<
    INTEGER(iwp) ::  j  !<
    INTEGER(iwp) ::  k  !<

#if defined( __parallel )
    INTEGER(iwp) ::  l   !<
    INTEGER(iwp) ::  kk
    INTEGER(iwp) ::  zs  !<
#endif

    REAL(wp) ::  f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz)   !<
    REAL(wp) ::  f_inv(nxl_y:nxr_y,nzb_y:nzt_y_max,0:ny_z_max)  !<

#if defined( __parallel )
    REAL(wp), DIMENSION(nxl_z:nxr_z,nnz_z_max,nys_z:nyn_z_max,0:npex-1) ::  work  !<
#if __acc_fft_device
    !$ACC DECLARE CREATE(work)
#endif
#endif


!
!-- If the PE grid is one-dimensional along y, the array has only to be reordered locally and
!-- therefore no transposition has to be done.
    IF ( npex /= 1 )  THEN

#if defined( __parallel )
!
!--    Reorder input array for transposition.
       IF ( non_uniform_subdomain )  THEN

          !$OMP PARALLEL PRIVATE ( i, j, k, l, kk )
#if __acc_fft_device
          !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k,l,kk) &
          !$ACC PRESENT(f_out, work)
#endif
          !$OMP DO
          DO  j = nys_z, nyn_z
             kk = 1
             DO  l = 0, npex - 1
                DO  k = 1, nzt_x_pe(l) - nzb_x_pe(l) + 1
                   DO  i = nxl_z, nxr_z
                      work(i,k,j,l) = f_in(i,j,kk)
                   ENDDO
                   kk = kk+1
                ENDDO
             ENDDO
          ENDDO
          !$OMP END PARALLEL

       ELSE

          !$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
          DO  l = 0, npex - 1
             zs = 1 + l * ( nzt_y - nzb_y + 1 )
#if __acc_fft_device
             !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
             !$ACC PRESENT(work, f_in)
#endif
             !$OMP DO
             DO  j = nys_z, nyn_z
                DO  k = zs, zs + nzt_y - nzb_y
                   DO  i = nxl_z, nxr_z
                      work(i,k-zs+1,j,l) = f_in(i,j,k)
                   ENDDO
                ENDDO
             ENDDO
             !$OMP END DO NOWAIT
          ENDDO
          !$OMP END PARALLEL

       ENDIF
!
!--    Transpose array
       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )

#if __acc_fft_device
#ifndef __cuda_aware_mpi
       !$ACC UPDATE HOST(work)
#else
       !$ACC HOST_DATA USE_DEVICE(work, f_inv)
#endif
#endif

       IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
       CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL,                       &
                          f_inv(nxl_y,nzb_y,0),  sendrecvcount_yz, MPI_REAL, comm1dx, ierr )

#if __acc_fft_device
#ifndef __cuda_aware_mpi
       !$ACC UPDATE DEVICE(f_inv)
#else
       !$ACC END HOST_DATA
#endif
#endif

       CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
#endif

    ELSE
!
!--    Reorder the array in the same way like ALLTOALL did it
       !$OMP PARALLEL PRIVATE ( i, j, k )
       !$OMP DO
#if __acc_fft_device
       !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
       !$ACC PRESENT(f_inv, f_in)
#endif
       DO  k = nzb_y, nzt_y
          DO  j = 0, ny
             DO  i = nxl_y, nxr_y
                f_inv(i,k,j) = f_in(i,j,k)
             ENDDO
          ENDDO
       ENDDO
       !$OMP END PARALLEL

    ENDIF

 END SUBROUTINE transpose_zy


#if defined( __parallel )
!--------------------------------------------------------------------------------------------------!
! Description:
! ------------
!> Transposition of input array (f_in) from z to y. For the input array, all elements along z reside
!> on the same PE, while after transposition, all elements along y reside on the same PE. This is a
!> direct transposition for arrays with indices in regular order (k,j,i) (cf. transpose_zy).
!--------------------------------------------------------------------------------------------------!
 SUBROUTINE transpose_zyd( f_in, f_out )

    USE cpulog,                                                                                    &
        ONLY:  cpu_log,                                                                            &
               log_point_s

    USE indices,                                                                                   &
        ONLY:  nny,                                                                                &
               nxl,                                                                                &
               nxr,                                                                                &
               nyn,                                                                                &
               nyn_pe,                                                                             &
               nys,                                                                                &
               nys_pe,                                                                             &
               ny,                                                                                 &
               nz

    IMPLICIT NONE

    INTEGER(iwp) ::  i   !<
    INTEGER(iwp) ::  j   !<
    INTEGER(iwp) ::  jj  !<
    INTEGER(iwp) ::  k   !<
    INTEGER(iwp) ::  kk  !<
    INTEGER(iwp) ::  l   !<
    INTEGER(iwp) ::  ys  !<

    REAL(wp) ::  f_in(1:nz,nys:nyn,nxl:nxr)                            !<
    REAL(wp) ::  f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd)               !<

    REAL(wp), DIMENSION(nys:nyn_yd_max,nxl_yd:nxr_yd,1:nz_yd_max)      ::  f_inv  !<
    REAL(wp), DIMENSION(nny_yd_max,nxl:nxr,nzb_yd:nzt_yd_max,0:npey-1) ::  work   !<


!
!-- Rearrange indices of input array in order to make data to be send by MPI contiguous.
    IF ( non_uniform_subdomain )  THEN

       DO  i = nxl, nxr
          DO  j = nys, nyn
             kk = 1
             DO  l = 0, npey - 1
                DO  k = l*nnz_yd_max+1, l*nnz_yd_max+nzt_yd_pe(l)-nzb_yd_pe(l)+1
                   f_inv(j,i,k) = f_in(kk,j,i)
                   kk = kk+1
                ENDDO
             ENDDO
          ENDDO
       ENDDO

    ELSE

       DO  i = nxl, nxr
          DO  j = nys, nyn
             DO  k = 1, nz
                f_inv(j,i,k) = f_in(k,j,i)
             ENDDO
          ENDDO
       ENDDO

    ENDIF

!
!-- Move data to different array, because memory location of work1 is needed further below
!-- (work1 = work2). If the PE grid is one-dimensional along x, only local reordering of the data is
!-- necessary and no transposition has to be done.
    IF ( npey == 1 )  THEN
       DO  k = 1, nz
          DO  i = nxl, nxr
             DO  j = nys, nyn
                f_out(j,i,k) = f_inv(j,i,k)
             ENDDO
          ENDDO
       ENDDO
       RETURN
    ENDIF

!
!-- Transpose array
    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
    IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
    CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL,                              &
       work(1,nxl,nzb_yd,0), sendrecvcount_zyd, MPI_REAL, comm1dy, ierr )
    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )

!
!-- Reorder transposed array
    DO  l = 0, npey - 1
       ys = 0 + l * nny
       DO  k = nzb_yd, nzt_yd
          DO  i = nxl,nxr
             IF(non_uniform_subdomain)   THEN
                jj = 1
                DO  j = nys_pe(l), nyn_pe(l)
                   f_out(j,i,k) = work(jj,i,k,l)
                   jj = jj+1
                ENDDO
             ELSE
                DO  j = ys, ys + nny - 1
                   f_out(j,i,k) = work(j-ys+1,i,k,l)
                ENDDO
             end if
          ENDDO
       ENDDO
    ENDDO

 END SUBROUTINE transpose_zyd
#endif

#if defined( __parallel )
!--------------------------------------------------------------------------------------------------!
! Description:
! ------------
!> Transposition of input array (f_in) from y to x. For the input array, all elements along y reside
!> on the same PE, while after transposition, all elements along x reside on the same PE. This is a
!> direct transposition for arrays with indices in regular order (k,j,i) (cf. transpose_yx).
!--------------------------------------------------------------------------------------------------!
 SUBROUTINE transpose_yxd( f_in, f_out )

    USE cpulog,                                                                                    &
        ONLY:  cpu_log,                                                                            &
               log_point_s

    USE indices,                                                                                   &
        ONLY:  nnx,                                                                                &
               nny,                                                                                &
               nnz,                                                                                &
               nx,                                                                                 &
               nxl,                                                                                &
               nxr,                                                                                &
               nyn,                                                                                &
               nys,                                                                                &
               nz

    USE kinds

    USE pegrid

    IMPLICIT NONE

    INTEGER(iwp) ::  i   !<
    INTEGER(iwp) ::  j   !<
    INTEGER(iwp) ::  k   !<
    INTEGER(iwp) ::  l   !<
    INTEGER(iwp) ::  m   !<
    INTEGER(iwp) ::  xs  !<

    REAL(wp) ::  f_in(1:nz,nys:nyn,nxl:nxr)           !<
    REAL(wp) ::  f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x)  !<

    REAL(wp) ::  f_inv(nxl:nxr,1:nz,nys:nyn)          !<
    REAL(wp) ::  work(nnx*nny*nnz)                    !<

!
!-- Rearrange indices of input array in order to make data to be send by MPI contiguous.
    DO  k = 1, nz
       DO  j = nys, nyn
          DO  i = nxl, nxr
             f_inv(i,k,j) = f_in(k,j,i)
          ENDDO
       ENDDO
    ENDDO

!
!-- Transpose array
    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
    IF ( collective_wait )  CALL MPI_BARRIER( comm2d, ierr )
    CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL,                               &
                       work(1), sendrecvcount_xy, MPI_REAL, comm1dx, ierr )
    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )

!
!-- Reorder transposed array
    m = 0
    DO  l = 0, npex - 1
       xs = 0 + l * nnx
       DO  j = nys_x, nyn_x
          DO  k = 1, nz
             DO  i = xs, xs + nnx - 1
                m = m + 1
                f_out(i,j,k) = work(m)
             ENDDO
          ENDDO
       ENDDO
    ENDDO

 END SUBROUTINE transpose_yxd
#endif

 END MODULE transpose_mod