!> @file poisfft_mod.f90 !--------------------------------------------------------------------------------------------------! ! This file is part of the PALM model system. ! ! PALM is free software: you can redistribute it and/or modify it under the terms of the GNU General ! Public License as published by the Free Software Foundation, either version 3 of the License, or ! (at your option) any later version. ! ! PALM is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the ! implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General ! Public License for more details. ! ! You should have received a copy of the GNU General Public License along with PALM. If not, see ! . ! ! Copyright 1997-2021 Leibniz Universitaet Hannover !--------------------------------------------------------------------------------------------------! ! ! ! Description: ! ------------ !> Solves the Poisson equation with a 2D spectral method !> d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s !> !> Input: !> real ar contains (nnz,nny,nnx) elements of the velocity divergence, starting from (1,nys,nxl) !> !> Output: !> real ar contains the solution for perturbation pressure p !--------------------------------------------------------------------------------------------------! MODULE poisfft_mod #if defined( __parallel ) USE MPI #endif USE control_parameters, & ONLY: bc_lr_cyc, & bc_ns_cyc, & message_string, & temperton_fft_vec USE fft_xy, & ONLY: fft_init, & fft_y, & fft_y_1d, & fft_y_m, & fft_x, & fft_x_1d, & fft_x_m USE indices, & ONLY: nnx, & nny, & nx, & nxl, & nxr, & ny, & nys, & nyn, & nz USE pegrid, & ONLY: non_uniform_subdomain USE transpose_mod, & ONLY: nnx_x_max, & nnx_y_max, & nnz_x_max, & nnz_z_max, & nxl_y, & nxl_z, & nxr_x_max, & nxr_y_max, & nxr_y, & nxr_z, & nx_y_max, & nys_x, & nys_z, & nyn_x, & nyn_x_max, & nyn_z, & nyn_z_max, & ny_z_max, & nzb_x, & nzb_y, & nzt_x, & nzt_x_max, & nzt_y_max, & nzt_y, & nz_x_max, & resort_for_xy, transpose_xy, & resort_for_xz, transpose_xz, & resort_for_yx, transpose_yx, & resort_for_yz, transpose_yz, & resort_for_zx, transpose_zx, & resort_for_zy, transpose_zy USE tridia_solver, & ONLY: tridia_1dd, & tridia_init, & tridia_substi IMPLICIT NONE LOGICAL, SAVE :: poisfft_initialized = .FALSE. !< PRIVATE PUBLIC poisfft, poisfft_init INTERFACE poisfft MODULE PROCEDURE poisfft END INTERFACE poisfft INTERFACE poisfft_init MODULE PROCEDURE poisfft_init END INTERFACE poisfft_init CONTAINS !--------------------------------------------------------------------------------------------------! ! Description: ! ------------ !> Setup coefficients for FFT and the tridiagonal solver !--------------------------------------------------------------------------------------------------! SUBROUTINE poisfft_init IMPLICIT NONE CALL fft_init CALL tridia_init poisfft_initialized = .TRUE. END SUBROUTINE poisfft_init !--------------------------------------------------------------------------------------------------! ! Description: ! ------------ !> Two-dimensional Fourier Transformation in x- and y-direction. !--------------------------------------------------------------------------------------------------! SUBROUTINE poisfft( ar ) USE cpulog, & ONLY: cpu_log, & log_point_s USE indices, & ONLY: & nxl, & nxr, & nyn, & nys, & nz USE kinds USE pegrid IMPLICIT NONE REAL(wp), DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar !< #define __acc_fft_device ( defined( _OPENACC ) && ( defined ( __cuda_fft ) ) ) #if __acc_fft_device !$ACC DECLARE CREATE(ar_inv) #endif REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: xy_in !< REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: xy_out !< REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: yz_in !< REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: yz_out !< REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: zx_in !< REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: zx_out !< IF ( .NOT. poisfft_initialized ) CALL poisfft_init CALL cpu_log( log_point_s(3), 'poisfft', 'start' ) #if !__acc_fft_device !$ACC UPDATE HOST(ar) #endif #ifndef _OPENACC ! !-- Two-dimensional Fourier Transformation in x- and y-direction. IF ( npey == 1 .AND. npex > 1 ) THEN ! !-- 1d-domain-decomposition along x: !-- FFT along y and transposition y --> x CALL ffty_tr_yx( ar, ar ) ! !-- FFT along x, solving the tridiagonal system and backward FFT CALL fftx_tri_fftx( ar ) ! !-- Transposition x --> y and backward FFT along y CALL tr_xy_ffty( ar, ar ) ELSEIF ( npex == 1 .AND. npey > 1 ) THEN ! !-- 1d-domain-decomposition along y: !-- FFT along x and transposition x --> y CALL fftx_tr_xy( ar, ar ) ! !-- FFT along y, solving the tridiagonal system and backward FFT CALL ffty_tri_ffty( ar ) ! !-- Transposition y --> x and backward FFT along x CALL tr_yx_fftx( ar, ar ) ELSE #endif ALLOCATE( xy_in(nys_x:nyn_x_max,nzb_x:nzt_x,0:nx_y_max) ) ALLOCATE( zx_in(nys:nyn,nxl:nxr_x_max,1:nz_x_max) ) ALLOCATE( zx_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) ) ! !-- 2d-domain-decomposition or no decomposition (1 PE run) !-- Transposition z --> x CALL cpu_log( log_point_s(5), 'transpo forward', 'start' ) CALL resort_for_zx( ar, zx_in ) CALL transpose_zx( zx_in, zx_out ) CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' ) CALL cpu_log( log_point_s(4), 'fft_x', 'start' ) IF ( temperton_fft_vec ) THEN ! !-- Vector version outputs a transformed array ar_inv that does not require resorting !-- (which is done for ar further below) CALL fft_x( zx_out, 'forward', ar_inv=xy_in) ELSE CALL fft_x( zx_out, 'forward') ENDIF CALL cpu_log( log_point_s(4), 'fft_x', 'pause' ) ! !-- Transposition x --> y ALLOCATE( xy_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) ) ALLOCATE( yz_in(nxl_y:nxr_y,nzb_y:nzt_y_max,0:ny_z_max) ) CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' ) IF( .NOT. temperton_fft_vec ) CALL resort_for_xy( zx_out, xy_in ) CALL transpose_xy( xy_in, xy_out ) CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' ) CALL cpu_log( log_point_s(7), 'fft_y', 'start' ) IF ( temperton_fft_vec .AND. .NOT. non_uniform_subdomain ) THEN ! !-- Input array ar_inv from fft_x can be directly used here. !-- The output (also in array ar_inv) does not require resorting below. !-- This is currently only programmed for uniform sundomains. !-- TODO: Please check performance on NEC to decide if this branch is also possible for !-- nonuniform subdomains. CALL fft_y( xy_out, 'forward', ar_inv = yz_in, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, & nxl_y_l = nxl_y, nxr_y_l = nxr_y ) ELSE CALL fft_y( xy_out, 'forward', ar_tr = xy_out, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, & nxl_y_l = nxl_y, nxr_y_l = nxr_y ) ENDIF CALL cpu_log( log_point_s(7), 'fft_y', 'pause' ) ! !-- Transposition y --> z ALLOCATE( yz_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz) ) CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' ) IF ( .NOT. temperton_fft_vec .OR. non_uniform_subdomain ) THEN CALL resort_for_yz( xy_out, yz_in ) ENDIF CALL transpose_yz( yz_in, yz_out ) CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' ) ! !-- Solve the tridiagonal equation system along z CALL cpu_log( log_point_s(6), 'tridia', 'start' ) CALL tridia_substi( yz_out ) CALL cpu_log( log_point_s(6), 'tridia', 'stop' ) ! ! !-- Inverse Fourier Transformation !-- Transposition z --> y CALL cpu_log( log_point_s(8), 'transpo invers', 'start' ) CALL transpose_zy( yz_out, yz_in ) ! !-- The fft_y below (vector branch) can directly process ar_inv (i.e. does not require a !-- resorting) IF ( .NOT. temperton_fft_vec .OR. non_uniform_subdomain ) THEN CALL resort_for_zy( yz_in, xy_out ) ENDIF CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' ) DEALLOCATE( yz_out ) CALL cpu_log( log_point_s(7), 'fft_y', 'continue' ) IF ( temperton_fft_vec .AND. .NOT. non_uniform_subdomain ) THEN ! !-- Output array ar_inv can be used as input to the below fft_x routine without resorting CALL fft_y( xy_out, 'backward', ar_inv = yz_in, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y,& nxl_y_l = nxl_y, nxr_y_l = nxr_y ) ELSE CALL fft_y( xy_out, 'backward', ar_tr = xy_out, nxl_y_bound = nxl_y, nxr_y_bound = nxr_y,& nxl_y_l = nxl_y, nxr_y_l = nxr_y ) ENDIF DEALLOCATE( yz_in ) CALL cpu_log( log_point_s(7), 'fft_y', 'stop' ) ! !-- Transposition y --> x CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' ) CALL transpose_yx( xy_out, xy_in ) IF ( .NOT. temperton_fft_vec ) CALL resort_for_yx( xy_in, zx_out ) CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' ) DEALLOCATE( xy_out ) CALL cpu_log( log_point_s(4), 'fft_x', 'continue' ) IF ( temperton_fft_vec ) THEN CALL fft_x( zx_out, 'backward', ar_inv=xy_in ) ELSE CALL fft_x( zx_out, 'backward' ) ENDIF DEALLOCATE( xy_in ) CALL cpu_log( log_point_s(4), 'fft_x', 'stop' ) ! !-- Transposition x --> z CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' ) CALL transpose_xz( zx_out, zx_in ) CALL resort_for_xz( zx_in, ar ) CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' ) DEALLOCATE( zx_in ) DEALLOCATE( zx_out ) #ifndef _OPENACC ENDIF #endif #if !__acc_fft_device !$ACC UPDATE DEVICE(ar) #endif CALL cpu_log( log_point_s(3), 'poisfft', 'stop' ) END SUBROUTINE poisfft !--------------------------------------------------------------------------------------------------! ! Description: ! ------------ !> Fourier-transformation along y with subsequent transposition y --> x for a 1d-decomposition !> along x. !> !> @attention The performance of this routine is much faster on the NEC-SX6, if the first index of !> work_ffty_vec is odd. Otherwise memory bank conflicts may occur (especially if the !> index is a multiple of 128). That's why work_ffty_vec is dimensioned as 0:ny+1. !> Of course, this will not work if users are using an odd number of gridpoints along y. !--------------------------------------------------------------------------------------------------! SUBROUTINE ffty_tr_yx( f_in, f_out ) USE control_parameters, & ONLY: loop_optimization USE cpulog, & ONLY: cpu_log, & log_point_s USE kinds USE pegrid IMPLICIT NONE INTEGER(iwp) :: i !< INTEGER(iwp) :: iend !< INTEGER(iwp) :: iouter !< INTEGER(iwp) :: ir !< INTEGER(iwp) :: j !< INTEGER(iwp) :: k !< INTEGER(iwp), PARAMETER :: stridex = 4 !< REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in !< REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,npex) :: f_out !< REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !< REAL(wp), DIMENSION(:,:), ALLOCATABLE :: work_ffty !< REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: work_ffty_vec !< ! !-- Carry out the FFT along y, where all data are present due to the 1d-decomposition along x. !-- Resort the data in a way that x becomes the first index. CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' ) IF ( loop_optimization == 'vector' ) THEN ALLOCATE( work_ffty_vec(0:ny+1,1:nz,nxl:nxr) ) ! !-- Code optimized for vector processors !$OMP PARALLEL PRIVATE ( i, j, k ) !$OMP DO DO i = nxl, nxr DO j = 0, ny DO k = 1, nz work_ffty_vec(j,k,i) = f_in(k,j,i) ENDDO ENDDO CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' ) ENDDO !$OMP DO DO k = 1, nz DO j = 0, ny DO i = nxl, nxr work(i,k,j) = work_ffty_vec(j,k,i) ENDDO ENDDO ENDDO !$OMP END PARALLEL DEALLOCATE( work_ffty_vec ) ELSE ! !-- Cache optimized code. ALLOCATE( work_ffty(0:ny,stridex) ) ! !-- The i-(x-)direction is split into a strided outer loop and an inner loop for better cache !-- performance !$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty) !$OMP DO DO iouter = nxl, nxr, stridex iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop DO k = 1, nz DO i = iouter, iend ir = i-iouter+1 ! Counter within a stride DO j = 0, ny work_ffty(j,ir) = f_in(k,j,i) ENDDO ! !-- FFT along y CALL fft_y_1d( work_ffty(:,ir), 'forward' ) ENDDO ! !-- Resort DO j = 0, ny DO i = iouter, iend work(i,k,j) = work_ffty(j,i-iouter+1) ENDDO ENDDO ENDDO ENDDO !$OMP END PARALLEL DEALLOCATE( work_ffty ) ENDIF CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' ) ! !-- Transpose array #if defined( __parallel ) CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' ) IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, f_out(1,1,nys_x,1), & sendrecvcount_xy, MPI_REAL, comm1dx, ierr ) CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) #else ! !-- Next line required to avoid compile error about unused dummy argument in serial mode i = SIZE( f_out ) #endif END SUBROUTINE ffty_tr_yx !--------------------------------------------------------------------------------------------------! ! Description: ! ------------ !> Transposition x --> y with a subsequent backward Fourier transformation for a 1d-decomposition !> along x !--------------------------------------------------------------------------------------------------! SUBROUTINE tr_xy_ffty( f_in, f_out ) USE control_parameters, & ONLY: loop_optimization USE cpulog, & ONLY: cpu_log, & log_point_s USE kinds USE pegrid IMPLICIT NONE INTEGER(iwp) :: i !< INTEGER(iwp) :: iend !< INTEGER(iwp) :: iouter !< INTEGER(iwp) :: ir !< INTEGER(iwp) :: j !< INTEGER(iwp) :: k !< INTEGER(iwp), PARAMETER :: stridex = 4 !< REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,npex) :: f_in !< REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out !< REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !< REAL(wp), DIMENSION(:,:), ALLOCATABLE :: work_ffty !< REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: work_ffty_vec !< ! !-- Transpose array #if defined( __parallel ) CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' ) IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, work(nxl,1,0), & sendrecvcount_xy, MPI_REAL, comm1dx, ierr ) CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) #else ! !-- Next line required to avoid compile error about unused dummy argument in serial mode i = SIZE( f_in ) #endif ! !-- Resort the data in a way that y becomes the first index and carry out the backward fft along y. CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' ) IF ( loop_optimization == 'vector' ) THEN ALLOCATE( work_ffty_vec(0:ny+1,1:nz,nxl:nxr) ) ! !-- Code optimized for vector processors !$OMP PARALLEL PRIVATE ( i, j, k ) !$OMP DO DO k = 1, nz DO j = 0, ny DO i = nxl, nxr work_ffty_vec(j,k,i) = work(i,k,j) ENDDO ENDDO ENDDO !$OMP DO DO i = nxl, nxr CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' ) DO j = 0, ny DO k = 1, nz f_out(k,j,i) = work_ffty_vec(j,k,i) ENDDO ENDDO ENDDO !$OMP END PARALLEL DEALLOCATE( work_ffty_vec ) ELSE ! !-- Cache optimized code. ALLOCATE( work_ffty(0:ny,stridex) ) ! !-- The i-(x-)direction is split into a strided outer loop and an inner loop for better cache !-- performance !$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty ) !$OMP DO DO iouter = nxl, nxr, stridex iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop DO k = 1, nz ! !-- Resort DO j = 0, ny DO i = iouter, iend work_ffty(j,i-iouter+1) = work(i,k,j) ENDDO ENDDO DO i = iouter, iend ! !-- FFT along y ir = i-iouter+1 ! Counter within a stride CALL fft_y_1d( work_ffty(:,ir), 'backward' ) DO j = 0, ny f_out(k,j,i) = work_ffty(j,ir) ENDDO ENDDO ENDDO ENDDO !$OMP END PARALLEL DEALLOCATE( work_ffty ) ENDIF CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' ) END SUBROUTINE tr_xy_ffty !--------------------------------------------------------------------------------------------------! ! Description: ! ------------ !> FFT along x, solution of the tridiagonal system and backward FFT for a 1d-decomposition along x !> !> @warning This subroutine may still not work for hybrid parallelization with OpenMP (for possible !> necessary changes see the original routine poisfft_hybrid, developed by Klaus Ketelsen, !> May 2002) !--------------------------------------------------------------------------------------------------! SUBROUTINE fftx_tri_fftx( ar ) USE control_parameters, & ONLY: loop_optimization USE cpulog, & ONLY: cpu_log, & log_point_s USE grid_variables, & ONLY: ddx2, & ddy2 USE kinds USE pegrid IMPLICIT NONE INTEGER(iwp) :: i !< INTEGER(iwp) :: j !< INTEGER(iwp) :: k !< INTEGER(iwp) :: m !< INTEGER(iwp) :: n !< !$ INTEGER(iwp) :: omp_get_thread_num !< INTEGER(iwp) :: tn !< REAL(wp), DIMENSION(0:nx) :: work_fftx !< REAL(wp), DIMENSION(0:nx,1:nz) :: work_trix !< REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,npex) :: ar !< REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !< CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' ) ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) ) tn = 0 ! Default thread number in case of one thread !$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix ) DO j = nys_x, nyn_x !$ tn = omp_get_thread_num() IF ( loop_optimization == 'vector' ) THEN ! !-- Code optimized for vector processors DO k = 1, nz m = 0 DO n = 1, npex DO i = 1, nnx work_trix(m,k) = ar(i,k,j,n) m = m + 1 ENDDO ENDDO ENDDO CALL fft_x_m( work_trix, 'forward' ) ELSE ! !-- Cache optimized code DO k = 1, nz m = 0 DO n = 1, npex DO i = 1, nnx work_fftx(m) = ar(i,k,j,n) m = m + 1 ENDDO ENDDO CALL fft_x_1d( work_fftx, 'forward' ) DO i = 0, nx work_trix(i,k) = work_fftx(i) ENDDO ENDDO ENDIF ! !-- Solve the linear equation system CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) ) IF ( loop_optimization == 'vector' ) THEN ! !-- Code optimized for vector processors CALL fft_x_m( work_trix, 'backward' ) DO k = 1, nz m = 0 DO n = 1, npex DO i = 1, nnx ar(i,k,j,n) = work_trix(m,k) m = m + 1 ENDDO ENDDO ENDDO ELSE ! !-- Cache optimized code DO k = 1, nz DO i = 0, nx work_fftx(i) = work_trix(i,k) ENDDO CALL fft_x_1d( work_fftx, 'backward' ) m = 0 DO n = 1, npex DO i = 1, nnx ar(i,k,j,n) = work_fftx(m) m = m + 1 ENDDO ENDDO ENDDO ENDIF ENDDO DEALLOCATE( tri ) CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' ) END SUBROUTINE fftx_tri_fftx !--------------------------------------------------------------------------------------------------! ! Description: ! ------------ !> Fourier-transformation along x with subsequent transposition x --> y for a 1d-decomposition !> along y. !> !> @attention NEC-branch of this routine may significantly profit from further optimizations. So !> far, performance is much worse than for routine ffty_tr_yx (more than three times !> slower). !--------------------------------------------------------------------------------------------------! SUBROUTINE fftx_tr_xy( f_in, f_out ) USE control_parameters, & ONLY: loop_optimization USE cpulog, & ONLY: cpu_log, & log_point_s USE kinds USE pegrid IMPLICIT NONE INTEGER(iwp) :: i !< INTEGER(iwp) :: j !< INTEGER(iwp) :: k !< REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !< REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_in !< REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,npey) :: f_out !< REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !< ! !-- Carry out the FFT along x, where all data are present due to the 1d-decomposition along y. !-- Resort the data in a way that y becomes the first index. CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' ) IF ( loop_optimization == 'vector' ) THEN ! !-- Code for vector processors !$OMP PARALLEL PRIVATE ( i, j, k ) !$OMP DO DO i = 0, nx DO j = nys, nyn DO k = 1, nz work_fftx(i,k,j) = f_in(k,j,i) ENDDO ENDDO ENDDO !$OMP DO DO j = nys, nyn CALL fft_x_m( work_fftx(:,:,j), 'forward' ) DO k = 1, nz DO i = 0, nx work(j,k,i) = work_fftx(i,k,j) ENDDO ENDDO ENDDO !$OMP END PARALLEL ELSE ! !-- Cache optimized code (there might still be a potential for better optimization). !$OMP PARALLEL PRIVATE (i,j,k) !$OMP DO DO i = 0, nx DO j = nys, nyn DO k = 1, nz work_fftx(i,k,j) = f_in(k,j,i) ENDDO ENDDO ENDDO !$OMP DO DO j = nys, nyn DO k = 1, nz CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' ) DO i = 0, nx work(j,k,i) = work_fftx(i,k,j) ENDDO ENDDO ENDDO !$OMP END PARALLEL ENDIF CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' ) ! !-- Transpose array #if defined( __parallel ) CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' ) IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, f_out(1,1,nxl_y,1), & sendrecvcount_xy, MPI_REAL, comm1dy, ierr ) CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) #else ! !-- Next line required to avoid compile error about unused dummy argument in serial mode i = SIZE( f_out ) #endif END SUBROUTINE fftx_tr_xy !--------------------------------------------------------------------------------------------------! ! Description: ! ------------ !> Transposition y --> x with a subsequent backward Fourier transformation for a 1d-decomposition !> along x. !--------------------------------------------------------------------------------------------------! SUBROUTINE tr_yx_fftx( f_in, f_out ) USE control_parameters, & ONLY: loop_optimization USE cpulog, & ONLY: cpu_log, & log_point_s USE kinds USE pegrid IMPLICIT NONE INTEGER(iwp) :: i !< INTEGER(iwp) :: j !< INTEGER(iwp) :: k !< REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,npey) :: f_in !< REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_out !< REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !< REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !< ! !-- Transpose array #if defined( __parallel ) CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' ) IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr ) CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, work(nys,1,0), & sendrecvcount_xy, MPI_REAL, comm1dy, ierr ) CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' ) #else ! !-- Next line required to avoid compile error about unused dummy argument in serial mode i = SIZE( f_in ) #endif ! !-- Carry out the FFT along x, where all data are present due to the 1d-decomposition along y. !-- Resort the data in a way that y becomes the first index. CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' ) IF ( loop_optimization == 'vector' ) THEN ! !-- Code optimized for vector processors !$OMP PARALLEL PRIVATE ( i, j, k ) !$OMP DO DO j = nys, nyn DO k = 1, nz DO i = 0, nx work_fftx(i,k,j) = work(j,k,i) ENDDO ENDDO CALL fft_x_m( work_fftx(:,:,j), 'backward' ) ENDDO !$OMP DO DO i = 0, nx DO j = nys, nyn DO k = 1, nz f_out(k,j,i) = work_fftx(i,k,j) ENDDO ENDDO ENDDO !$OMP END PARALLEL ELSE ! !-- Cache optimized code (there might be still a potential for better optimization). !$OMP PARALLEL PRIVATE (i,j,k) !$OMP DO DO j = nys, nyn DO k = 1, nz DO i = 0, nx work_fftx(i,k,j) = work(j,k,i) ENDDO CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' ) ENDDO ENDDO !$OMP DO DO i = 0, nx DO j = nys, nyn DO k = 1, nz f_out(k,j,i) = work_fftx(i,k,j) ENDDO ENDDO ENDDO !$OMP END PARALLEL ENDIF CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' ) END SUBROUTINE tr_yx_fftx !--------------------------------------------------------------------------------------------------! ! Description: ! ------------ !> FFT along y, solution of the tridiagonal system and backward FFT for a 1d-decomposition along y. !> !> @warning This subroutine may still not work for hybrid parallelization with OpenMP (for possible !> necessary changes see the original routine poisfft_hybrid, developed by Klaus Ketelsen, !> May 2002) !--------------------------------------------------------------------------------------------------! SUBROUTINE ffty_tri_ffty( ar ) USE control_parameters, & ONLY: loop_optimization USE cpulog, & ONLY: cpu_log, & log_point_s USE grid_variables, & ONLY: ddx2, & ddy2 USE kinds USE pegrid IMPLICIT NONE INTEGER(iwp) :: i !< INTEGER(iwp) :: j !< INTEGER(iwp) :: k !< INTEGER(iwp) :: m !< INTEGER(iwp) :: n !< !$ INTEGER(iwp) :: omp_get_thread_num !< INTEGER(iwp) :: tn !< REAL(wp), DIMENSION(0:ny) :: work_ffty !< REAL(wp), DIMENSION(0:ny,1:nz) :: work_triy !< REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,npey) :: ar !< REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !< CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' ) ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) ) tn = 0 ! Default thread number in case of one thread !$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy ) DO i = nxl_y, nxr_y !$ tn = omp_get_thread_num() IF ( loop_optimization == 'vector' ) THEN ! !-- Code optimized for vector processors DO k = 1, nz m = 0 DO n = 1, npey DO j = 1, nny work_triy(m,k) = ar(j,k,i,n) m = m + 1 ENDDO ENDDO ENDDO CALL fft_y_m( work_triy, ny, 'forward' ) ELSE ! !-- Cache optimized code DO k = 1, nz m = 0 DO n = 1, npey DO j = 1, nny work_ffty(m) = ar(j,k,i,n) m = m + 1 ENDDO ENDDO CALL fft_y_1d( work_ffty, 'forward' ) DO j = 0, ny work_triy(j,k) = work_ffty(j) ENDDO ENDDO ENDIF ! !-- Solve the linear equation system CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) ) IF ( loop_optimization == 'vector' ) THEN ! !-- Code optimized for vector processors CALL fft_y_m( work_triy, ny, 'backward' ) DO k = 1, nz m = 0 DO n = 1, npey DO j = 1, nny ar(j,k,i,n) = work_triy(m,k) m = m + 1 ENDDO ENDDO ENDDO ELSE ! !-- Cache optimized code DO k = 1, nz DO j = 0, ny work_ffty(j) = work_triy(j,k) ENDDO CALL fft_y_1d( work_ffty, 'backward' ) m = 0 DO n = 1, npey DO j = 1, nny ar(j,k,i,n) = work_ffty(m) m = m + 1 ENDDO ENDDO ENDDO ENDIF ENDDO DEALLOCATE( tri ) CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' ) END SUBROUTINE ffty_tri_ffty END MODULE poisfft_mod