/*
 * Z-direction pass of a 3D smoothing filter, followed by a per-slice 2D pass.
 *
 * For every output index z in [0, iDim[2] - kernelSize[0] + 1) the function
 *   1. accumulates kernelSize[0] consecutive source slices (z .. z+kernelSize[0]-1),
 *      each weighted by pKernel[k], into slice (z + kernelSize[0]/2) of pBuffer;
 *   2. zeroes the calling thread's scratch slice yBuf[tid];
 *   3. hands the buffered slice to Conv2D_Fuse together with the matching
 *      slice of pDst.
 *
 * Parameters:
 *   pSrc       input volume, slice stride iDim[0] * iDim[1] floats
 *   iDim       volume extents; iDim[2] is the number of slices
 *   pKernel    filter weights; kernelSize[0] of them are read here
 *   kernelSize only kernelSize[0] is used (for the z pass and for Conv2D_Fuse)
 *   pDst       output volume, same slice stride as pSrc
 *   pBuffer    intermediate volume, same slice stride as pSrc
 *
 * Slices of pBuffer/pDst outside [kernelSize[0]/2, kernelSize[0]/2 + zCount)
 * are not written by this function.
 *
 * NOTE(review): yBuf and Conv2D_Fuse are defined elsewhere. This code assumes
 * yBuf has one entry per OpenMP thread (the pragma requests 16) and that each
 * entry holds at least iDim[0] * iDim[1] floats -- confirm against the
 * definition. Conv2D_Fuse presumably performs the in-plane (Y/X) passes.
 */
void GaussSmoothCPU3DOptZYXSplitZIntrinsic(float* pSrc, int iDim[3], float* pKernel, int kernelSize[3], float* pDst, float* pBuffer)
{
    /* Reject empty/negative extents up front so the size_t conversions below
     * can never turn a negative int into a huge offset. */
    if (iDim[0] <= 0 || iDim[1] <= 0 || iDim[2] <= 0 || kernelSize[0] <= 0)
    {
        return;
    }

    /* Offsets are computed in size_t: z * sliceSize overflows int for large volumes. */
    const size_t sliceSize = (size_t)iDim[1] * (size_t)iDim[0];
    const int kernelLen = kernelSize[0];
    const int nCenter = kernelLen / 2;
    const int zCount = iDim[2] - kernelLen + 1;

    /* 4 AVX registers x 8 floats are processed per inner iteration. */
    const size_t InnerSize = 16 * 2;
    /* Only the part of a slice that is a whole number of 32-float groups is
     * vectorised; the rest is finished by the scalar tail loop so that no
     * load or store ever crosses the end of the slice. */
    const size_t vecEnd = sliceSize - (sliceSize % InnerSize);

#pragma omp parallel for num_threads(16) schedule(dynamic)
    for (int z = 0; z < zCount; z++)
    {
        const size_t outOffset = (size_t)(z + nCenter) * sliceSize;
        float* pBuffSlice = pBuffer + outOffset;
        float* pDstSlice = pDst + outOffset;
        const float* pSrcBase = pSrc + (size_t)z * sliceSize;

        for (size_t iSliceIt = 0; iSliceIt < vecEnd; iSliceIt += InnerSize)
        {
            const float* pSrcSlice = pSrcBase + iSliceIt;
            __m256 DST1 = _mm256_setzero_ps();
            __m256 DST2 = _mm256_setzero_ps();
            __m256 DST3 = _mm256_setzero_ps();
            __m256 DST4 = _mm256_setzero_ps();

            for (int kx = 0; kx < kernelLen; kx++)
            {
                /* Same in-plane position, kx slices further along z. */
                const float* pTap = pSrcSlice + (size_t)kx * sliceSize;
                __m256 SRC1 = _mm256_loadu_ps(pTap);
                __m256 SRC2 = _mm256_loadu_ps(pTap + 8);
                __m256 SRC3 = _mm256_loadu_ps(pTap + 16);
                __m256 SRC4 = _mm256_loadu_ps(pTap + 24);
                __m256 KER = _mm256_broadcast_ss(pKernel + kx);
                DST1 = _mm256_fmadd_ps(KER, SRC1, DST1);
                DST2 = _mm256_fmadd_ps(KER, SRC2, DST2);
                DST3 = _mm256_fmadd_ps(KER, SRC3, DST3);
                DST4 = _mm256_fmadd_ps(KER, SRC4, DST4);
            }

            /* Unaligned stores: the slice start is only 32-byte aligned when
             * pBuffer is aligned AND sliceSize is a multiple of 8, neither of
             * which this function can guarantee. */
            _mm256_storeu_ps(pBuffSlice + iSliceIt, DST1);
            _mm256_storeu_ps(pBuffSlice + iSliceIt + 8, DST2);
            _mm256_storeu_ps(pBuffSlice + iSliceIt + 16, DST3);
            _mm256_storeu_ps(pBuffSlice + iSliceIt + 24, DST4);
        }

        /* Scalar tail for slice sizes that are not a multiple of InnerSize. */
        for (size_t i = vecEnd; i < sliceSize; i++)
        {
            float acc = 0.0f;
            for (int kx = 0; kx < kernelLen; kx++)
            {
                acc += pKernel[kx] * pSrcBase[i + (size_t)kx * sliceSize];
            }
            pBuffSlice[i] = acc;
        }

        /* Per-thread scratch slice is cleared before every Conv2D_Fuse call. */
        int tid = omp_get_thread_num();
        memset(yBuf[tid], 0, sizeof(float) * sliceSize);
        Conv2D_Fuse(pBuffSlice, iDim, pKernel, kernelSize[0], yBuf[tid], pDstSlice, NULL);
    }
}
// Reposted content: please credit the original source when republishing: https://lol.8miu.com/read-2259.html