ref: 7cbb75eac668fae4078fa93271e09d51d65a8990
parent: 770e48ac2b7cb8600585842a88dc5185cc434ab6
author: Sindre Aamås <[email protected]>
date: Wed Jun 1 19:36:06 EDT 2016
[Processing] Pick dyadic downsample function based on stride. Assume that data can be written into the padding area following each line. This enables the use of faster routines for more cases. Align downsample buffer stride to a multiple of 32. With this, all strides used should be a multiple of 16, which means that use of narrower downsample routines can be dropped altogether.
--- a/codec/processing/src/downsample/downsample.cpp
+++ b/codec/processing/src/downsample/downsample.cpp
@@ -32,6 +32,7 @@
#include "downsample.h"
#include "cpu.h"
+#include <cassert>
WELSVP_NAMESPACE_BEGIN
#define MAX_SAMPLE_WIDTH 1920
@@ -75,11 +76,10 @@
WelsFree (m_pSampleBuffer[i][2]);
}
}
+
void CDownsampling::InitDownsampleFuncs (SDownsampleFuncs& sDownsampleFunc, int32_t iCpuFlag) {
- sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsampler_c;
- sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsampler_c;
- sDownsampleFunc.pfHalfAverage[2] = DyadicBilinearDownsampler_c;
- sDownsampleFunc.pfHalfAverage[3] = DyadicBilinearDownsampler_c;
+ sDownsampleFunc.pfHalfAverageWidthx32 = DyadicBilinearDownsampler_c;
+ sDownsampleFunc.pfHalfAverageWidthx16 = DyadicBilinearDownsampler_c;
sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_c;
sDownsampleFunc.pfQuarterDownsampler = DyadicBilinearQuarterDownsampler_c;
sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsampler_c;
@@ -86,9 +86,8 @@
sDownsampleFunc.pfGeneralRatioLuma = GeneralBilinearFastDownsampler_c;
#if defined(X86_ASM)
if (iCpuFlag & WELS_CPU_SSE) {
- sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_sse;
- sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsamplerWidthx16_sse;
- sDownsampleFunc.pfHalfAverage[2] = DyadicBilinearDownsamplerWidthx8_sse;
+ sDownsampleFunc.pfHalfAverageWidthx32 = DyadicBilinearDownsamplerWidthx32_sse;
+ sDownsampleFunc.pfHalfAverageWidthx16 = DyadicBilinearDownsamplerWidthx16_sse;
sDownsampleFunc.pfQuarterDownsampler = DyadicBilinearQuarterDownsampler_sse;
}
if (iCpuFlag & WELS_CPU_SSE2) {
@@ -96,15 +95,15 @@
sDownsampleFunc.pfGeneralRatioLuma = GeneralBilinearFastDownsamplerWrap_sse2;
}
if (iCpuFlag & WELS_CPU_SSSE3) {
- sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_ssse3;
- sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsamplerWidthx16_ssse3;
+ sDownsampleFunc.pfHalfAverageWidthx32 = DyadicBilinearDownsamplerWidthx32_ssse3;
+ sDownsampleFunc.pfHalfAverageWidthx16 = DyadicBilinearDownsamplerWidthx16_ssse3;
sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_ssse3;
sDownsampleFunc.pfQuarterDownsampler = DyadicBilinearQuarterDownsampler_ssse3;
sDownsampleFunc.pfGeneralRatioLuma = GeneralBilinearFastDownsamplerWrap_ssse3;
}
if (iCpuFlag & WELS_CPU_SSE41) {
- sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_sse4;
- sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsamplerWidthx16_sse4;
+ sDownsampleFunc.pfHalfAverageWidthx32 = DyadicBilinearDownsamplerWidthx32_sse4;
+ sDownsampleFunc.pfHalfAverageWidthx16 = DyadicBilinearDownsamplerWidthx16_sse4;
sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_sse4;
sDownsampleFunc.pfQuarterDownsampler = DyadicBilinearQuarterDownsampler_sse4;
sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_sse41;
@@ -117,10 +116,8 @@
#if defined(HAVE_NEON)
if (iCpuFlag & WELS_CPU_NEON) {
- sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_neon;
- sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsampler_neon;
- sDownsampleFunc.pfHalfAverage[2] = DyadicBilinearDownsampler_neon;
- sDownsampleFunc.pfHalfAverage[3] = DyadicBilinearDownsampler_neon;
+ sDownsampleFunc.pfHalfAverageWidthx32 = DyadicBilinearDownsamplerWidthx32_neon;
+ sDownsampleFunc.pfHalfAverageWidthx16 = DyadicBilinearDownsampler_neon;
sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_neon;
sDownsampleFunc.pfQuarterDownsampler = DyadicBilinearQuarterDownsampler_neon;
sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_neon;
@@ -130,10 +127,8 @@
#if defined(HAVE_NEON_AARCH64)
if (iCpuFlag & WELS_CPU_NEON) {
- sDownsampleFunc.pfHalfAverage[0] = DyadicBilinearDownsamplerWidthx32_AArch64_neon;
- sDownsampleFunc.pfHalfAverage[1] = DyadicBilinearDownsampler_AArch64_neon;
- sDownsampleFunc.pfHalfAverage[2] = DyadicBilinearDownsampler_AArch64_neon;
- sDownsampleFunc.pfHalfAverage[3] = DyadicBilinearDownsampler_AArch64_neon;
+ sDownsampleFunc.pfHalfAverageWidthx32 = DyadicBilinearDownsamplerWidthx32_AArch64_neon;
+ sDownsampleFunc.pfHalfAverageWidthx16 = DyadicBilinearDownsampler_AArch64_neon;
sDownsampleFunc.pfOneThirdDownsampler = DyadicBilinearOneThirdDownsampler_AArch64_neon;
sDownsampleFunc.pfQuarterDownsampler = DyadicBilinearQuarterDownsampler_AArch64_neon;
sDownsampleFunc.pfGeneralRatioChroma = GeneralBilinearAccurateDownsamplerWrap_AArch64_neon;
@@ -159,14 +154,11 @@
if (iSrcWidthY > MAX_SAMPLE_WIDTH || iSrcHeightY > MAX_SAMPLE_HEIGHT || m_bNoSampleBuffer) {
if ((iSrcWidthY >> 1) == iDstWidthY && (iSrcHeightY >> 1) == iDstHeightY) {
// use half average functions
- uint8_t iAlignIndex = GetAlignedIndex (iSrcWidthY);
- m_pfDownsample.pfHalfAverage[iAlignIndex] ((uint8_t*)pDstPixMap->pPixel[0], pDstPixMap->iStride[0],
+ DownsampleHalfAverage ((uint8_t*)pDstPixMap->pPixel[0], pDstPixMap->iStride[0],
(uint8_t*)pSrcPixMap->pPixel[0], pSrcPixMap->iStride[0], iSrcWidthY, iSrcHeightY);
-
- iAlignIndex = GetAlignedIndex (iSrcWidthUV);
- m_pfDownsample.pfHalfAverage[iAlignIndex] ((uint8_t*)pDstPixMap->pPixel[1], pDstPixMap->iStride[1],
+ DownsampleHalfAverage ((uint8_t*)pDstPixMap->pPixel[1], pDstPixMap->iStride[1],
(uint8_t*)pSrcPixMap->pPixel[1], pSrcPixMap->iStride[1], iSrcWidthUV, iSrcHeightUV);
- m_pfDownsample.pfHalfAverage[iAlignIndex] ((uint8_t*)pDstPixMap->pPixel[2], pDstPixMap->iStride[2],
+ DownsampleHalfAverage ((uint8_t*)pDstPixMap->pPixel[2], pDstPixMap->iStride[2],
(uint8_t*)pSrcPixMap->pPixel[2], pSrcPixMap->iStride[2], iSrcWidthUV, iSrcHeightUV);
} else if ((iSrcWidthY >> 2) == iDstWidthY && (iSrcHeightY >> 2) == iDstHeightY) {
@@ -223,29 +215,23 @@
do {
if ((iHalfSrcWidth == iDstWidthY) && (iHalfSrcHeight == iDstHeightY)) { //end
// use half average functions
- uint8_t iAlignIndex = GetAlignedIndex (iSrcWidthY);
- m_pfDownsample.pfHalfAverage[iAlignIndex] ((uint8_t*)pDstPixMap->pPixel[0], pDstPixMap->iStride[0],
+ DownsampleHalfAverage ((uint8_t*)pDstPixMap->pPixel[0], pDstPixMap->iStride[0],
(uint8_t*)pSrcY, iSrcStrideY, iSrcWidthY, iSrcHeightY);
-
- iAlignIndex = GetAlignedIndex (iSrcWidthUV);
- m_pfDownsample.pfHalfAverage[iAlignIndex] ((uint8_t*)pDstPixMap->pPixel[1], pDstPixMap->iStride[1],
+ DownsampleHalfAverage ((uint8_t*)pDstPixMap->pPixel[1], pDstPixMap->iStride[1],
(uint8_t*)pSrcU, iSrcStrideU, iSrcWidthUV, iSrcHeightUV);
- m_pfDownsample.pfHalfAverage[iAlignIndex] ((uint8_t*)pDstPixMap->pPixel[2], pDstPixMap->iStride[2],
+ DownsampleHalfAverage ((uint8_t*)pDstPixMap->pPixel[2], pDstPixMap->iStride[2],
(uint8_t*)pSrcV, iSrcStrideV, iSrcWidthUV, iSrcHeightUV);
break;
} else if (((iHalfSrcWidth >> 1) >= iDstWidthY) && ((iHalfSrcHeight >> 1) >= iDstHeightY)) {
// use half average functions
- iDstStrideY = iHalfSrcWidth;
- iDstStrideU = iHalfSrcWidth >> 1;
- iDstStrideV = iHalfSrcWidth >> 1;
- uint8_t iAlignIndex = GetAlignedIndex (iSrcWidthY);
- m_pfDownsample.pfHalfAverage[iAlignIndex] ((uint8_t*)pDstY, iDstStrideY,
+ iDstStrideY = WELS_ALIGN (iHalfSrcWidth, 32);
+ iDstStrideU = WELS_ALIGN (iHalfSrcWidth >> 1, 32);
+ iDstStrideV = WELS_ALIGN (iHalfSrcWidth >> 1, 32);
+ DownsampleHalfAverage ((uint8_t*)pDstY, iDstStrideY,
(uint8_t*)pSrcY, iSrcStrideY, iSrcWidthY, iSrcHeightY);
-
- iAlignIndex = GetAlignedIndex (iSrcWidthUV);
- m_pfDownsample.pfHalfAverage[iAlignIndex] ((uint8_t*)pDstU, iDstStrideU,
+ DownsampleHalfAverage ((uint8_t*)pDstU, iDstStrideU,
(uint8_t*)pSrcU, iSrcStrideU, iSrcWidthUV, iSrcHeightUV);
- m_pfDownsample.pfHalfAverage[iAlignIndex] ((uint8_t*)pDstV, iDstStrideV,
+ DownsampleHalfAverage ((uint8_t*)pDstV, iDstStrideV,
(uint8_t*)pSrcV, iSrcStrideV, iSrcWidthUV, iSrcHeightUV);
pSrcY = (uint8_t*)pDstY;
@@ -258,9 +244,9 @@
iSrcHeightY = iHalfSrcHeight;
iSrcHeightUV = iHalfSrcHeight >> 1;
- iSrcStrideY = iSrcWidthY;
- iSrcStrideU = iSrcWidthUV;
- iSrcStrideV = iSrcWidthUV;
+ iSrcStrideY = iDstStrideY;
+ iSrcStrideU = iDstStrideU;
+ iSrcStrideV = iDstStrideV;
iHalfSrcWidth >>= 1;
iHalfSrcHeight >>= 1;
@@ -286,17 +272,18 @@
return RET_SUCCESS;
}
-int32_t CDownsampling::GetAlignedIndex (const int32_t kiSrcWidth) {
- int32_t iAlignIndex;
- if ((kiSrcWidth & 0x1f) == 0) // x32
- iAlignIndex = 0;
- else if ((kiSrcWidth & 0x0f) == 0) // x16
- iAlignIndex = 1;
- else if ((kiSrcWidth & 0x07) == 0) // x8
- iAlignIndex = 2;
- else
- iAlignIndex = 3;
- return iAlignIndex;
+void CDownsampling::DownsampleHalfAverage (uint8_t* pDst, int32_t iDstStride,
+ uint8_t* pSrc, int32_t iSrcStride, int32_t iSrcWidth, int32_t iSrcHeight) {
+ if ((iSrcStride & 31) == 0) {
+ assert ((iDstStride & 15) == 0);
+ m_pfDownsample.pfHalfAverageWidthx32 (pDst, iDstStride,
+ pSrc, iSrcStride, WELS_ALIGN (iSrcWidth, 32), iSrcHeight);
+ } else {
+ assert ((iSrcStride & 15) == 0);
+ assert ((iDstStride & 7) == 0);
+ m_pfDownsample.pfHalfAverageWidthx16 (pDst, iDstStride,
+ pSrc, iSrcStride, WELS_ALIGN (iSrcWidth, 16), iSrcHeight);
+ }
}
--- a/codec/processing/src/downsample/downsample.h
+++ b/codec/processing/src/downsample/downsample.h
@@ -73,8 +73,8 @@
SpecificDownsampleFunc DyadicBilinearQuarterDownsampler_c;
typedef struct {
- // align_index: 0 = x32; 1 = x16; 2 = x8; 3 = common case left;
- PHalveDownsampleFunc pfHalfAverage[4];
+ PHalveDownsampleFunc pfHalfAverageWidthx32;
+ PHalveDownsampleFunc pfHalfAverageWidthx16;
PSpecificDownsampleFunc pfOneThirdDownsampler;
PSpecificDownsampleFunc pfQuarterDownsampler;
PGeneralDownsampleFunc pfGeneralRatioLuma;
@@ -185,7 +185,8 @@
private:
void InitDownsampleFuncs (SDownsampleFuncs& sDownsampleFunc, int32_t iCpuFlag);
- int32_t GetAlignedIndex (const int32_t kiSrcWidth);
+ void DownsampleHalfAverage (uint8_t* pDst, int32_t iDstStride,
+ uint8_t* pSrc, int32_t iSrcStride, int32_t iSrcWidth, int32_t iSrcHeight);
bool AllocateSampleBuffer();
void FreeSampleBuffer();
private: