shithub: openh264

--- a/codec/build/iOS/enc/welsenc/welsenc.xcodeproj/project.pbxproj

+++ b/codec/build/iOS/enc/welsenc/welsenc.xcodeproj/project.pbxproj

@@ -47,6 +47,7 @@

 		4CE4472E18BC605C0017DF25 /* welsEncoderExt.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 4CE4470618BC605C0017DF25 /* welsEncoderExt.cpp */; };

 		9AED665019469FC1009A3567 /* welsCodecTrace.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 9AED664C19469FC1009A3567 /* welsCodecTrace.cpp */; };

 		9AED66661946A2B3009A3567 /* utils.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 9AED66651946A2B3009A3567 /* utils.cpp */; };

+		F5617A50196A833A006E2B20 /* reconstruct_aarch64_neon.S in Sources */ = {isa = PBXBuildFile; fileRef = F5617A4F196A833A006E2B20 /* reconstruct_aarch64_neon.S */; };

 /* End PBXBuildFile section */

 /* Begin PBXCopyFilesBuildPhase section */

@@ -155,6 +156,7 @@

 		9AED664C19469FC1009A3567 /* welsCodecTrace.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = welsCodecTrace.cpp; path = ../../../common/src/welsCodecTrace.cpp; sourceTree = "<group>"; };

 		9AED66651946A2B3009A3567 /* utils.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = utils.cpp; path = ../../../common/src/utils.cpp; sourceTree = "<group>"; };

 		9AED66671946A2C4009A3567 /* utils.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = utils.h; path = ../../../common/inc/utils.h; sourceTree = "<group>"; };

+		F5617A4F196A833A006E2B20 /* reconstruct_aarch64_neon.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = reconstruct_aarch64_neon.S; path = arm64/reconstruct_aarch64_neon.S; sourceTree = "<group>"; };

 /* End PBXFileReference section */

 /* Begin PBXFrameworksBuildPhase section */

@@ -184,6 +186,7 @@

 		4CB8F2B219235FAC005D6386 /* arm64 */ = {

 			isa = PBXGroup;

 			children = (

+				F5617A4F196A833A006E2B20 /* reconstruct_aarch64_neon.S */,

 				4C23BC5F195A77E0003B81FC /* intra_pred_sad_3_opt_aarch64_neon.S */,

 				4CBC1B82194ACBB400214D9E /* intra_pred_aarch64_neon.S */,

 				4CB8F2B319235FC5005D6386 /* pixel_aarch64_neon.S */,

@@ -430,6 +433,7 @@

 				4CE4470E18BC605C0017DF25 /* au_set.cpp in Sources */,

 				4CBC1B83194ACBB400214D9E /* intra_pred_aarch64_neon.S in Sources */,

 				4CE4471718BC605C0017DF25 /* mc.cpp in Sources */,

+				F5617A50196A833A006E2B20 /* reconstruct_aarch64_neon.S in Sources */,

 				4CE4472918BC605C0017DF25 /* svc_set_mb_syn_cavlc.cpp in Sources */,

 				4CE4471818BC605C0017DF25 /* md.cpp in Sources */,

 				4CE4471B18BC605C0017DF25 /* nal_encap.cpp in Sources */,

--- /dev/null

+++ b/codec/encoder/core/arm64/reconstruct_aarch64_neon.S

@@ -1,0 +1,947 @@

+/*!

+ * \copy

+ *     Copyright (c)  2013, Cisco Systems

+ *     All rights reserved.

+ *

+ *     Redistribution and use in source and binary forms, with or without

+ *     modification, are permitted provided that the following conditions

+ *     are met:

+ *

+ *        * Redistributions of source code must retain the above copyright

+ *          notice, this list of conditions and the following disclaimer.

+ *

+ *        * Redistributions in binary form must reproduce the above copyright

+ *          notice, this list of conditions and the following disclaimer in

+ *          the documentation and/or other materials provided with the

+ *          distribution.

+ *

+ *     THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS

+ *     "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT

+ *     LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS

+ *     FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE

+ *     COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,

+ *     INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,

+ *     BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;

+ *     LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER

+ *     CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT

+ *     LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN

+ *     ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE

+ *     POSSIBILITY OF SUCH DAMAGE.

+ *

+ */

+#ifdef  HAVE_NEON_AARCH64

+.text

+#include "arm_arch64_common_macro.S"

+#ifdef __APPLE__

+.macro ZERO_COUNT_IN_2_QUARWORD

+//  {   //  Count the zero-valued 16-bit coefficients in two vectors. input: $0 = coef_0, $1 = coef_1 (both clobbered); output: $2 = scalar byte register receiving the count (0..16)

+    cmeq    $0.8h, $0.8h, #0                // halfword = 0xFFFF where coef == 0, else 0

+    cmeq    $1.8h, $1.8h, #0

+    uzp1    $0.16b, $0.16b, $1.16b          // keep the low byte of each of the 16 halfword masks

+    ushr    $0.16b, $0.16b, 7               // 0xFF -> 1, 0x00 -> 0

+    addv    $2, $0.16b                      // sum of the 16 flags = number of zero coefficients

+//  }

+.endm

+.macro NEWQUANT_COEF_EACH_16BITS    // quantize 8 coefs: q = ((ff + abs(coef)) * mf) >> 16; result = q where coef > 0, -q elsewhere

+//  {   //  input:  $0 = coef, $1 = ff (overwritten with the result), $2 = mf; working: $3, $4, $5 (all clobbered)

+    eor     $3.16b, $3.16b, $3.16b          // init 0 , and keep 0;

+    saba    $1.8h, $0.8h, $3.8h      // f + abs(coef - 0)

+    smull   $4.4s, $1.4h, $2.4h      // signed 32-bit products, low four lanes

+    smull2  $5.4s, $1.8h, $2.8h      // signed 32-bit products, high four lanes

+    shrn    $1.4h, $4.4s, #16        // q = product >> 16, narrowed back to 16 bits

+    shrn2   $1.8h, $5.4s, #16

+    cmgt    $4.8h, $0.8h, #0      // if true, location of coef == 11111111 ($4 is reused as the sign mask)

+    bif     $3.16b, $1.16b, $4.16b      // where coef <= 0 copy q into $3; elsewhere $3 keeps 0

+    shl     $3.8h, $3.8h, #1            // 2q for the lanes with coef <= 0

+    sub     $1.8h, $1.8h, $3.8h      // if x > 0, -= 0; else x-= 2x

+//  }

+.endm

+.macro NEWQUANT_COEF_EACH_16BITS_MAX    // same quantization as NEWQUANT_COEF_EACH_16BITS, additionally keeping the unsigned magnitudes

+//  {   //  input:  $0 = coef, $1 = ff (overwritten with the signed result), $2 = mf; working: $3, $4, $5; output: $6 = q before the sign is applied

+    eor     $3.16b, $3.16b, $3.16b          // init 0 , and keep 0;

+    saba    $1.8h, $0.8h, $3.8h      // f + abs(coef - 0)

+    smull   $4.4s, $1.4h, $2.4h      // signed 32-bit products, low four lanes

+    smull2  $5.4s, $1.8h, $2.8h      // signed 32-bit products, high four lanes

+    shrn    $1.4h, $4.4s, #16        // q = product >> 16, narrowed back to 16 bits

+    shrn2   $1.8h, $5.4s, #16

+    cmgt    $4.8h, $0.8h, #0      // if true, location of coef == 11111111 ($4 is reused as the sign mask)

+    bif     $3.16b, $1.16b, $4.16b      // where coef <= 0 copy q into $3; elsewhere $3 keeps 0

+    shl     $3.8h, $3.8h, #1            // 2q for the lanes with coef <= 0

+    mov.8h   $6, $1                     // save q (magnitude) for the caller before the sign is restored

+    sub     $1.8h, $1.8h, $3.8h      // if x > 0, -= 0; else x-= 2x

+//  }

+.endm

+.macro QUANT_DUALWORD_COEF_EACH_16BITS  // quantize the low 4 coefs: q = ((ff + abs(coef)) * mf) >> 16; result = q where coef > 0, -q elsewhere

+//  {   //  input:  $0 = coef, $1 = ff (overwritten with the result), $2 = mf, $3 = must be all-zero on entry (not cleared here; clobbered), $4 = working

+    saba    $1.8h, $0.8h, $3.8h      // f + abs(coef - 0)

+    smull   $4.4s, $1.4h, $2.4h      // only the low four lanes are multiplied

+    shrn    $1.4h, $4.4s, #16        // q = product >> 16

+    cmgt    $4.8h, $0.8h, #0      // if true, location of coef == 11111111

+    bif     $3.16b, $1.16b, $4.16b      // where coef <= 0 copy q into $3; elsewhere $3 keeps its entry value

+    shl     $3.8h, $3.8h, #1

+    sub     $1.8h, $1.8h, $3.8h      // if x > 0, -= 0; else x-= 2x

+//  }

+.endm

+.macro SELECT_MAX_IN_ABS_COEF

+//  {   //  input:  $0..$3 = four vectors of unsigned 16-bit values ($0 and $2 are clobbered); output: $4 = max over $0 and $1, $5 = max over $2 and $3 (scalar halfword registers)

+    umax    $0.8h, $0.8h, $1.8h      // lane-wise unsigned max of the first pair

+    umaxv   $4, $0.8h                // reduce across the 8 lanes

+    umax    $2.8h, $2.8h, $3.8h      // lane-wise unsigned max of the second pair

+    umaxv   $5, $2.8h

+//  }

+.endm

+.macro HDM_QUANT_2x2_TOTAL_16BITS

+//  {   //  One add/sub butterfly pass over four halfwords s0..s3. input: $0 = src (low 64 bits), output: $1 = [s0+s2, s0-s2, s1+s3, s1-s3], working: $2

+    sshr  $1.2d, $0.2d, #32        // move s2, s3 down into halfword lanes 0, 1

+    add   $2.4h, $0.4h, $1.4h      // [0] = rs[0] + rs[32];[1] = rs[16] + rs[48];

+    sub   $1.4h, $0.4h, $1.4h      // [0] = rs[0] - rs[32];[1] = rs[16] - rs[48];

+    zip1  $1.4h, $2.4h, $1.4h      // interleave sums and differences

+//  }

+.endm

+.macro DC_ZERO_COUNT_IN_DUALWORD

+//  {   //  Count the zero halfwords among the low four lanes. input:  $0 = coef (clobbered), $1 = scalar halfword destination (0..4), $2 = vector whose halfwords are all 0x0001

+    cmeq    $0.4h, $0.4h, #0         // 0xFFFF where coef == 0

+    and     $0.8b, $0.8b, $2.8b      // reduce each mask to 1 or 0

+    addv    $1, $0.4h                // sum of the four flags

+//  }

+.endm

+.macro IHDM_4x4_TOTAL_16BITS

+//  {   //  input/output: $0 = eight halfwords treated as two groups of four; working: $1, $2 (clobbered)

+//  For each group [a, b, c, d] the result is [a+b+c+d, a+b-c-d, a-b-c+d, a-b+c-d].

+    uzp2  $1.4s, $0.4s, $0.4s      // odd 32-bit words: [rs2 rs3 rs6 rs7 ...]

+    uzp1  $0.4s, $0.4s, $0.4s      // even 32-bit words: [rs0 rs1 rs4 rs5 ...]

+    add   $2.8h, $0.8h, $1.8h      // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];[2] = rs[4] + rs[6];[3] = rs[5] + rs[7];

+    sub   $1.8h, $0.8h, $1.8h      // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3];[2] = rs[4] - rs[6];[3] = rs[5] - rs[7];

+    zip1  $2.8h, $2.8h, $1.8h      // [0] = rs[0] + rs[2]; [1] = rs[0] - rs[2]; ... [2]; [3]

+    uzp2  $1.4s, $2.4s, $2.4s      // second-stage operands taken from the odd words

+    uzp1  $0.4s, $2.4s, $2.4s      // second-stage operands taken from the even words

+    add   $2.8h, $0.8h, $1.8h      // second stage: sums of the first-stage sum/difference pairs

+    sub   $1.8h, $0.8h, $1.8h      // second stage: differences of the first-stage sum/difference pairs

+    rev32 $1.4h, $1.4h             // swap the two halfwords inside each 32-bit word of the differences

+    zip1  $0.4s, $2.4s, $1.4s      // interleave sum words and (swapped) difference words into the result

+//  }

+.endm

+.macro MATRIX_TRANSFORM_EACH_16BITS_2x8_OUT2

+//  {   //  Transpose a 4x4 block of halfwords stored row-major across $0 and $1. input & output: $0 = [0 1 2 3]+[4 5 6 7], $1 = [8 9 10 11]+[12 13 14 15]; working: $2, $3

+    uzp1 $2.4s, $0.4s, $1.4s   //[0 1 4 5]+[8 9 12 13]

+    uzp2 $3.4s, $0.4s, $1.4s   //[2 3 6 7]+[10 11 14 15]

+    uzp1 $0.8h, $2.8h, $3.8h   //[0 4 8 12]+[2 6 10 14]

+    uzp2 $2.8h, $2.8h, $3.8h   //[1 5 9 13]+[3 7 11 15]

+    zip2 $1.2d, $0.2d, $2.2d   //[2 6 10 14]+[3 7 11 15]

+    zip1 $0.2d, $0.2d, $2.2d   //[0 4 8 12]+[1 5 9 13]

+//  }

+.endm

+.macro MATRIX_TRANSFORM_EACH_16BITS_OUT4

+//  {   //  Transpose 4x4 halfword blocks with two trn passes. input & output: $0..$3 (four rows), working: $4..$7 (clobbered)

+//  The sources are taken from the macro arguments rather than from fixed v0..v7,

+//  so the macro works for any register assignment; every call passing v0..v7 expands exactly as before.

+    trn1 $4.8h, $0.8h, $1.8h   // 16-bit pass: even lanes of rows 0/1

+    trn2 $5.8h, $0.8h, $1.8h   // 16-bit pass: odd lanes of rows 0/1

+    trn1 $6.8h, $2.8h, $3.8h   // 16-bit pass: even lanes of rows 2/3

+    trn2 $7.8h, $2.8h, $3.8h   // 16-bit pass: odd lanes of rows 2/3

+    trn1 $0.4s, $4.4s, $6.4s   // 32-bit pass completes the transpose

+    trn2 $2.4s, $4.4s, $6.4s

+    trn1 $1.4s, $5.4s, $7.4s

+    trn2 $3.4s, $5.4s, $7.4s

+//  }

+.endm

+.macro MATRIX_TRANSFORM_EACH_16BITS_4x4_OUT2

+//  {   //  Transpose a 4x4 halfword block given as four 64-bit rows. input: low halves of $0..$3 = [0 1 2 3],[4 5 6 7],[8 9 10 11],[12 13 14 15]; output: $0, $1 (two rows each); $2, $3 are clobbered

+    mov  $0.d[1], $1.d[0]  //[0 1 2 3]+[4 5 6 7]

+    mov  $2.d[1], $3.d[0]  //[8 9 10 11]+[12 13 14 15]

+    uzp1 $1.4s, $0.4s, $2.4s   //[0 1 4 5]+[8 9 12 13]

+    uzp2 $3.4s, $0.4s, $2.4s   //[2 3 6 7]+[10 11 14 15]

+    uzp1 $0.8h, $1.8h, $3.8h   //[0 4 8 12]+[2 6 10 14]

+    uzp2 $2.8h, $1.8h, $3.8h   //[1 5 9 13]+[3 7 11 15]

+    zip2 $1.2d, $0.2d, $2.2d   //[2 6 10 14]+[3 7 11 15]

+    zip1 $0.2d, $0.2d, $2.2d   //[0 4 8 12]+[1 5 9 13]

+//  }

+.endm

+.macro LOAD_4x4_DATA_FOR_DCT

+//  Load four 4-byte rows from each of two buffers. $0 = dst vector for buffer 1, $1 = dst vector for buffer 2,

+//  $2/$3 = pointer/stride of buffer 1, $4/$5 = pointer/stride of buffer 2. Each pointer is advanced by three strides.

+    ld1   {$0.s}[0], [$2], $3

+    ld1   {$0.s}[1], [$2], $3

+    ld1   {$0.s}[2], [$2], $3

+    ld1   {$0.s}[3], [$2]            // last row: no post-increment

+    ld1   {$1.s}[0], [$4], $5

+    ld1   {$1.s}[1], [$4], $5

+    ld1   {$1.s}[2], [$4], $5

+    ld1   {$1.s}[3], [$4]            // last row: no post-increment

+.endm

+.macro DCT_ROW_TRANSFORM_TOTAL_16BITS

+//  {   //  One 4-point forward transform pass, lane-wise on 16-bit values. input & output: $0..$3 (data[i], data[i1], data[i2], data[i3] -> dct[i], dct[i1], dct[i2], dct[i3]), working: $4..$7 (clobbered)

+    add     $4.8h, $0.8h, $3.8h   //int16 s[0] = data[i] + data[i3];

+    sub     $7.8h, $0.8h, $3.8h   //int16 s[3] = data[i] - data[i3];

+    add     $5.8h, $1.8h, $2.8h   //int16 s[1] = data[i1] + data[i2];

+    sub     $6.8h, $1.8h, $2.8h   //int16 s[2] = data[i1] - data[i2];

+    add     $0.8h, $4.8h, $5.8h   //int16 dct[i ] = s[0] + s[1];

+    sub     $2.8h, $4.8h, $5.8h   //int16 dct[i2] = s[0] - s[1];

+    shl     $1.8h, $7.8h, #1      // s[3] << 1

+    shl     $3.8h, $6.8h, #1      // s[2] << 1

+    add     $1.8h, $1.8h, $6.8h   //int16 dct[i1] = (s[3] << 1) + s[2];

+    sub     $3.8h, $7.8h, $3.8h   //int16 dct[i3] = s[3] - (s[2] << 1);

+//  }

+.endm

+.macro LOAD_8x4_DATA_FOR_DCT

+//  {   //  Load four 8-byte rows from each of two buffers. output: $0~$3 (rows from [$8]), $4~$7 (rows from [$9]); $8 = src1*, $9 = src2*

+//  The strides are NOT macro arguments: x2 (src1_stride) and x4 (src2_stride) are hard-coded and left untouched. Each pointer is advanced by four strides.

+    ld1   {$0.d}[0], [$8], x2

+    ld1   {$1.d}[0], [$8], x2

+    ld1   {$2.d}[0], [$8], x2

+    ld1   {$3.d}[0], [$8], x2

+    ld1   {$4.d}[0], [$9], x4

+    ld1   {$5.d}[0], [$9], x4

+    ld1   {$6.d}[0], [$9], x4

+    ld1   {$7.d}[0], [$9], x4

+//  }

+.endm

+.macro ROW_TRANSFORM_1_STEP_TOTAL_16BITS

+//  {   //  First half of a 4-point pass with half-weighted odd terms, 16-bit lanes. input: $0..$3 = src[0]..src[3] (unchanged), output: $4..$7 = e[0]..e[3]

+    add   $4.8h, $0.8h, $2.8h          //int16 e[i][0] = src[0] + src[2];

+    sub   $5.8h, $0.8h, $2.8h          //int16 e[i][1] = src[0] - src[2];

+    sshr  $6.8h, $1.8h, #1             // src[1] >> 1 (arithmetic)

+    sshr  $7.8h, $3.8h, #1             // src[3] >> 1 (arithmetic)

+    sub   $6.8h, $6.8h, $3.8h          //int16 e[i][2] = (src[1]>>1)-src[3];

+    add   $7.8h, $1.8h, $7.8h          //int16 e[i][3] = src[1] + (src[3]>>1);

+//  }

+.endm

+.macro TRANSFORM_TOTAL_16BITS   // both row & col transform used; second half of a 4-point pass on 16-bit lanes

+//  {   //  output: $0..$3 = f[0]..f[3], input: $4..$7 = e[0]..e[3] (unchanged)

+    add   $0.8h, $4.8h, $7.8h          //int16 f[i][0] = e[i][0] + e[i][3];

+    add   $1.8h, $5.8h, $6.8h          //int16 f[i][1] = e[i][1] + e[i][2];

+    sub   $2.8h, $5.8h, $6.8h          //int16 f[i][2] = e[i][1] - e[i][2];

+    sub   $3.8h, $4.8h, $7.8h          //int16 f[i][3] = e[i][0] - e[i][3];

+//  }

+.endm

+.macro ROW_TRANSFORM_0_STEP

+//  {   //  First half of an unweighted 4-point pass, widening 16-bit inputs to 32-bit results. input: $0..$3 = src[0]..src[3] (low four halfwords, unchanged), output: $4..$7 = e[0]..e[3] as int32

+    saddl   $4.4s, $0.4h, $2.4h          //int32 e[i][0] = src[0] + src[2];

+    ssubl   $5.4s, $0.4h, $2.4h          //int32 e[i][1] = src[0] - src[2];

+    ssubl   $6.4s, $1.4h, $3.4h          //int32 e[i][2] = src[1] - src[3];

+    saddl   $7.4s, $1.4h, $3.4h          //int32 e[i][3] = src[1] + src[3];

+//  }

+.endm

+.macro COL_TRANSFORM_0_STEP

+//  {   //  First half of an unweighted 4-point pass on 32-bit lanes (no shifts are applied). input: $0..$3 = f[0]..f[3] (unchanged), output: $4..$7 = e[0]..e[3]

+    add     $4.4s, $0.4s, $2.4s          //int32 e[0][j] = f[0][j] + f[2][j];

+    sub     $5.4s, $0.4s, $2.4s          //int32 e[1][j] = f[0][j] - f[2][j];

+    sub     $6.4s, $1.4s, $3.4s          //int32 e[2][j] = f[1][j] - f[3][j];

+    add     $7.4s, $1.4s, $3.4s          //int32 e[3][j] = f[1][j] + f[3][j];

+//  }

+.endm

+.macro TRANSFORM_4BYTES // both row & col transform used; second half of a 4-point pass on 32-bit lanes

+//  {   //  output: $0..$3 = f[0]..f[3], input: $4..$7 = e[0]..e[3] (unchanged)

+    add     $0.4s, $4.4s, $7.4s          //int32 f[i][0] = e[i][0] + e[i][3];

+    add     $1.4s, $5.4s, $6.4s          //int32 f[i][1] = e[i][1] + e[i][2];

+    sub     $2.4s, $5.4s, $6.4s          //int32 f[i][2] = e[i][1] - e[i][2];

+    sub     $3.4s, $4.4s, $7.4s          //int32 f[i][3] = e[i][0] - e[i][3];

+//  }

+.endm

+.macro MB_PRED_8BITS_ADD_DCT_16BITS_CLIP

+//  {   //  Add 16-bit residuals to 16 unsigned 8-bit predictions and saturate back to [0, 255]. input: $0 = 16 pred bytes (overwritten with the result), $1/$2 = residuals for the low/high 8 bytes, working: $3, $4

+    uxtl      $3.8h, $0.8b           // widen the low 8 pred bytes to 16 bits

+    uxtl2     $4.8h, $0.16b          // widen the high 8 pred bytes to 16 bits

+    add       $3.8h, $3.8h, $1.8h

+    add       $4.8h, $4.8h, $2.8h

+    sqxtun   $0.8b, $3.8h            // signed -> unsigned saturating narrow

+    sqxtun2  $0.16b,$4.8h

+//  }

+.endm

+#else

+.macro ZERO_COUNT_IN_2_QUARWORD arg0, arg1, arg2

+//  {   //  Count the zero-valued 16-bit coefficients in two vectors. input: arg0 = coef_0, arg1 = coef_1 (both clobbered); output: arg2 = scalar byte register receiving the count (0..16)

+cmeq    \arg0\().8h, \arg0\().8h, #0          // halfword = 0xFFFF where coef == 0, else 0

+cmeq    \arg1\().8h, \arg1\().8h, #0

+uzp1    \arg0\().16b, \arg0\().16b, \arg1\().16b          // keep the low byte of each of the 16 halfword masks

+ushr    \arg0\().16b, \arg0\().16b, 7          // 0xFF -> 1, 0x00 -> 0

+addv    \arg2\(), \arg0\().16b          // sum of the 16 flags = number of zero coefficients

+//  }

+.endm

+.macro NEWQUANT_COEF_EACH_16BITS arg0, arg1, arg2, arg3, arg4, arg5

+// quantize 8 coefs: q = ((ff + abs(coef)) * mf) >> 16; result = q where coef > 0, -q elsewhere

+//  {   //  input:  arg0 = coef, arg1 = ff (overwritten with the result), arg2 = mf; working: arg3, arg4, arg5 (all clobbered)

+eor     \arg3\().16b, \arg3\().16b, \arg3\().16b          // init 0 , and keep 0;

+saba    \arg1\().8h, \arg0\().8h, \arg3\().8h      // f + abs(coef - 0)

+smull   \arg4\().4s, \arg1\().4h, \arg2\().4h      // signed 32-bit products, low four lanes

+smull2  \arg5\().4s, \arg1\().8h, \arg2\().8h      // signed 32-bit products, high four lanes

+shrn    \arg1\().4h, \arg4\().4s, #16      // q = product >> 16, narrowed back to 16 bits

+shrn2   \arg1\().8h, \arg5\().4s, #16

+cmgt    \arg4\().8h, \arg0\().8h, #0      // if true, location of coef == 11111111 (arg4 is reused as the sign mask)

+bif     \arg3\().16b, \arg1\().16b, \arg4\().16b      // where coef <= 0 copy q into arg3; elsewhere arg3 keeps 0

+shl     \arg3\().8h, \arg3\().8h, #1      // 2q for the lanes with coef <= 0

+sub     \arg1\().8h, \arg1\().8h, \arg3\().8h      // if x > 0, -= 0; else x-= 2x

+//  }

+.endm

+.macro NEWQUANT_COEF_EACH_16BITS_MAX arg0, arg1, arg2, arg3, arg4, arg5, arg6

+// same quantization as NEWQUANT_COEF_EACH_16BITS, additionally keeping the unsigned magnitudes

+//  {   //  input:  arg0 = coef, arg1 = ff (overwritten with the signed result), arg2 = mf; working: arg3, arg4, arg5; output: arg6 = q before the sign is applied

+eor     \arg3\().16b, \arg3\().16b, \arg3\().16b          // init 0 , and keep 0;

+saba    \arg1\().8h, \arg0\().8h, \arg3\().8h      // f + abs(coef - 0)

+smull   \arg4\().4s, \arg1\().4h, \arg2\().4h      // signed 32-bit products, low four lanes

+smull2  \arg5\().4s, \arg1\().8h, \arg2\().8h      // signed 32-bit products, high four lanes

+shrn    \arg1\().4h, \arg4\().4s, #16      // q = product >> 16, narrowed back to 16 bits

+shrn2   \arg1\().8h, \arg5\().4s, #16

+cmgt    \arg4\().8h, \arg0\().8h, #0      // if true, location of coef == 11111111 (arg4 is reused as the sign mask)

+bif     \arg3\().16b, \arg1\().16b, \arg4\().16b      // where coef <= 0 copy q into arg3; elsewhere arg3 keeps 0

+shl     \arg3\().8h, \arg3\().8h, #1      // 2q for the lanes with coef <= 0

+mov     \arg6\().16b, \arg1\().16b      // save q (magnitude); standard full-register move instead of the Apple-only mov.8h spelling

+sub     \arg1\().8h, \arg1\().8h, \arg3\().8h      // if x > 0, -= 0; else x-= 2x

+//  }

+.endm

+.macro QUANT_DUALWORD_COEF_EACH_16BITS arg0, arg1, arg2, arg3, arg4

+// quantize the low 4 coefs: q = ((ff + abs(coef)) * mf) >> 16; result = q where coef > 0, -q elsewhere

+//  {   //  input:  arg0 = coef, arg1 = ff (overwritten with the result), arg2 = mf, arg3 = must be all-zero on entry (not cleared here; clobbered), arg4 = working

+saba    \arg1\().8h, \arg0\().8h, \arg3\().8h      // f + abs(coef - 0)

+smull   \arg4\().4s, \arg1\().4h, \arg2\().4h      // only the low four lanes are multiplied

+shrn    \arg1\().4h, \arg4\().4s, #16      // q = product >> 16

+cmgt    \arg4\().8h, \arg0\().8h, #0      // if true, location of coef == 11111111

+bif     \arg3\().16b, \arg1\().16b, \arg4\().16b      // where coef <= 0 copy q into arg3; elsewhere arg3 keeps its entry value

+shl     \arg3\().8h, \arg3\().8h, #1

+sub     \arg1\().8h, \arg1\().8h, \arg3\().8h      // if x > 0, -= 0; else x-= 2x

+//  }

+.endm

+.macro SELECT_MAX_IN_ABS_COEF arg0, arg1, arg2, arg3, arg4, arg5

+//  {   //  input:  arg0..arg3 = four vectors of unsigned 16-bit values (arg0 and arg2 are clobbered); output: arg4 = max over arg0 and arg1, arg5 = max over arg2 and arg3 (scalar halfword registers)

+umax    \arg0\().8h, \arg0\().8h, \arg1\().8h      // lane-wise unsigned max of the first pair

+umaxv   \arg4\(), \arg0\().8h      // reduce across the 8 lanes

+umax    \arg2\().8h, \arg2\().8h, \arg3\().8h      // lane-wise unsigned max of the second pair

+umaxv   \arg5\(), \arg2\().8h

+//  }

+.endm

+.macro HDM_QUANT_2x2_TOTAL_16BITS arg0, arg1, arg2

+//  {   //  One add/sub butterfly pass over four halfwords s0..s3. input: arg0 = src (low 64 bits), output: arg1 = [s0+s2, s0-s2, s1+s3, s1-s3], working: arg2

+sshr  \arg1\().2d, \arg0\().2d, #32      // move s2, s3 down into halfword lanes 0, 1

+add   \arg2\().4h, \arg0\().4h, \arg1\().4h      // [0] = rs[0] + rs[32];[1] = rs[16] + rs[48];

+sub   \arg1\().4h, \arg0\().4h, \arg1\().4h      // [0] = rs[0] - rs[32];[1] = rs[16] - rs[48];

+zip1  \arg1\().4h, \arg2\().4h, \arg1\().4h      // interleave sums and differences

+//  }

+.endm

+.macro DC_ZERO_COUNT_IN_DUALWORD arg0, arg1, arg2

+//  {   //  Count the zero halfwords among the low four lanes. input:  arg0 = coef (clobbered), arg1 = scalar halfword destination (0..4), arg2 = vector whose halfwords are all 0x0001

+cmeq    \arg0\().4h, \arg0\().4h, #0      // 0xFFFF where coef == 0

+and     \arg0\().8b, \arg0\().8b, \arg2\().8b      // reduce each mask to 1 or 0

+addv    \arg1\(), \arg0\().4h      // sum of the four flags

+//  }

+.endm

+.macro IHDM_4x4_TOTAL_16BITS arg0, arg1, arg2

+//  {   //  input/output: arg0 = eight halfwords treated as two groups of four; working: arg1, arg2 (clobbered)

+//  For each group [a, b, c, d] the result is [a+b+c+d, a+b-c-d, a-b-c+d, a-b+c-d].

+uzp2  \arg1\().4s, \arg0\().4s, \arg0\().4s      // odd 32-bit words: [rs2 rs3 rs6 rs7 ...]

+uzp1  \arg0\().4s, \arg0\().4s, \arg0\().4s      // even 32-bit words: [rs0 rs1 rs4 rs5 ...]

+add   \arg2\().8h, \arg0\().8h, \arg1\().8h      // [0] = rs[0] + rs[2];[1] = rs[1] + rs[3];[2] = rs[4] + rs[6];[3] = rs[5] + rs[7];

+sub   \arg1\().8h, \arg0\().8h, \arg1\().8h      // [0] = rs[0] - rs[2];[1] = rs[1] - rs[3];[2] = rs[4] - rs[6];[3] = rs[5] - rs[7];

+zip1  \arg2\().8h, \arg2\().8h, \arg1\().8h      // [0] = rs[0] + rs[2]; [1] = rs[0] - rs[2]; ... [2]; [3]

+uzp2  \arg1\().4s, \arg2\().4s, \arg2\().4s      // second-stage operands taken from the odd words

+uzp1  \arg0\().4s, \arg2\().4s, \arg2\().4s      // second-stage operands taken from the even words

+add   \arg2\().8h, \arg0\().8h, \arg1\().8h      // second stage: sums of the first-stage sum/difference pairs

+sub   \arg1\().8h, \arg0\().8h, \arg1\().8h      // second stage: differences of the first-stage sum/difference pairs

+rev32 \arg1\().4h, \arg1\().4h             // swap the two halfwords inside each 32-bit word of the differences

+zip1  \arg0\().4s, \arg2\().4s, \arg1\().4s      // interleave sum words and (swapped) difference words into the result

+//  }

+.endm

+.macro MATRIX_TRANSFORM_EACH_16BITS_2x8_OUT2 arg0, arg1, arg2, arg3

+//  {   //  Transpose a 4x4 block of halfwords stored row-major across arg0 and arg1. input & output: arg0 = [0 1 2 3]+[4 5 6 7], arg1 = [8 9 10 11]+[12 13 14 15]; working: arg2, arg3

+uzp1 \arg2\().4s, \arg0\().4s, \arg1\().4s   //[0 1 4 5]+[8 9 12 13]

+uzp2 \arg3\().4s, \arg0\().4s, \arg1\().4s   //[2 3 6 7]+[10 11 14 15]

+uzp1 \arg0\().8h, \arg2\().8h, \arg3\().8h   //[0 4 8 12]+[2 6 10 14]

+uzp2 \arg2\().8h, \arg2\().8h, \arg3\().8h   //[1 5 9 13]+[3 7 11 15]

+zip2 \arg1\().2d, \arg0\().2d, \arg2\().2d   //[2 6 10 14]+[3 7 11 15]

+zip1 \arg0\().2d, \arg0\().2d, \arg2\().2d   //[0 4 8 12]+[1 5 9 13]

+//  }

+.endm

+.macro MATRIX_TRANSFORM_EACH_16BITS_OUT4 arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7

+//  {   //  Transpose 4x4 halfword blocks with two trn passes. input & output: arg0..arg3 (four rows), working: arg4..arg7 (clobbered)

+//  The sources are taken from the macro arguments rather than from fixed v0..v7,

+//  so the macro works for any register assignment; every call passing v0..v7 expands exactly as before.

+trn1 \arg4\().8h, \arg0\().8h, \arg1\().8h   // 16-bit pass: even lanes of rows 0/1

+trn2 \arg5\().8h, \arg0\().8h, \arg1\().8h   // 16-bit pass: odd lanes of rows 0/1

+trn1 \arg6\().8h, \arg2\().8h, \arg3\().8h   // 16-bit pass: even lanes of rows 2/3

+trn2 \arg7\().8h, \arg2\().8h, \arg3\().8h   // 16-bit pass: odd lanes of rows 2/3

+trn1 \arg0\().4s, \arg4\().4s, \arg6\().4s   // 32-bit pass completes the transpose

+trn2 \arg2\().4s, \arg4\().4s, \arg6\().4s

+trn1 \arg1\().4s, \arg5\().4s, \arg7\().4s

+trn2 \arg3\().4s, \arg5\().4s, \arg7\().4s

+//  }

+.endm

+.macro MATRIX_TRANSFORM_EACH_16BITS_4x4_OUT2 arg0, arg1, arg2, arg3

+//  {   //  Transpose a 4x4 halfword block given as four 64-bit rows. input: low halves of arg0..arg3 = [0 1 2 3],[4 5 6 7],[8 9 10 11],[12 13 14 15]; output: arg0, arg1 (two rows each); arg2, arg3 are clobbered

+mov  \arg0\().d[1], \arg1\().d[0]  //[0 1 2 3]+[4 5 6 7]

+mov  \arg2\().d[1], \arg3\().d[0]  //[8 9 10 11]+[12 13 14 15]

+uzp1 \arg1\().4s, \arg0\().4s, \arg2\().4s   //[0 1 4 5]+[8 9 12 13]

+uzp2 \arg3\().4s, \arg0\().4s, \arg2\().4s   //[2 3 6 7]+[10 11 14 15]

+uzp1 \arg0\().8h, \arg1\().8h, \arg3\().8h   //[0 4 8 12]+[2 6 10 14]

+uzp2 \arg2\().8h, \arg1\().8h, \arg3\().8h   //[1 5 9 13]+[3 7 11 15]

+zip2 \arg1\().2d, \arg0\().2d, \arg2\().2d   //[2 6 10 14]+[3 7 11 15]

+zip1 \arg0\().2d, \arg0\().2d, \arg2\().2d   //[0 4 8 12]+[1 5 9 13]

+//  }

+.endm

+.macro LOAD_4x4_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5

+//  Load four 4-byte rows from each of two buffers. arg0 = dst vector for buffer 1, arg1 = dst vector for buffer 2,

+//  arg2/arg3 = pointer/stride of buffer 1, arg4/arg5 = pointer/stride of buffer 2. Each pointer is advanced by three strides.

+ld1   {\arg0\().s}[0], [\arg2\()], \arg3\()

+ld1   {\arg0\().s}[1], [\arg2\()], \arg3\()

+ld1   {\arg0\().s}[2], [\arg2\()], \arg3\()

+ld1   {\arg0\().s}[3], [\arg2\()]      // last row: no post-increment

+ld1   {\arg1\().s}[0], [\arg4\()], \arg5\()

+ld1   {\arg1\().s}[1], [\arg4\()], \arg5\()

+ld1   {\arg1\().s}[2], [\arg4\()], \arg5\()

+ld1   {\arg1\().s}[3], [\arg4\()]      // last row: no post-increment

+.endm

+.macro DCT_ROW_TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7

+//  {   //  One 4-point forward transform pass, lane-wise on 16-bit values. input & output: arg0..arg3 (data[i], data[i1], data[i2], data[i3] -> dct[i], dct[i1], dct[i2], dct[i3]), working: arg4..arg7 (clobbered)

+add     \arg4\().8h, \arg0\().8h, \arg3\().8h   //int16 s[0] = data[i] + data[i3];

+sub     \arg7\().8h, \arg0\().8h, \arg3\().8h   //int16 s[3] = data[i] - data[i3];

+add     \arg5\().8h, \arg1\().8h, \arg2\().8h   //int16 s[1] = data[i1] + data[i2];

+sub     \arg6\().8h, \arg1\().8h, \arg2\().8h   //int16 s[2] = data[i1] - data[i2];

+add     \arg0\().8h, \arg4\().8h, \arg5\().8h   //int16 dct[i ] = s[0] + s[1];

+sub     \arg2\().8h, \arg4\().8h, \arg5\().8h   //int16 dct[i2] = s[0] - s[1];

+shl     \arg1\().8h, \arg7\().8h, #1      // s[3] << 1

+shl     \arg3\().8h, \arg6\().8h, #1      // s[2] << 1

+add     \arg1\().8h, \arg1\().8h, \arg6\().8h   //int16 dct[i1] = (s[3] << 1) + s[2];

+sub     \arg3\().8h, \arg7\().8h, \arg3\().8h   //int16 dct[i3] = s[3] - (s[2] << 1);

+//  }

+.endm

+.macro LOAD_8x4_DATA_FOR_DCT arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9

+//  {   //  Load four 8-byte rows from each of two buffers. output: arg0~arg3 (rows from [arg8]), arg4~arg7 (rows from [arg9]); arg8 = src1*, arg9 = src2*

+//  The strides are NOT macro arguments: x2 (src1_stride) and x4 (src2_stride) are hard-coded and left untouched. Each pointer is advanced by four strides.

+ld1   {\arg0\().d}[0], [\arg8\()], x2

+ld1   {\arg1\().d}[0], [\arg8\()], x2

+ld1   {\arg2\().d}[0], [\arg8\()], x2

+ld1   {\arg3\().d}[0], [\arg8\()], x2

+ld1   {\arg4\().d}[0], [\arg9\()], x4

+ld1   {\arg5\().d}[0], [\arg9\()], x4

+ld1   {\arg6\().d}[0], [\arg9\()], x4

+ld1   {\arg7\().d}[0], [\arg9\()], x4

+//  }

+.endm

+.macro ROW_TRANSFORM_1_STEP_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7

+//  {   //  First half of a 4-point pass with half-weighted odd terms, 16-bit lanes. input: arg0..arg3 = src[0]..src[3] (unchanged), output: arg4..arg7 = e[0]..e[3]

+add   \arg4\().8h, \arg0\().8h, \arg2\().8h          //int16 e[i][0] = src[0] + src[2];

+sub   \arg5\().8h, \arg0\().8h, \arg2\().8h          //int16 e[i][1] = src[0] - src[2];

+sshr  \arg6\().8h, \arg1\().8h, #1          // src[1] >> 1 (arithmetic)

+sshr  \arg7\().8h, \arg3\().8h, #1          // src[3] >> 1 (arithmetic)

+sub   \arg6\().8h, \arg6\().8h, \arg3\().8h          //int16 e[i][2] = (src[1]>>1)-src[3];

+add   \arg7\().8h, \arg1\().8h, \arg7\().8h          //int16 e[i][3] = src[1] + (src[3]>>1);

+//  }

+.endm

+.macro TRANSFORM_TOTAL_16BITS arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7

+// both row & col transform used; second half of a 4-point pass on 16-bit lanes

+//  {   //  output: arg0..arg3 = f[0]..f[3], input: arg4..arg7 = e[0]..e[3] (unchanged)

+add   \arg0\().8h, \arg4\().8h, \arg7\().8h          //int16 f[i][0] = e[i][0] + e[i][3];

+add   \arg1\().8h, \arg5\().8h, \arg6\().8h          //int16 f[i][1] = e[i][1] + e[i][2];

+sub   \arg2\().8h, \arg5\().8h, \arg6\().8h          //int16 f[i][2] = e[i][1] - e[i][2];

+sub   \arg3\().8h, \arg4\().8h, \arg7\().8h          //int16 f[i][3] = e[i][0] - e[i][3];

+//  }

+.endm

+.macro ROW_TRANSFORM_0_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7

+//  {   //  First half of an unweighted 4-point pass, widening 16-bit inputs to 32-bit results. input: arg0..arg3 = src[0]..src[3] (low four halfwords, unchanged), output: arg4..arg7 = e[0]..e[3] as int32

+saddl   \arg4\().4s, \arg0\().4h, \arg2\().4h          //int32 e[i][0] = src[0] + src[2];

+ssubl   \arg5\().4s, \arg0\().4h, \arg2\().4h          //int32 e[i][1] = src[0] - src[2];

+ssubl   \arg6\().4s, \arg1\().4h, \arg3\().4h          //int32 e[i][2] = src[1] - src[3];

+saddl   \arg7\().4s, \arg1\().4h, \arg3\().4h          //int32 e[i][3] = src[1] + src[3];

+//  }

+.endm

+.macro COL_TRANSFORM_0_STEP arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7

+//  {   //  First half of an unweighted 4-point pass on 32-bit lanes (no shifts are applied). input: arg0..arg3 = f[0]..f[3] (unchanged), output: arg4..arg7 = e[0]..e[3]

+add     \arg4\().4s, \arg0\().4s, \arg2\().4s          //int32 e[0][j] = f[0][j] + f[2][j];

+sub     \arg5\().4s, \arg0\().4s, \arg2\().4s          //int32 e[1][j] = f[0][j] - f[2][j];

+sub     \arg6\().4s, \arg1\().4s, \arg3\().4s          //int32 e[2][j] = f[1][j] - f[3][j];

+add     \arg7\().4s, \arg1\().4s, \arg3\().4s          //int32 e[3][j] = f[1][j] + f[3][j];

+//  }

+.endm

+.macro TRANSFORM_4BYTES arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7

+// both row & col transform used; second half of a 4-point pass on 32-bit lanes

+//  {   //  output: arg0..arg3 = f[0]..f[3], input: arg4..arg7 = e[0]..e[3] (unchanged)

+add     \arg0\().4s, \arg4\().4s, \arg7\().4s          //int32 f[i][0] = e[i][0] + e[i][3];

+add     \arg1\().4s, \arg5\().4s, \arg6\().4s          //int32 f[i][1] = e[i][1] + e[i][2];

+sub     \arg2\().4s, \arg5\().4s, \arg6\().4s          //int32 f[i][2] = e[i][1] - e[i][2];

+sub     \arg3\().4s, \arg4\().4s, \arg7\().4s          //int32 f[i][3] = e[i][0] - e[i][3];

+//  }

+.endm

+.macro MB_PRED_8BITS_ADD_DCT_16BITS_CLIP arg0, arg1, arg2, arg3, arg4

+//  {   //  Add 16-bit residuals to 16 unsigned 8-bit predictions and saturate back to [0, 255]. input: arg0 = 16 pred bytes (overwritten with the result), arg1/arg2 = residuals for the low/high 8 bytes, working: arg3, arg4

+uxtl      \arg3\().8h, \arg0\().8b      // widen the low 8 pred bytes to 16 bits

+uxtl2     \arg4\().8h, \arg0\().16b      // widen the high 8 pred bytes to 16 bits

+add       \arg3\().8h, \arg3\().8h, \arg1\().8h

+add       \arg4\().8h, \arg4\().8h, \arg2\().8h

+sqxtun   \arg0\().8b, \arg3\().8h      // signed -> unsigned saturating narrow

+sqxtun2  \arg0\().16b,\arg4\().8h

+//  }

+.endm

+#endif

+WELS_ASM_AARCH64_FUNC_BEGIN WelsGetNoneZeroCount_AArch64_neon

+//  x0: pointer to 16 int16 coefficients (read only); returns in x0 the number of non-zero coefficients

+    mov     x1, #16                         // total number of coefficients

+    ld1     {v0.8h, v1.8h}, [x0]

+    ZERO_COUNT_IN_2_QUARWORD    v0, v1, b0  // b0 = number of zero coefficients

+    mov     x0, v0.d[0]

+    subs    x0, x1, x0                      // non-zero count = 16 - zero count

+WELS_ASM_AARCH64_FUNC_END

+WELS_ASM_AARCH64_FUNC_BEGIN WelsQuant4x4_AArch64_neon

+//  Quantize 16 int16 coefficients in place.

+//  x0: coefficients (read and written), x1: 8 halfwords of ff, x2: 8 halfwords of mf; the same ff/mf vector is used for both halves

+    ld1     {v2.8h}, [x1]

+    ld1     {v0.8h, v1.8h}, [x0]

+    ld1     {v3.8h}, [x2]

+    mov     v4.16b, v2.16b                  // second copy of ff (the macro overwrites its ff operand); portable spelling of the former mov.8h

+    NEWQUANT_COEF_EACH_16BITS   v0, v2, v3, v5, v6, v7

+    st1     {v2.8h}, [x0], #16

+    NEWQUANT_COEF_EACH_16BITS   v1, v4, v3, v5, v6, v7

+    st1     {v4.8h}, [x0], #16

+WELS_ASM_AARCH64_FUNC_END

+WELS_ASM_AARCH64_FUNC_BEGIN WelsQuant4x4Dc_AArch64_neon

+//  Quantize 16 int16 coefficients in place with one scalar ff and one scalar mf.

+//  x0: coefficients (read and written), w1: ff, w2: mf (both broadcast to every lane)

+    ld1     {v0.8h, v1.8h}, [x0]

+    dup     v2.8h, w1      // even ff range [0, 768]

+    dup     v3.8h, w2

+    mov     v4.16b, v2.16b                  // second copy of ff (the macro overwrites its ff operand); portable spelling of the former mov.8h

+    NEWQUANT_COEF_EACH_16BITS   v0, v2, v3, v5, v6, v7

+    st1     {v2.8h}, [x0], #16

+    NEWQUANT_COEF_EACH_16BITS   v1, v4, v3, v5, v6, v7

+    st1     {v4.8h}, [x0], #16

+WELS_ASM_AARCH64_FUNC_END

+WELS_ASM_AARCH64_FUNC_BEGIN WelsQuantFour4x4_AArch64_neon

+//  Quantize four consecutive groups of 16 int16 coefficients in place.

+//  x0: coefficients (64 halfwords, read and written), x1: 8 halfwords of ff, x2: 8 halfwords of mf

+    ld1     {v2.8h}, [x1]

+    ld1     {v3.8h}, [x2]

+    mov     x1, x0                          // x1 becomes the store cursor, x0 the load cursor

+.rept 4

+    ld1     {v0.8h, v1.8h}, [x0], #32

+    mov     v4.16b, v2.16b                  // fresh copy of ff (the macro overwrites it); portable spelling of the former mov.8h

+    NEWQUANT_COEF_EACH_16BITS   v0, v4, v3, v5, v6, v7

+    st1     {v4.8h}, [x1], #16

+    mov     v4.16b, v2.16b

+    NEWQUANT_COEF_EACH_16BITS   v1, v4, v3, v5, v6, v7

+    st1     {v4.8h}, [x1], #16

+.endr

+WELS_ASM_AARCH64_FUNC_END

+WELS_ASM_AARCH64_FUNC_BEGIN WelsQuantFour4x4Max_AArch64_neon

+//  Quantize four consecutive groups of 16 int16 coefficients in place and record, per group, the largest quantized magnitude.

+//  x0: coefficients (64 halfwords, read and written), x1: 8 halfwords of ff, x2: 8 halfwords of mf, x3: receives four halfwords (one maximum per group)

+    ld1     {v2.8h}, [x1]

+    ld1     {v3.8h}, [x2]

+    mov     x1, x0                          // x1 becomes the store cursor, x0 the load cursor

+    ld1     {v0.8h, v1.8h}, [x0], #32

+    mov     v4.16b, v2.16b                  // fresh copy of ff (the macro overwrites it); portable spelling of the former mov.8h

+    NEWQUANT_COEF_EACH_16BITS_MAX   v0, v4, v3, v5, v6, v7, v16

+    st1     {v4.8h}, [x1], #16

+    mov     v4.16b, v2.16b

+    NEWQUANT_COEF_EACH_16BITS_MAX   v1, v4, v3, v5, v6, v7, v17

+    st1     {v4.8h}, [x1], #16   // then 1st 16 elem in v16  & v17

+    ld1     {v0.8h, v1.8h}, [x0], #32

+    mov     v4.16b, v2.16b

+    NEWQUANT_COEF_EACH_16BITS_MAX   v0, v4, v3, v5, v6, v7, v18

+    st1     {v4.8h}, [x1], #16

+    mov     v4.16b, v2.16b

+    NEWQUANT_COEF_EACH_16BITS_MAX   v1, v4, v3, v5, v6, v7, v19

+    st1     {v4.8h}, [x1], #16   // then 2nd 16 elem in v18 & v19

+    SELECT_MAX_IN_ABS_COEF  v16, v17, v18, v19, h20, h21   // h20 = max of group 0, h21 = max of group 1

+    ld1     {v0.8h, v1.8h}, [x0], #32

+    mov     v4.16b, v2.16b

+    NEWQUANT_COEF_EACH_16BITS_MAX   v0, v4, v3, v5, v6, v7, v16

+    st1     {v4.8h}, [x1], #16

+    mov     v4.16b, v2.16b

+    NEWQUANT_COEF_EACH_16BITS_MAX   v1, v4, v3, v5, v6, v7, v17

+    st1     {v4.8h}, [x1], #16   // then 3rd 16 elem in v16  & v17

+    ld1     {v0.8h, v1.8h}, [x0], #32

+    mov     v4.16b, v2.16b

+    NEWQUANT_COEF_EACH_16BITS_MAX   v0, v4, v3, v5, v6, v7, v18

+    st1     {v4.8h}, [x1], #16

+    mov     v4.16b, v2.16b

+    NEWQUANT_COEF_EACH_16BITS_MAX   v1, v4, v3, v5, v6, v7, v19

+    st1     {v4.8h}, [x1], #16   // then 4th 16 elem in v18 & v19

+    SELECT_MAX_IN_ABS_COEF  v16, v17, v18, v19, h22, h23   // h22 = max of group 2, h23 = max of group 3

+    st4 {v20.h,v21.h,v22.h,v23.h}[0], [x3]                 // write the four maxima as consecutive halfwords

+WELS_ASM_AARCH64_FUNC_END

+WELS_ASM_AARCH64_FUNC_BEGIN WelsDequant4x4_AArch64_neon

+//  Multiply 16 int16 coefficients in place by an 8-halfword factor vector (applied to both halves).

+//  x0: coefficients (read and written), x1: 8 halfwords of multipliers

+    ld1    {v2.8h}, [x1]                    // multipliers

+    ld1    {v0.8h, v1.8h}, [x0]             // coefficients

+    mul    v4.8h, v1.8h, v2.8h              // upper 8 coefficients

+    mul    v3.8h, v0.8h, v2.8h              // lower 8 coefficients

+    st1    {v3.8h, v4.8h}, [x0]

+WELS_ASM_AARCH64_FUNC_END

+WELS_ASM_AARCH64_FUNC_BEGIN WelsDequantFour4x4_AArch64_neon

+//  Multiply four consecutive groups of 16 int16 coefficients in place by the same 8-halfword factor vector.

+//  x0: coefficients (64 halfwords, read and written), x1: 8 halfwords of multipliers

+    ld1    {v2.8h}, [x1]                    // multipliers, shared by every group

+    mov    x1, x0                           // x1 becomes the store cursor, x0 the load cursor

+.rept 4

+    ld1   {v0.8h,v1.8h}, [x0], #32

+    mul   v4.8h, v1.8h, v2.8h               // upper 8 coefficients of the group

+    mul   v3.8h, v0.8h, v2.8h               // lower 8 coefficients of the group

+    st1   {v3.8h,v4.8h}, [x1], #32

+.endr

+WELS_ASM_AARCH64_FUNC_END

+WELS_ASM_AARCH64_FUNC_BEGIN WelsHadamardQuant2x2SkipKernel_AArch64_neon

+//  x0: int16 buffer; the four values at halfword offsets 0, 16, 32 and 48 are read

+//  w1: threshold, broadcast to every halfword lane

+//  returns w0 != 0 when the absolute value of any of the four transformed values exceeds the threshold, else 0

+    dup   v4.8h, w1

+    mov   x1, #32                   // byte distance between the sampled halfwords

+    ld1   {v0.h}[0], [x0], x1       //rs[0]

+    ld1   {v0.h}[1], [x0], x1       //rs[16]

+    ld1   {v0.h}[2], [x0], x1       //rs[32]

+    ld1   {v0.h}[3], [x0], x1       //rs[48]

+    HDM_QUANT_2x2_TOTAL_16BITS  v0, v1, v2      // output v1

+    HDM_QUANT_2x2_TOTAL_16BITS  v1, v0, v2      // output v0

+    abs   v1.4h, v0.4h

+    cmhi  v0.4h, v1.4h, v4.4h         // abs(dct[i])>threshold; mask goes to v0, the register returned below (it used to land in v1, so the raw dct values were returned)

+    mov   w0, v0.s[0]

+    mov   w1, v0.s[1]

+    orr   w0, w0, w1                  // non-zero when any lane of the mask is set

+WELS_ASM_AARCH64_FUNC_END

+WELS_ASM_AARCH64_FUNC_BEGIN WelsHadamardQuant2x2_AArch64_neon

+//  x0: int16 buffer; the values at halfword offsets 0, 16, 32 and 48 are read and then overwritten with 0

+//  w1: ff, w2: mf (both broadcast), x3 and x4: each receives the four quantized halfwords

+//  returns in x0 the number of non-zero values among the four quantized results

+    dup   v1.8h, w1 //ff

+    dup   v2.8h, w2 //mf

+    eor   v3.16b, v3.16b, v3.16b    // zero: stored back to the source and required as zero by the quant macro

+    mov   x1, #32                   // byte distance between the sampled halfwords

+    mov   x2, x0                    // x2 = store cursor for clearing the source

+    ld1   {v0.h}[0], [x0], x1       //rs[0]

+    st1   {v3.h}[0], [x2], x1      //rs[00]=0

+    ld1   {v0.h}[1], [x0], x1       //rs[16]

+    st1   {v3.h}[1], [x2], x1      //rs[16]=0

+    ld1   {v0.h}[2], [x0], x1       //rs[32]

+    st1   {v3.h}[2], [x2], x1      //rs[32]=0

+    ld1   {v0.h}[3], [x0], x1       //rs[48]

+    st1   {v3.h}[3], [x2], x1      //rs[48]=0

+    HDM_QUANT_2x2_TOTAL_16BITS  v0, v4, v5      // output v4

+    HDM_QUANT_2x2_TOTAL_16BITS  v4, v0, v5      // output v0

+    QUANT_DUALWORD_COEF_EACH_16BITS v0, v1, v2, v3, v4

+    st1    {v1.d}[0], [x3]        // store to dct

+    st1    {v1.d}[0], [x4]        // store to block

+    movi v3.8h, #1, lsl #0        // 0x0001 in every halfword, the mask expected by the zero-count macro

+    movi v0.16b, #255             // v0 is rewritten by the addv inside the macro below

+    DC_ZERO_COUNT_IN_DUALWORD   v1, h0, v3      // h0 = number of zero values (0..4)

+    mov     x0, v0.d[0]

+    mov     x1, #4                // only four values are quantized here (was #16, which made the result 12..16)

+    subs    x0, x1, x0            // non-zero count = 4 - zero count

+WELS_ASM_AARCH64_FUNC_END

+WELS_ASM_AARCH64_FUNC_BEGIN WelsDequantIHadamard4x4_AArch64_neon

+//  x0: 16 int16 values, transformed and scaled in place; w1: scale factor, broadcast to every lane

+//  Sequence: butterfly pass on each group of four, transpose, second butterfly pass with multiplication by the factor, transpose back.

+    ld1    {v0.8h, v1.8h}, [x0]

+    dup    v4.8h, w1

+    IHDM_4x4_TOTAL_16BITS   v0, v2, v3

+    IHDM_4x4_TOTAL_16BITS   v1, v2, v3

+    MATRIX_TRANSFORM_EACH_16BITS_2x8_OUT2    v0, v1, v2, v3

+    IHDM_4x4_TOTAL_16BITS   v0, v2, v3

+    mul   v0.8h, v0.8h, v4.8h     // scale the first two transformed groups

+    IHDM_4x4_TOTAL_16BITS   v1, v2, v3

+    mul   v1.8h, v1.8h, v4.8h     // scale the last two transformed groups

+    MATRIX_TRANSFORM_EACH_16BITS_2x8_OUT2    v0, v1, v2, v3

+    st1    {v0.16b, v1.16b}, [x0]

+WELS_ASM_AARCH64_FUNC_END

+WELS_ASM_AARCH64_FUNC_BEGIN WelsDctT4_AArch64_neon

+//  Forward 4x4 transform of the byte-wise difference between two 4x4 pixel blocks.

+//  x0: output, 16 int16 values; x1/x2: pointer/stride of the first block; x3/x4: pointer/stride of the second block

+//  NOTE(review): x2 and x4 are used as full 64-bit strides; if the C prototype passes 32-bit ints they need sign extension - confirm against the declaration

+    LOAD_4x4_DATA_FOR_DCT   v0, v1, x1, x2, x3, x4

+    usubl  v2.8h, v0.8b, v1.8b       // differences of rows 0 and 1, widened to 16 bits

+    usubl2 v4.8h, v0.16b, v1.16b     // differences of rows 2 and 3, widened to 16 bits

+    uzp1  v3.8h, v2.8h, v4.8h        // even-indexed differences

+    uzp2  v5.8h, v2.8h, v4.8h        // odd-indexed differences

+    uzp2  v2.8h, v3.8h, v5.8h // s[2, 6, 10, 14] [3, 7, 11, 15]

+    uzp1  v0.8h, v3.8h, v5.8h // s[0, 4, 8, 12] [1, 5, 9, 13]

+    mov    v3.d[0], v2.d[1]   // s[3, 7, 11, 15]

+    mov    v1.d[0], v0.d[1]   // s[1, 5, 9, 13]

+    // horizontal transform

+    DCT_ROW_TRANSFORM_TOTAL_16BITS          v0, v1, v2, v3, v4, v5, v6, v7

+    // transform element

+    MATRIX_TRANSFORM_EACH_16BITS_OUT4   v0, v1, v2, v3, v4, v5, v6, v7

+    // vertical transform

+    DCT_ROW_TRANSFORM_TOTAL_16BITS          v0, v1, v2, v3, v4, v5, v6, v7

+    st4       {v0.d, v1.d, v2.d, v3.d}[0], [x0]   // write the low 64 bits of v0..v3 back to back (32 bytes)

+WELS_ASM_AARCH64_FUNC_END

+// void WelsDctFourT4_AArch64_neon (int16_t* pDct, uint8_t* pPixel1, int32_t iStride1,
+//                                  uint8_t* pPixel2, int32_t iStride2);
+// In:  x0 = pDct, x1 = pPixel1, w2 = iStride1, x3 = pPixel2, w4 = iStride2
+// Out: 2 iterations x 64 bytes = 64 int16_t values written to pDct
+// Each .rept iteration processes one 8x4 strip of the pixel difference.
+WELS_ASM_AARCH64_FUNC_BEGIN WelsDctFourT4_AArch64_neon
+    // The strides are 32-bit ints and AAPCS64 leaves bits [63:32] of x2/x4 unspecified.
+    // LOAD_8x4_DATA_FOR_DCT is only handed the two pixel pointers, so the row strides
+    // are presumably read from x2/x4 inside the macro (body not visible here - confirm);
+    // sign-extend them once up front so a 64-bit use is well defined.
+    sxtw     x2, w2
+    sxtw     x4, w4
+.rept 2
+    LOAD_8x4_DATA_FOR_DCT   v0, v1, v2, v3, v4, v5, v6, v7, x1, x3
+    // widen to 16 bits while forming the four difference rows
+    usubl    v0.8h, v0.8b, v4.8b
+    usubl    v1.8h, v1.8b, v5.8b
+    usubl    v2.8h, v2.8b, v6.8b
+    usubl    v3.8h, v3.8b, v7.8b
+    MATRIX_TRANSFORM_EACH_16BITS_OUT4   v0, v1, v2, v3, v4, v5, v6, v7
+    // horizontal transform
+    DCT_ROW_TRANSFORM_TOTAL_16BITS      v0, v1, v2, v3, v4, v5, v6, v7
+    // transform element
+    MATRIX_TRANSFORM_EACH_16BITS_OUT4   v0, v1, v2, v3, v4, v5, v6, v7
+    //  vertical transform
+    DCT_ROW_TRANSFORM_TOTAL_16BITS      v0, v1, v2, v3, v4, v5, v6, v7
+    // gather the low 64-bit halves (v4, v5) and the high halves (v6, v7)
+    uzp1    v4.2d, v0.2d, v1.2d
+    uzp2    v6.2d, v0.2d, v1.2d
+    uzp1    v5.2d, v2.2d, v3.2d
+    uzp2    v7.2d, v2.2d, v3.2d
+    st1     {v4.16b, v5.16b}, [x0], #32
+    st1     {v6.16b, v7.16b}, [x0], #32
+.endr
+WELS_ASM_AARCH64_FUNC_END

+// void WelsIDctT4Rec_AArch64_neon (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction,
+//                                  int32_t iPredStride, int16_t* pDct);
+// In:  x0 = pRec, w1 = iStride, x2 = pPrediction, w3 = iPredStride, x4 = pDct
+// Out: four rows of 4 bytes written to pRec, iStride bytes apart
+// Inverse-transforms one 4x4 coefficient block, adds the 4x4 prediction and stores
+// the sum saturated to unsigned 8 bits.
+WELS_ASM_AARCH64_FUNC_BEGIN WelsIDctT4Rec_AArch64_neon
+    // The strides are 32-bit ints: AAPCS64 leaves bits [63:32] of x1/x3 unspecified,
+    // so sign-extend them before they serve as 64-bit post-index offsets.
+    sxtw    x1, w1
+    sxtw    x3, w3
+    ld1     {v16.s}[0], [x2], x3
+    ld1     {v16.s}[1], [x2], x3
+    ld1     {v16.s}[2], [x2], x3
+    ld1     {v16.s}[3], [x2], x3                   // Pred
+    ld4     {v0.4h, v1.4h, v2.4h, v3.4h}, [x4]      // dct coeff
+    ROW_TRANSFORM_1_STEP_TOTAL_16BITS   v0, v1, v2, v3, v4, v5, v6, v7
+    TRANSFORM_TOTAL_16BITS              v0, v1, v2, v3, v4, v5, v6, v7
+    MATRIX_TRANSFORM_EACH_16BITS_OUT4   v0, v1, v2, v3, v4, v5, v6, v7
+    ROW_TRANSFORM_1_STEP_TOTAL_16BITS   v0, v1, v2, v3, v4, v5, v6, v7
+    TRANSFORM_TOTAL_16BITS              v0, v1, v2, v3, v4, v5, v6, v7
+    // pack rows 0/1 into v0 and rows 2/3 into v2
+    ins     v0.d[1], v1.d[0]
+    ins     v2.d[1], v3.d[0]
+    srshr   v0.8h, v0.8h, #6
+    srshr   v2.8h, v2.8h, #6
+    //after rounding 6, clip into [0, 255]
+    uxtl    v1.8h, v16.8b                          // prediction rows 0/1 widened to 16 bits
+    add     v0.8h, v0.8h, v1.8h
+    sqxtun  v1.8b, v0.8h
+    st1     {v1.s}[0],[x0],x1
+    st1     {v1.s}[1],[x0],x1
+    uxtl2   v1.8h, v16.16b                         // prediction rows 2/3 widened to 16 bits
+    add     v2.8h, v2.8h, v1.8h
+    sqxtun  v1.8b, v2.8h
+    st1     {v1.s}[0],[x0],x1
+    st1     {v1.s}[1],[x0],x1
+WELS_ASM_AARCH64_FUNC_END

+// void WelsIDctFourT4Rec_AArch64_neon (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction,
+//                                      int32_t iPredStride, int16_t* pDct);
+// In:  x0 = pRec, w1 = iStride, x2 = pPrediction, w3 = iPredStride, x4 = pDct
+// Out: 2 iterations x 4 rows of 8 bytes written to pRec, iStride bytes apart
+// Each .rept iteration consumes 64 bytes of coefficients and four 8-byte prediction
+// rows, and stores the sum saturated to unsigned 8 bits.
+WELS_ASM_AARCH64_FUNC_BEGIN WelsIDctFourT4Rec_AArch64_neon
+    // The strides are 32-bit ints: AAPCS64 leaves bits [63:32] of x1/x3 unspecified,
+    // so sign-extend them before they serve as 64-bit post-index offsets.
+    sxtw    x1, w1
+    sxtw    x3, w3
+.rept 2
+    ld1     {v16.d}[0], [x2], x3
+    ld1     {v16.d}[1], [x2], x3
+    ld1     {v17.d}[0], [x2], x3
+    ld1     {v17.d}[1], [x2], x3                   // Pred
+    ld4     {v0.8h, v1.8h, v2.8h, v3.8h}, [x4], #64     // dct coeff
+    ROW_TRANSFORM_1_STEP_TOTAL_16BITS   v0, v1, v2, v3, v4, v5, v6, v7
+    TRANSFORM_TOTAL_16BITS    v0, v1, v2, v3, v4, v5, v6, v7
+    MATRIX_TRANSFORM_EACH_16BITS_OUT4    v0, v1, v2, v3, v4, v5, v6, v7
+    ROW_TRANSFORM_1_STEP_TOTAL_16BITS   v0, v1, v2, v3, v4, v5, v6, v7
+    TRANSFORM_TOTAL_16BITS    v0, v1, v2, v3, v4, v5, v6, v7
+    srshr   v0.8h, v0.8h, #6
+    srshr   v1.8h, v1.8h, #6
+    srshr   v2.8h, v2.8h, #6
+    srshr   v3.8h, v3.8h, #6
+    //after rounding 6, clip into [0, 255]
+    uxtl    v4.8h, v16.8b                          // prediction row 0
+    add     v0.8h, v0.8h, v4.8h
+    sqxtun  v0.8b, v0.8h
+    st1     {v0.d}[0],[x0],x1
+    uxtl2   v5.8h, v16.16b                         // prediction row 1
+    add     v1.8h, v1.8h, v5.8h
+    sqxtun  v1.8b, v1.8h
+    st1     {v1.d}[0],[x0],x1
+    uxtl    v6.8h, v17.8b                          // prediction row 2
+    add     v2.8h, v2.8h, v6.8h
+    sqxtun  v2.8b, v2.8h
+    st1     {v2.d}[0],[x0],x1
+    uxtl2   v7.8h, v17.16b                         // prediction row 3
+    add     v3.8h, v3.8h, v7.8h
+    sqxtun  v3.8b, v3.8h
+    st1     {v3.d}[0],[x0],x1
+.endr
+WELS_ASM_AARCH64_FUNC_END

+WELS_ASM_AARCH64_FUNC_BEGIN WelsHadamardT4Dc_AArch64_neon // x0 = pLumaDc (16 x int16_t out), x1 = pDct (in)

+    mov     x2, #32 // byte step between loaded values: 32 bytes = 16 int16_t

+    ld1     {v0.h}[0], [x1], x2 // 16 single-halfword loads follow, one every 32 bytes

+    ld1     {v1.h}[0], [x1], x2

+    ld1     {v0.h}[1], [x1], x2

+    ld1     {v1.h}[1], [x1], x2

+    ld1     {v2.h}[0], [x1], x2

+    ld1     {v3.h}[0], [x1], x2

+    ld1     {v2.h}[1], [x1], x2

+    ld1     {v3.h}[1], [x1], x2

+    ld1     {v0.h}[2], [x1], x2

+    ld1     {v1.h}[2], [x1], x2

+    ld1     {v0.h}[3], [x1], x2

+    ld1     {v1.h}[3], [x1], x2

+    ld1     {v2.h}[2], [x1], x2

+    ld1     {v3.h}[2], [x1], x2

+    ld1     {v2.h}[3], [x1], x2

+    ld1     {v3.h}[3], [x1], x2 // v0[0 4 08 12],v1[1 5 09 13],v2[2 6 10 14],v3[3 7 11 15]

+    ROW_TRANSFORM_0_STEP    v0, v1, v3, v2, v4, v7, v6, v5 // note the v3/v2 operand order; macro body not shown here

+    TRANSFORM_4BYTES        v0, v1, v3, v2, v4, v7, v6, v5

+    // transform element 32bits

+    uzp1    v4.4s, v0.4s, v1.4s // 0 2 4 6

+    uzp2    v5.4s, v0.4s, v1.4s // 1 3 5 7

+    uzp1    v6.4s, v2.4s, v3.4s // 8 10 12 14

+    uzp2    v7.4s, v2.4s, v3.4s // 9 11 13 15

+    uzp1    v0.4s, v4.4s, v6.4s // 0 4  8 12

+    uzp2    v2.4s, v4.4s, v6.4s // 2 6 10 14

+    uzp1    v1.4s, v5.4s, v7.4s // 1 5  9 13

+    uzp2    v3.4s, v5.4s, v7.4s // 3 7 11 15

+    COL_TRANSFORM_0_STEP    v0, v1, v3, v2, v4, v7, v6, v5

+    TRANSFORM_4BYTES        v0, v1, v3, v2, v4, v7, v6, v5

+    sqrshrn   v4.4h, v0.4s, #1 // rounding shift right by 1, saturating narrow 32 -> 16 bits

+    sqrshrn2  v4.8h, v1.4s, #1

+    sqrshrn   v5.4h, v2.4s, #1

+    sqrshrn2  v5.8h, v3.4s, #1

+    st1       {v4.16b, v5.16b}, [x0]  //store

+WELS_ASM_AARCH64_FUNC_END

+// void WelsIDctRecI16x16Dc_AArch64_neon (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction,
+//                                        int32_t iPredStride, int16_t* pDctDc);
+// In:  x0 = pRec, w1 = iStride, x2 = pPrediction, w3 = iPredStride, x4 = pDctDc
+// Out: 16 rows of 16 bytes written to pRec, iStride bytes apart
+// The 16 values at pDctDc are rounded (>> 6) and, four per group of four rows, each
+// replicated over four lanes before being combined with the prediction row by
+// MB_PRED_8BITS_ADD_DCT_16BITS_CLIP (macro body not visible here).
+WELS_ASM_AARCH64_FUNC_BEGIN WelsIDctRecI16x16Dc_AArch64_neon
+    // The strides are 32-bit ints: AAPCS64 leaves bits [63:32] of x1/x3 unspecified,
+    // so sign-extend them before they serve as 64-bit post-index offsets.
+    sxtw      x1, w1
+    sxtw      x3, w3
+    ld1       {v16.16b,v17.16b}, [x4]
+    srshr     v16.8h, v16.8h, #6
+    srshr     v17.8h, v17.8h, #6
+    // rows 0-3: v0 = value 0 x4 | value 1 x4, v1 = value 2 x4 | value 3 x4
+    dup       v0.8h, v16.h[0]
+    dup       v1.8h, v16.h[1]
+    ins       v0.d[1], v1.d[0]
+    dup       v1.8h, v16.h[2]
+    dup       v2.8h, v16.h[3]
+    ins       v1.d[1], v2.d[0]
+.rept 4
+    ld1       {v3.16b}, [x2], x3
+    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   v3, v0, v1, v4, v5
+    st1       {v3.16b}, [x0], x1
+.endr
+    // rows 4-7: values 4..7
+    dup       v0.8h, v16.h[4]
+    dup       v1.8h, v16.h[5]
+    ins       v0.d[1], v1.d[0]
+    dup       v1.8h, v16.h[6]
+    dup       v2.8h, v16.h[7]
+    ins       v1.d[1], v2.d[0]
+.rept 4
+    ld1       {v3.16b}, [x2], x3
+    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   v3, v0, v1, v4, v5
+    st1       {v3.16b}, [x0], x1
+.endr
+    // rows 8-11: values 8..11
+    dup       v0.8h, v17.h[0]
+    dup       v1.8h, v17.h[1]
+    ins       v0.d[1], v1.d[0]
+    dup       v1.8h, v17.h[2]
+    dup       v2.8h, v17.h[3]
+    ins       v1.d[1], v2.d[0]
+.rept 4
+    ld1       {v3.16b}, [x2], x3
+    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   v3, v0, v1, v4, v5
+    st1       {v3.16b}, [x0], x1
+.endr
+    // rows 12-15: values 12..15
+    dup       v0.8h, v17.h[4]
+    dup       v1.8h, v17.h[5]
+    ins       v0.d[1], v1.d[0]
+    dup       v1.8h, v17.h[6]
+    dup       v2.8h, v17.h[7]
+    ins       v1.d[1], v2.d[0]
+.rept 4
+    ld1       {v3.16b}, [x2], x3
+    MB_PRED_8BITS_ADD_DCT_16BITS_CLIP   v3, v0, v1, v4, v5
+    st1       {v3.16b}, [x0], x1
+.endr
+WELS_ASM_AARCH64_FUNC_END

+#endif

\ No newline at end of file

--- a/codec/encoder/core/inc/decode_mb_aux.h

+++ b/codec/encoder/core/inc/decode_mb_aux.h

@@ -81,6 +81,17 @@

                                int16_t* pDctDc);

 #endif

+#ifdef	HAVE_NEON_AARCH64

+void WelsDequantFour4x4_AArch64_neon (int16_t* pDct, const uint16_t* kpMF);

+void WelsDequant4x4_AArch64_neon (int16_t* pDct, const uint16_t* kpMF);

+void WelsDequantIHadamard4x4_AArch64_neon (int16_t* pRes, const uint16_t kuiMF);

+void WelsIDctT4Rec_AArch64_neon (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDct);

+void WelsIDctFourT4Rec_AArch64_neon (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride, int16_t* pDct);

+void WelsIDctRecI16x16Dc_AArch64_neon (uint8_t* pRec, int32_t iStride, uint8_t* pPrediction, int32_t iPredStride,

+                                 int16_t* pDctDc);

+#endif

 #if defined(__cplusplus)

 #endif//__cplusplus

--- a/codec/encoder/core/inc/encode_mb_aux.h

+++ b/codec/encoder/core/inc/encode_mb_aux.h

@@ -122,6 +122,22 @@

 void WelsQuantFour4x4Max_neon (int16_t* pDct, const int16_t* pFF, const int16_t* pMF, int16_t* pMax);

 #endif

+#ifdef	HAVE_NEON_AARCH64

+void WelsHadamardT4Dc_AArch64_neon (int16_t* pLumaDc, int16_t* pDct);

+int32_t WelsHadamardQuant2x2_AArch64_neon (int16_t* pRes, const int16_t kiFF, int16_t iMF, int16_t* pDct, int16_t* pBlock);

+int32_t WelsHadamardQuant2x2Skip_AArch64_neon (int16_t* pRes, int16_t iFF,  int16_t iMF);

+int32_t WelsHadamardQuant2x2SkipKernel_AArch64_neon (int16_t* pRes, int16_t iThreshold); // avoid divide operator

+void WelsDctT4_AArch64_neon (int16_t* pDct,  uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2);

+void WelsDctFourT4_AArch64_neon (int16_t* pDct,  uint8_t* pPixel1, int32_t iStride1, uint8_t* pPixel2, int32_t iStride2);

+int32_t WelsGetNoneZeroCount_AArch64_neon (int16_t* pLevel);

+void WelsQuant4x4_AArch64_neon (int16_t* pDct, const int16_t* pFF, const int16_t* pMF);

+void WelsQuant4x4Dc_AArch64_neon (int16_t* pDct, int16_t iFF, int16_t iMF);

+void WelsQuantFour4x4_AArch64_neon (int16_t* pDct, const int16_t* pFF, const int16_t* pMF);

+void WelsQuantFour4x4Max_AArch64_neon (int16_t* pDct, const int16_t* pFF, const int16_t* pMF, int16_t* pMax);

+#endif

 #if defined(__cplusplus)

 #endif//__cplusplus

--- a/codec/encoder/core/src/decode_mb_aux.cpp

+++ b/codec/encoder/core/src/decode_mb_aux.cpp

@@ -282,5 +282,17 @@

     pFuncList->pfIDctI16x16Dc = WelsIDctRecI16x16Dc_neon;

 #endif

+#if defined(HAVE_NEON_AARCH64)

+  if (uiCpuFlag & WELS_CPU_NEON) {

+    pFuncList->pfDequantization4x4			= WelsDequant4x4_AArch64_neon;

+    pFuncList->pfDequantizationFour4x4		= WelsDequantFour4x4_AArch64_neon;

+    pFuncList->pfDequantizationIHadamard4x4	= WelsDequantIHadamard4x4_AArch64_neon;

+    pFuncList->pfIDctFourT4		= WelsIDctFourT4Rec_AArch64_neon;

+    pFuncList->pfIDctT4		= WelsIDctT4Rec_AArch64_neon;

+    pFuncList->pfIDctI16x16Dc = WelsIDctRecI16x16Dc_AArch64_neon;

+  }

+#endif

--- a/codec/encoder/core/src/encode_mb_aux.cpp

+++ b/codec/encoder/core/src/encode_mb_aux.cpp

@@ -455,7 +455,12 @@

   return WelsHadamardQuant2x2SkipKernel_neon (pRes, iThreshold);

 #endif

+#ifdef	HAVE_NEON_AARCH64
+// Folds iFF and iMF into a single threshold up front, then hands the comparison
+// work to the NEON kernel, whose result is returned unchanged.
+int32_t WelsHadamardQuant2x2Skip_AArch64_neon (int16_t* pRes, int16_t iFF,  int16_t iMF) {
+  const int32_t kiMaxUint16 = (1 << 16) - 1;
+  const int16_t kiThreshold = kiMaxUint16 / iMF - iFF;
+  return WelsHadamardQuant2x2SkipKernel_AArch64_neon (pRes, kiThreshold);
+}
+#endif

 void WelsInitEncodingFuncs (SWelsFuncPtrList* pFuncList, uint32_t  uiCpuFlag) {

   pFuncList->pfCopy8x8Aligned			= WelsCopy8x8_c;

   pFuncList->pfCopy16x16Aligned		=

@@ -540,6 +545,29 @@

     pFuncList->pfCopy16x16NotAligned	= WelsCopy16x16NotAligned_neon;

     pFuncList->pfCopy16x8NotAligned		= WelsCopy16x8NotAligned_neon;

     pFuncList->pfDctFourT4				= WelsDctFourT4_neon;

+  }

+#endif

+#if defined(HAVE_NEON_AARCH64)

+  if (uiCpuFlag & WELS_CPU_NEON) {

+    pFuncList->pfQuantizationHadamard2x2		= WelsHadamardQuant2x2_AArch64_neon;

+    pFuncList->pfQuantizationHadamard2x2Skip	= WelsHadamardQuant2x2Skip_AArch64_neon;

+    pFuncList->pfDctT4					= WelsDctT4_AArch64_neon;

+    //pFuncList->pfCopy8x8Aligned			= WelsCopy8x8_AArch64_neon; // will enable in next update

+    //pFuncList->pfCopy8x16Aligned		= WelsCopy8x16_AArch64_neon; // will enable in next update

+    pFuncList->pfGetNoneZeroCount		= WelsGetNoneZeroCount_AArch64_neon;

+    pFuncList->pfTransformHadamard4x4Dc	= WelsHadamardT4Dc_AArch64_neon;

+    pFuncList->pfQuantization4x4		= WelsQuant4x4_AArch64_neon;

+    pFuncList->pfQuantizationDc4x4		= WelsQuant4x4Dc_AArch64_neon;

+    pFuncList->pfQuantizationFour4x4	= WelsQuantFour4x4_AArch64_neon;

+    pFuncList->pfQuantizationFour4x4Max	= WelsQuantFour4x4Max_AArch64_neon;

+    //pFuncList->pfCopy16x16Aligned		= WelsCopy16x16_AArch64_neon; // will enable in next update

+    //pFuncList->pfCopy16x16NotAligned	= WelsCopy16x16NotAligned_AArch64_neon; // will enable in next update

+    //pFuncList->pfCopy16x8NotAligned		= WelsCopy16x8NotAligned_AArch64_neon; // will enable in next update

+    pFuncList->pfDctFourT4				= WelsDctFourT4_AArch64_neon;

 #endif

--- a/codec/encoder/targets.mk

+++ b/codec/encoder/targets.mk

@@ -62,6 +62,7 @@

 	$(ENCODER_SRCDIR)/core/arm64/intra_pred_aarch64_neon.S\

 	$(ENCODER_SRCDIR)/core/arm64/intra_pred_sad_3_opt_aarch64_neon.S\

 	$(ENCODER_SRCDIR)/core/arm64/pixel_aarch64_neon.S\

+        $(ENCODER_SRCDIR)/core/arm64/reconstruct_aarch64_neon.S\

 ENCODER_OBJS += $(ENCODER_ASM_ARM64_SRCS:.S=.$(OBJ))

 endif

--- /dev/null

+++ b/test/encoder/EncUT_Reconstruct.cpp

@@ -1,0 +1,492 @@

+#include<gtest/gtest.h>
+#include<math.h>
+#include<stdlib.h>
+#include<string.h>
+#include<time.h>
+#include "cpu_core.h"
+#include "cpu.h"
+#include "macros.h"
+#include "encode_mb_aux.h"
+#include "decode_mb_aux.h"
+#include "wels_func_ptr_def.h"

+using namespace WelsSVCEnc;

+#define RECONTEST_NUM 1000

+// Writes Len pseudo-random bytes (rand() % 256 each) starting at p, in ascending order.
+static void FillWithRandomData (uint8_t* p, int32_t Len) {
+  int32_t iPos = 0;
+  while (iPos < Len) {
+    p[iPos] = rand() % 256;
+    ++iPos;
+  }
+}

+// The dispatched pfIDctI16x16Dc must reproduce WelsIDctRecI16x16Dc_c on random input.
+TEST (ReconstructionFunTest, WelsIDctRecI16x16Dc) {
+  ENFORCE_STACK_ALIGN_2D (uint8_t, pRec, 2, 16 * 16, 16)
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pPred, 32 * 16, 16)
+  ENFORCE_STACK_ALIGN_1D (int16_t, pDct, 16, 16)
+  int32_t iCoreNum = 0;
+  SWelsFuncPtrList sFuncs;
+  const uint32_t kuiCpuFlags = WelsCPUFeatureDetect (&iCoreNum);
+  WelsInitReconstructionFuncs (&sFuncs, kuiCpuFlags);
+  for (int32_t iRound = 0; iRound < RECONTEST_NUM; ++iRound) {
+    FillWithRandomData (pPred, 32 * 16);
+    FillWithRandomData ((uint8_t*)pDct, 16 * 2);
+    for (int32_t n = 0; n < 16; ++n) {
+      pDct[n] = WELS_CLIP3 (pDct[n], -4080, 4080);
+    }
+    WelsIDctRecI16x16Dc_c (pRec[0], 16, pPred, 32, pDct);
+    sFuncs.pfIDctI16x16Dc (pRec[1], 16, pPred, 32, pDct);
+    // output stride is 16, so the 16x16 block is one contiguous run of 256 bytes
+    for (int32_t iPos = 0; iPos < 16 * 16; ++iPos) {
+      ASSERT_EQ (pRec[0][iPos], pRec[1][iPos]);
+    }
+  }
+}

+// The dispatched pfGetNoneZeroCount must return the same count as WelsGetNoneZeroCount_c.
+TEST (ReconstructionFunTest, WelsGetNoneZeroCount) {
+  ENFORCE_STACK_ALIGN_1D (int16_t, pInput, 64, 16)
+  int32_t iCoreNum = 0;
+  SWelsFuncPtrList sFuncs;
+  const uint32_t kuiCpuFlags = WelsCPUFeatureDetect (&iCoreNum);
+  WelsInitEncodingFuncs (&sFuncs, kuiCpuFlags);
+  for (int32_t iRound = 0; iRound < RECONTEST_NUM; ++iRound) {
+    FillWithRandomData ((uint8_t*)pInput, 128);
+    const int32_t kiCountRef = WelsGetNoneZeroCount_c (pInput);
+    const int32_t kiCountOpt = sFuncs.pfGetNoneZeroCount (pInput);
+    ASSERT_EQ (kiCountRef, kiCountOpt);
+  }
+}

+// The dispatched pfTransformHadamard4x4Dc must reproduce WelsHadamardT4Dc_c.
+TEST (ReconstructionFunTest, WelsHadamardT4Dc) {
+  ENFORCE_STACK_ALIGN_1D (int16_t, pDct, 16 * 16, 16)
+  ENFORCE_STACK_ALIGN_2D (int16_t, pLumaDc, 2, 16, 16)
+  int32_t iCoreNum = 0;
+  SWelsFuncPtrList sFuncs;
+  const uint32_t kuiCpuFlags = WelsCPUFeatureDetect (&iCoreNum);
+  WelsInitEncodingFuncs (&sFuncs, kuiCpuFlags);
+  for (int32_t iRound = 0; iRound < RECONTEST_NUM; ++iRound) {
+    FillWithRandomData ((uint8_t*)pDct, 16 * 16 * 2);
+    // clamp all 256 random coefficients to [-4080, 4080]
+    for (int32_t iPos = 0; iPos < 16 * 16; ++iPos) {
+      pDct[iPos] = WELS_CLIP3 (pDct[iPos], -4080, 4080);
+    }
+    WelsHadamardT4Dc_c (pLumaDc[0], pDct);
+    sFuncs.pfTransformHadamard4x4Dc (pLumaDc[1], pDct);
+    for (int32_t n = 0; n < 16; ++n) {
+      ASSERT_EQ (pLumaDc[0][n], pLumaDc[1][n]);
+    }
+  }
+}

+// The dispatched pfDctT4 must reproduce WelsDctT4_c on random input and on the two
+// extreme inputs (255 vs 0 and 0 vs 255).
+TEST (ReconstructionFunTest, WelsDctT4) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pInput1, 16 * 4, 16)
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pInput2, 32 * 4, 16)
+  ENFORCE_STACK_ALIGN_2D (int16_t, pOut, 2, 16, 16)
+  int32_t iCoreNum = 0;
+  SWelsFuncPtrList sFuncs;
+  const uint32_t kuiCpuFlags = WelsCPUFeatureDetect (&iCoreNum);
+  WelsInitEncodingFuncs (&sFuncs, kuiCpuFlags);
+
+  // random input
+  for (int32_t iRound = 0; iRound < RECONTEST_NUM; ++iRound) {
+    FillWithRandomData (pInput1, 16 * 4);
+    FillWithRandomData (pInput2, 32 * 4);
+    WelsDctT4_c (pOut[0], pInput1, 16, pInput2, 32);
+    sFuncs.pfDctT4 (pOut[1], pInput1, 16, pInput2, 32);
+    for (int32_t n = 0; n < 16; ++n) {
+      ASSERT_EQ (pOut[0][n], pOut[1][n]);
+    }
+  }
+
+  // largest positive difference
+  memset (pInput1, 255, 16 * 4);
+  memset (pInput2, 0, 32 * 4);
+  WelsDctT4_c (pOut[0], pInput1, 16, pInput2, 32);
+  sFuncs.pfDctT4 (pOut[1], pInput1, 16, pInput2, 32);
+  for (int32_t n = 0; n < 16; ++n) {
+    ASSERT_EQ (pOut[0][n], pOut[1][n]);
+  }
+
+  // largest negative difference
+  memset (pInput1, 0, 16 * 4);
+  memset (pInput2, 255, 32 * 4);
+  WelsDctT4_c (pOut[0], pInput1, 16, pInput2, 32);
+  sFuncs.pfDctT4 (pOut[1], pInput1, 16, pInput2, 32);
+  for (int32_t n = 0; n < 16; ++n) {
+    ASSERT_EQ (pOut[0][n], pOut[1][n]);
+  }
+}

+// The dispatched pfDctFourT4 must reproduce WelsDctFourT4_c on random input.
+TEST (ReconstructionFunTest, WelsDctFourT4) {
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pInput1, 16 * 8, 16)
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pInput2, 32 * 8, 16)
+  ENFORCE_STACK_ALIGN_2D (int16_t, pOut, 2, 64, 16)
+  int32_t iCoreNum = 0;
+  SWelsFuncPtrList sFuncs;
+  const uint32_t kuiCpuFlags = WelsCPUFeatureDetect (&iCoreNum);
+  WelsInitEncodingFuncs (&sFuncs, kuiCpuFlags);
+  for (int32_t iRound = 0; iRound < RECONTEST_NUM; ++iRound) {
+    FillWithRandomData (pInput1, 16 * 8);
+    FillWithRandomData (pInput2, 32 * 8);
+    WelsDctFourT4_c (pOut[0], pInput1, 16, pInput2, 32);
+    sFuncs.pfDctFourT4 (pOut[1], pInput1, 16, pInput2, 32);
+    for (int32_t n = 0; n < 64; ++n) {
+      ASSERT_EQ (pOut[0][n], pOut[1][n]);
+    }
+  }
+}

+// Runs forward DCT then inverse DCT + reconstruction through both the C reference and
+// the dispatched functions, and compares the 4x4 reconstructed block (stride 16).
+TEST (ReconstructionFunTest, WelsIDctT4Rec) {
+  ENFORCE_STACK_ALIGN_2D (int16_t, pDct, 2, 16, 16)
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pPred, 32 * 4, 16)
+  ENFORCE_STACK_ALIGN_2D (uint8_t, pRec, 2, 16 * 4, 16)
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pInput1, 16 * 4, 16)
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pInput2, 32 * 4, 16)
+  int32_t iCoreNum = 0;
+  SWelsFuncPtrList sFuncs;
+  const uint32_t kuiCpuFlags = WelsCPUFeatureDetect (&iCoreNum);
+  WelsInitReconstructionFuncs (&sFuncs, kuiCpuFlags);
+  WelsInitEncodingFuncs (&sFuncs, kuiCpuFlags);
+
+  // random input
+  for (int32_t iRound = 0; iRound < RECONTEST_NUM; ++iRound) {
+    FillWithRandomData (pPred, 32 * 4);
+    FillWithRandomData (pInput1, 16 * 4);
+    FillWithRandomData (pInput2, 32 * 4);
+    WelsDctT4_c (pDct[0], pInput1, 16, pInput2, 32);
+    sFuncs.pfDctT4 (pDct[1], pInput1, 16, pInput2, 32);
+    WelsIDctT4Rec_c (pRec[0], 16, pPred, 32, pDct[0]);
+    sFuncs.pfIDctT4 (pRec[1], 16, pPred, 32, pDct[1]);
+    for (int32_t iRow = 0; iRow < 4; ++iRow) {
+      for (int32_t iCol = 0; iCol < 4; ++iCol) {
+        ASSERT_EQ (pRec[0][iCol + iRow * 16], pRec[1][iCol + iRow * 16]);
+      }
+    }
+  }
+
+  // saturated prediction with the largest positive residual
+  memset (pPred, 255, 32 * 4);
+  memset (pInput1, 255, 16 * 4);
+  memset (pInput2, 0, 32 * 4);
+  WelsDctT4_c (pDct[0], pInput1, 16, pInput2, 32);
+  sFuncs.pfDctT4 (pDct[1], pInput1, 16, pInput2, 32);
+  WelsIDctT4Rec_c (pRec[0], 16, pPred, 32, pDct[0]);
+  sFuncs.pfIDctT4 (pRec[1], 16, pPred, 32, pDct[1]);
+  for (int32_t iRow = 0; iRow < 4; ++iRow) {
+    for (int32_t iCol = 0; iCol < 4; ++iCol) {
+      ASSERT_EQ (pRec[0][iCol + iRow * 16], pRec[1][iCol + iRow * 16]);
+    }
+  }
+
+  // saturated prediction with the largest negative residual
+  memset (pPred, 255, 32 * 4);
+  memset (pInput1, 0, 16 * 4);
+  memset (pInput2, 255, 32 * 4);
+  WelsDctT4_c (pDct[0], pInput1, 16, pInput2, 32);
+  sFuncs.pfDctT4 (pDct[1], pInput1, 16, pInput2, 32);
+  WelsIDctT4Rec_c (pRec[0], 16, pPred, 32, pDct[0]);
+  sFuncs.pfIDctT4 (pRec[1], 16, pPred, 32, pDct[1]);
+  for (int32_t iRow = 0; iRow < 4; ++iRow) {
+    for (int32_t iCol = 0; iCol < 4; ++iCol) {
+      ASSERT_EQ (pRec[0][iCol + iRow * 16], pRec[1][iCol + iRow * 16]);
+    }
+  }
+}

+// Forward DCT then inverse DCT + reconstruction of an 8x8 area through both the C
+// reference and the dispatched functions; the 8x8 outputs (stride 16) must match.
+TEST (ReconstructionFunTest, WelsIDctFourT4Rec) {
+  ENFORCE_STACK_ALIGN_2D (int16_t, pDct, 2, 64, 16)
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pPred, 32 * 8, 16)
+  ENFORCE_STACK_ALIGN_2D (uint8_t, pRec, 2, 16 * 8, 16)
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pInput1, 16 * 8, 16)
+  ENFORCE_STACK_ALIGN_1D (uint8_t, pInput2, 32 * 8, 16)
+  int32_t iCoreNum = 0;
+  SWelsFuncPtrList sFuncs;
+  const uint32_t kuiCpuFlags = WelsCPUFeatureDetect (&iCoreNum);
+  WelsInitReconstructionFuncs (&sFuncs, kuiCpuFlags);
+  WelsInitEncodingFuncs (&sFuncs, kuiCpuFlags);
+  for (int32_t iRound = 0; iRound < RECONTEST_NUM; ++iRound) {
+    FillWithRandomData (pInput1, 16 * 8);
+    FillWithRandomData (pInput2, 32 * 8);
+    FillWithRandomData (pPred, 32 * 8);
+    WelsDctFourT4_c (pDct[0], pInput1, 16, pInput2, 32);
+    sFuncs.pfDctFourT4 (pDct[1], pInput1, 16, pInput2, 32);
+    WelsIDctFourT4Rec_c (pRec[0], 16, pPred, 32, pDct[0]);
+    sFuncs.pfIDctFourT4 (pRec[1], 16, pPred, 32, pDct[1]);
+    for (int32_t iRow = 0; iRow < 8; ++iRow) {
+      for (int32_t iCol = 0; iCol < 8; ++iCol) {
+        ASSERT_EQ (pRec[0][iCol + iRow * 16], pRec[1][iCol + iRow * 16]);
+      }
+    }
+  }
+}

+// Quantises then dequantises one 4x4 block through the C reference and through the
+// dispatched functions; both pipelines must yield identical coefficients.
+TEST (ReconstructionFunTest, WelsDequant4x4) {
+  ENFORCE_STACK_ALIGN_2D (int16_t, pInput, 2, 16, 16)
+  int32_t iCoreNum = 0;
+  SWelsFuncPtrList sFuncs;
+  const uint32_t kuiCpuFlags = WelsCPUFeatureDetect (&iCoreNum);
+  WelsInitReconstructionFuncs (&sFuncs, kuiCpuFlags);
+  WelsInitEncodingFuncs (&sFuncs, kuiCpuFlags);
+  for (int32_t iRound = 0; iRound < RECONTEST_NUM; ++iRound) {
+    const uint8_t kuiQp = rand() % 52;
+    FillWithRandomData ((uint8_t*)pInput[0], 32);
+    for (int32_t n = 0; n < 16; ++n) {
+      pInput[0][n] = WELS_CLIP3 (pInput[0][n], -32640, 32640);
+    }
+    // both implementations start from the same coefficients
+    memcpy ((uint8_t*)pInput[1], (uint8_t*)pInput[0], 32);
+    const int16_t* pMF = g_kiQuantMF[kuiQp];
+    const int16_t* pFF = g_iQuantIntraFF[kuiQp];
+    WelsQuant4x4_c (pInput[0], pFF, pMF);
+    sFuncs.pfQuantization4x4 (pInput[1], pFF, pMF);
+    WelsDequant4x4_c (pInput[0], g_kuiDequantCoeff[kuiQp]);
+    sFuncs.pfDequantization4x4 (pInput[1], g_kuiDequantCoeff[kuiQp]);
+    for (int32_t n = 0; n < 16; ++n) {
+      ASSERT_EQ (pInput[0][n], pInput[1][n]);
+    }
+  }
+}

+// Quantises one 4x4 block, then applies dequantisation + inverse Hadamard through the
+// C reference and through the dispatched functions; results must be identical.
+TEST (ReconstructionFunTest, WelsDequantIHadamard4x4) {
+  ENFORCE_STACK_ALIGN_2D (int16_t, pInput, 2, 16, 16)
+  int32_t iCoreNum = 0;
+  SWelsFuncPtrList sFuncs;
+  const uint32_t kuiCpuFlags = WelsCPUFeatureDetect (&iCoreNum);
+  WelsInitReconstructionFuncs (&sFuncs, kuiCpuFlags);
+  WelsInitEncodingFuncs (&sFuncs, kuiCpuFlags);
+  for (int32_t iRound = 0; iRound < RECONTEST_NUM; ++iRound) {
+    const uint8_t kuiQp = rand() % 52;
+    FillWithRandomData ((uint8_t*)pInput[0], 32);
+    for (int32_t n = 0; n < 16; ++n) {
+      pInput[0][n] = WELS_CLIP3 (pInput[0][n], -32640, 32640);
+    }
+    // both implementations start from the same coefficients
+    memcpy ((uint8_t*)pInput[1], (uint8_t*)pInput[0], 32);
+    const int16_t* pMF = g_kiQuantMF[kuiQp];
+    const int16_t* pFF = g_iQuantIntraFF[kuiQp];
+    WelsQuant4x4_c (pInput[0], pFF, pMF);
+    sFuncs.pfQuantization4x4 (pInput[1], pFF, pMF);
+    WelsDequantIHadamard4x4_c (pInput[0], g_kuiDequantCoeff[kuiQp][0]);
+    sFuncs.pfDequantizationIHadamard4x4 (pInput[1], g_kuiDequantCoeff[kuiQp][0]);
+    for (int32_t n = 0; n < 16; ++n) {
+      ASSERT_EQ (pInput[0][n], pInput[1][n]);
+    }
+  }
+}

+// The dispatched pfQuantization4x4 must reproduce WelsQuant4x4_c for a random QP.
+TEST (ReconstructionFunTest, WelsQuant4x4) {
+  ENFORCE_STACK_ALIGN_2D (int16_t, pInput, 2, 16, 16)
+  int32_t iCoreNum = 0;
+  SWelsFuncPtrList sFuncs;
+  const uint32_t kuiCpuFlags = WelsCPUFeatureDetect (&iCoreNum);
+  WelsInitEncodingFuncs (&sFuncs, kuiCpuFlags);
+  for (int32_t iRound = 0; iRound < RECONTEST_NUM; ++iRound) {
+    const uint8_t kuiQp = rand() % 52;
+    FillWithRandomData ((uint8_t*)pInput[0], 32);
+    for (int32_t n = 0; n < 16; ++n) {
+      pInput[0][n] = WELS_CLIP3 (pInput[0][n], -32640, 32640);
+    }
+    // both implementations start from the same coefficients
+    memcpy ((uint8_t*)pInput[1], (uint8_t*)pInput[0], 32);
+    const int16_t* pMF = g_kiQuantMF[kuiQp];
+    const int16_t* pFF = g_iQuantIntraFF[kuiQp];
+    WelsQuant4x4_c (pInput[0], pFF, pMF);
+    sFuncs.pfQuantization4x4 (pInput[1], pFF, pMF);
+    for (int32_t n = 0; n < 16; ++n) {
+      ASSERT_EQ (pInput[0][n], pInput[1][n]);
+    }
+  }
+}

+// The dispatched pfQuantizationDc4x4 must reproduce WelsQuant4x4Dc_c, using the first
+// FF/MF entry of a random QP.
+TEST (ReconstructionFunTest, WelsQuant4x4Dc) {
+  ENFORCE_STACK_ALIGN_2D (int16_t, pInput, 2, 16, 16)
+  int32_t iCoreNum = 0;
+  SWelsFuncPtrList sFuncs;
+  const uint32_t kuiCpuFlags = WelsCPUFeatureDetect (&iCoreNum);
+  WelsInitEncodingFuncs (&sFuncs, kuiCpuFlags);
+  for (int32_t iRound = 0; iRound < RECONTEST_NUM; ++iRound) {
+    const uint8_t kuiQp = rand() % 52;
+    FillWithRandomData ((uint8_t*)pInput[0], 32);
+    for (int32_t n = 0; n < 16; ++n) {
+      pInput[0][n] = WELS_CLIP3 (pInput[0][n], -32640, 32640);
+    }
+    // both implementations start from the same coefficients
+    memcpy ((uint8_t*)pInput[1], (uint8_t*)pInput[0], 32);
+    const int16_t* pMF = g_kiQuantMF[kuiQp];
+    const int16_t* pFF = g_iQuantIntraFF[kuiQp];
+    WelsQuant4x4Dc_c (pInput[0], pFF[0], pMF[0]);
+    sFuncs.pfQuantizationDc4x4 (pInput[1], pFF[0], pMF[0]);
+    for (int32_t n = 0; n < 16; ++n) {
+      ASSERT_EQ (pInput[0][n], pInput[1][n]);
+    }
+  }
+}

+// The dispatched pfQuantizationFour4x4 must reproduce WelsQuantFour4x4_c on 64 coefficients.
+TEST (ReconstructionFunTest, WelsQuantFour4x4) {
+  ENFORCE_STACK_ALIGN_2D (int16_t, pInput, 2, 64, 16)
+  int32_t iCoreNum = 0;
+  SWelsFuncPtrList sFuncs;
+  const uint32_t kuiCpuFlags = WelsCPUFeatureDetect (&iCoreNum);
+  WelsInitEncodingFuncs (&sFuncs, kuiCpuFlags);
+  for (int32_t iRound = 0; iRound < RECONTEST_NUM; ++iRound) {
+    const uint8_t kuiQp = rand() % 52;
+    FillWithRandomData ((uint8_t*)pInput[0], 128);
+    for (int32_t n = 0; n < 64; ++n) {
+      pInput[0][n] = WELS_CLIP3 (pInput[0][n], -32640, 32640);
+    }
+    // both implementations start from the same coefficients
+    memcpy ((uint8_t*)pInput[1], (uint8_t*)pInput[0], 128);
+    const int16_t* pMF = g_kiQuantMF[kuiQp];
+    const int16_t* pFF = g_iQuantIntraFF[kuiQp];
+    WelsQuantFour4x4_c (pInput[0], pFF, pMF);
+    sFuncs.pfQuantizationFour4x4 (pInput[1], pFF, pMF);
+    for (int32_t n = 0; n < 64; ++n) {
+      ASSERT_EQ (pInput[0][n], pInput[1][n]);
+    }
+  }
+}

+// The dispatched pfQuantizationFour4x4Max must reproduce WelsQuantFour4x4Max_c, both in
+// the 64 quantised coefficients and in the four per-block maxima.
+TEST (ReconstructionFunTest, WelsQuantFour4x4Max) {
+  ENFORCE_STACK_ALIGN_2D (int16_t, pInput, 2, 64, 16)
+  int32_t iCoreNum = 0;
+  SWelsFuncPtrList sFuncs;
+  const uint32_t kuiCpuFlags = WelsCPUFeatureDetect (&iCoreNum);
+  WelsInitEncodingFuncs (&sFuncs, kuiCpuFlags);
+  int16_t pMax[2][4];
+  for (int32_t iRound = 0; iRound < RECONTEST_NUM; ++iRound) {
+    const uint8_t kuiQp = rand() % 52;
+    FillWithRandomData ((uint8_t*)pInput[0], 128);
+    for (int32_t n = 0; n < 64; ++n) {
+      pInput[0][n] = WELS_CLIP3 (pInput[0][n], -32640, 32640);
+    }
+    // both implementations start from the same coefficients
+    memcpy ((uint8_t*)pInput[1], (uint8_t*)pInput[0], 128);
+    const int16_t* pMF = g_kiQuantMF[kuiQp];
+    const int16_t* pFF = g_iQuantIntraFF[kuiQp];
+    WelsQuantFour4x4Max_c (pInput[0], pFF, pMF, pMax[0]);
+    sFuncs.pfQuantizationFour4x4Max (pInput[1], pFF, pMF, pMax[1]);
+    for (int32_t n = 0; n < 64; ++n) {
+      const int32_t kiBlk = n >> 4;  // index of the 4x4 block holding coefficient n
+      ASSERT_EQ (pInput[0][n], pInput[1][n]);
+      ASSERT_EQ (pMax[0][kiBlk], pMax[1][kiBlk]);
+    }
+  }
+}

+// Quantises then dequantises four 4x4 blocks through the C reference and through the
+// dispatched functions; both pipelines must yield identical coefficients.
+TEST (ReconstructionFunTest, WelsDeQuantFour4x4) {
+  ENFORCE_STACK_ALIGN_2D (int16_t, pInput, 2, 64, 16)
+  int32_t iCoreNum = 0;
+  SWelsFuncPtrList sFuncs;
+  const uint32_t kuiCpuFlags = WelsCPUFeatureDetect (&iCoreNum);
+  WelsInitEncodingFuncs (&sFuncs, kuiCpuFlags);
+  WelsInitReconstructionFuncs (&sFuncs, kuiCpuFlags);
+  for (int32_t iRound = 0; iRound < RECONTEST_NUM; ++iRound) {
+    const uint8_t kuiQp = rand() % 52;
+    FillWithRandomData ((uint8_t*)pInput[0], 128);
+    for (int32_t n = 0; n < 64; ++n) {
+      pInput[0][n] = WELS_CLIP3 (pInput[0][n], -32640, 32640);
+    }
+    // both implementations start from the same coefficients
+    memcpy ((uint8_t*)pInput[1], (uint8_t*)pInput[0], 128);
+    const int16_t* pMF = g_kiQuantMF[kuiQp];
+    const int16_t* pFF = g_iQuantIntraFF[kuiQp];
+    WelsQuantFour4x4_c (pInput[0], pFF, pMF);
+    sFuncs.pfQuantizationFour4x4 (pInput[1], pFF, pMF);
+    WelsDequantFour4x4_c (pInput[0], g_kuiDequantCoeff[kuiQp]);
+    sFuncs.pfDequantizationFour4x4 (pInput[1], g_kuiDequantCoeff[kuiQp]);
+    for (int32_t n = 0; n < 64; ++n) {
+      ASSERT_EQ (pInput[0][n], pInput[1][n]);
+    }
+  }
+}

+// The dispatched pfQuantizationHadamard2x2Skip must agree with WelsHadamardQuant2x2Skip_c
+// on whether the result is zero or non-zero (the exact value is not compared).
+TEST (ReconstructionFunTest, WelsHadamardQuant2x2Skip) {
+  ENFORCE_STACK_ALIGN_2D (int16_t, pInput, 2, 64, 16)
+  int32_t iCoreNum = 0;
+  SWelsFuncPtrList sFuncs;
+  const uint32_t kuiCpuFlags = WelsCPUFeatureDetect (&iCoreNum);
+  WelsInitEncodingFuncs (&sFuncs, kuiCpuFlags);
+  for (int32_t iRound = 0; iRound < RECONTEST_NUM; ++iRound) {
+    const uint8_t kuiQp = rand() % 52;
+    FillWithRandomData ((uint8_t*)pInput[0], 128);
+    for (int32_t n = 0; n < 64; ++n) {
+      pInput[0][n] = WELS_CLIP3 (pInput[0][n], -4080, 4080);
+    }
+    // both implementations start from the same coefficients
+    memcpy ((uint8_t*)pInput[1], (uint8_t*)pInput[0], 128);
+    const int16_t* pMF = g_kiQuantMF[kuiQp];
+    const int16_t* pFF = g_iQuantIntraFF[kuiQp];
+    const int32_t kiSkipRef = WelsHadamardQuant2x2Skip_c (pInput[0], pFF[0], pMF[0]);
+    const int32_t kiSkipOpt = sFuncs.pfQuantizationHadamard2x2Skip (pInput[1], pFF[0], pMF[0]);
+    ASSERT_EQ ((kiSkipOpt != 0), (kiSkipRef != 0));
+  }
+}

+// The dispatched pfQuantizationHadamard2x2 must agree with WelsHadamardQuant2x2_c on
+// the zero/non-zero return, on the updated input buffer and on both 4-entry outputs.
+TEST (ReconstructionFunTest, WelsHadamardQuant2x2) {
+  ENFORCE_STACK_ALIGN_2D (int16_t, pInput, 2, 64, 16)
+  ENFORCE_STACK_ALIGN_2D (int16_t, pDct, 2, 4, 16)
+  ENFORCE_STACK_ALIGN_2D (int16_t, pBlock, 2, 4, 16)
+  int32_t iCoreNum = 0;
+  SWelsFuncPtrList sFuncs;
+  const uint32_t kuiCpuFlags = WelsCPUFeatureDetect (&iCoreNum);
+  WelsInitEncodingFuncs (&sFuncs, kuiCpuFlags);
+  for (int32_t iRound = 0; iRound < RECONTEST_NUM; ++iRound) {
+    const uint8_t kuiQp = rand() % 52;
+    FillWithRandomData ((uint8_t*)pInput[0], 128);
+    for (int32_t n = 0; n < 64; ++n) {
+      pInput[0][n] = WELS_CLIP3 (pInput[0][n], -4080, 4080);
+    }
+    // both implementations start from the same coefficients
+    memcpy ((uint8_t*)pInput[1], (uint8_t*)pInput[0], 128);
+    const int16_t* pMF = g_kiQuantMF[kuiQp];
+    const int16_t* pFF = g_iQuantIntraFF[kuiQp];
+    const int32_t kiSkipRef = WelsHadamardQuant2x2_c (pInput[0], pFF[0], pMF[0], pDct[0], pBlock[0]);
+    const int32_t kiSkipOpt = sFuncs.pfQuantizationHadamard2x2 (pInput[1], pFF[0], pMF[0], pDct[1], pBlock[1]);
+    ASSERT_EQ ((kiSkipOpt != 0), (kiSkipRef != 0));
+    for (int32_t n = 0; n < 64; ++n) {
+      ASSERT_EQ (pInput[0][n], pInput[1][n]);
+    }
+    for (int32_t n = 0; n < 4; ++n) {
+      ASSERT_EQ (pDct[0][n], pDct[1][n]);
+      ASSERT_EQ (pBlock[0][n], pBlock[1][n]);
+    }
+  }
+}

--- a/test/encoder/targets.mk

+++ b/test/encoder/targets.mk

@@ -8,6 +8,7 @@

 	$(ENCODER_UNITTEST_SRCDIR)/EncUT_GetIntraPredictor.cpp\

 	$(ENCODER_UNITTEST_SRCDIR)/EncUT_MemoryAlloc.cpp\

 	$(ENCODER_UNITTEST_SRCDIR)/EncUT_MotionEstimate.cpp\

+        $(ENCODER_UNITTEST_SRCDIR)/EncUT_Reconstruct.cpp\

 	$(ENCODER_UNITTEST_SRCDIR)/EncUT_Sample.cpp\

 ENCODER_UNITTEST_OBJS += $(ENCODER_UNITTEST_CPP_SRCS:.cpp=.$(OBJ))