shithub: libvpx

--- a/test/fdct4x4_test.cc

+++ b/test/fdct4x4_test.cc

@@ -31,7 +31,7 @@

 void idct4x4_add(int16_t* /*in*/, int16_t *out, uint8_t *dst,

                  int stride, int /*tx_type*/) {

-  vp9_short_idct4x4_add_c(out, dst, stride >> 1);

+  vp9_idct4x4_16_add_c(out, dst, stride >> 1);

 void fht4x4(int16_t *in, int16_t *out, uint8_t* /*dst*/,

             int stride, int tx_type) {

--- a/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm

+++ b/vp9/common/arm/neon/vp9_short_idct4x4_1_add_neon.asm

@@ -8,7 +8,7 @@

-    EXPORT  |vp9_short_idct4x4_1_add_neon|

+    EXPORT  |vp9_idct4x4_1_add_neon|

ARM

     REQUIRE8

     PRESERVE8

@@ -15,7 +15,7 @@

     AREA ||.text||, CODE, READONLY, ALIGN=2

-;void vp9_short_idct4x4_1_add_neon(int16_t *input, uint8_t *dest,

+;void vp9_idct4x4_1_add_neon(int16_t *input, uint8_t *dest,

 ;                                  int dest_stride)

 ; r0  int16_t input

@@ -22,7 +22,7 @@

 ; r1  uint8_t *dest

 ; r2  int dest_stride)

-|vp9_short_idct4x4_1_add_neon| PROC

+|vp9_idct4x4_1_add_neon| PROC

     ldrsh            r0, [r0]

     ; generate cospi_16_64 = 11585

@@ -63,6 +63,6 @@

     vst1.32          {d7[1]}, [r12]

     bx               lr

-    ENDP             ; |vp9_short_idct4x4_1_add_neon|

+    ENDP             ; |vp9_idct4x4_1_add_neon|

END

--- a/vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm

+++ b/vp9/common/arm/neon/vp9_short_idct4x4_add_neon.asm

@@ -8,7 +8,7 @@

 ;  be found in the AUTHORS file in the root of the source tree.

-    EXPORT  |vp9_short_idct4x4_add_neon|

+    EXPORT  |vp9_idct4x4_16_add_neon|

ARM

     REQUIRE8

     PRESERVE8

@@ -16,13 +16,13 @@

     AREA ||.text||, CODE, READONLY, ALIGN=2

     AREA     Block, CODE, READONLY ; name this block of code

-;void vp9_short_idct4x4_add_neon(int16_t *input, uint8_t *dest, int dest_stride)

+;void vp9_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride)

 ; r0  int16_t input

 ; r1  uint8_t *dest

 ; r2  int dest_stride)

-|vp9_short_idct4x4_add_neon| PROC

+|vp9_idct4x4_16_add_neon| PROC

     ; The 2D transform is done with two passes which are actually pretty

     ; similar. We first transform the rows. This is done by transposing

@@ -185,6 +185,6 @@

     vst1.32 {d26[1]}, [r1], r2

     vst1.32 {d26[0]}, [r1]  ; no post-increment

     bx              lr

-    ENDP  ; |vp9_short_idct4x4_add_neon|

+    ENDP  ; |vp9_idct4x4_16_add_neon|

END

--- a/vp9/common/vp9_idct.c

+++ b/vp9/common/vp9_idct.c

@@ -18,7 +18,7 @@

 #include "vp9/common/vp9_common.h"

 #include "vp9/common/vp9_idct.h"

-void vp9_short_iwalsh4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) {

+void vp9_iwht4x4_16_add_c(int16_t *input, uint8_t *dest, int dest_stride) {

 /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,

    0.5 shifts per pixel. */

   int i;

@@ -70,7 +70,7 @@

-void vp9_short_iwalsh4x4_1_add_c(int16_t *in, uint8_t *dest, int dest_stride) {

+void vp9_iwht4x4_1_add_c(int16_t *in, uint8_t *dest, int dest_stride) {

   int i;

   int a1, e1;

   int16_t tmp[4];

@@ -116,7 +116,7 @@

   output[3] = step[0] - step[3];

-void vp9_short_idct4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) {

+void vp9_idct4x4_16_add_c(int16_t *input, uint8_t *dest, int dest_stride) {

   int16_t out[4 * 4];

   int16_t *outptr = out;

   int i, j;

@@ -140,7 +140,7 @@

-void vp9_short_idct4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) {

+void vp9_idct4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) {

   int i;

   int a1;

   int16_t out = dct_const_round_shift(input[0] * cospi_16_64);

@@ -1286,20 +1286,19 @@

 // idct

-void vp9_idct_add(int16_t *input, uint8_t *dest, int stride, int eob) {

+void vp9_idct4x4_add(int16_t *input, uint8_t *dest, int stride, int eob) {

   if (eob > 1)

-    vp9_short_idct4x4_add(input, dest, stride);

+    vp9_idct4x4_16_add(input, dest, stride);

   else

-    vp9_short_idct4x4_1_add(input, dest, stride);

+    vp9_idct4x4_1_add(input, dest, stride);

-void vp9_idct_add_lossless(int16_t *input, uint8_t *dest, int stride,

-                             int eob) {

+void vp9_iwht4x4_add(int16_t *input, uint8_t *dest, int stride, int eob) {

   if (eob > 1)

-    vp9_short_iwalsh4x4_add(input, dest, stride);

+    vp9_iwht4x4_16_add(input, dest, stride);

   else

-    vp9_short_iwalsh4x4_1_add_c(input, dest, stride);

+    vp9_iwht4x4_1_add(input, dest, stride);

 void vp9_idct_add_8x8(int16_t *input, uint8_t *dest, int stride, int eob) {

@@ -1348,7 +1347,7 @@

 void vp9_iht_add(TX_TYPE tx_type, int16_t *input, uint8_t *dest, int stride,

                    int eob) {

   if (tx_type == DCT_DCT)

-    vp9_idct_add(input, dest, stride, eob);

+    vp9_idct4x4_add(input, dest, stride, eob);

   else

     vp9_short_iht4x4_add(input, dest, stride, tx_type);

--- a/vp9/common/vp9_idct.h

+++ b/vp9/common/vp9_idct.h

@@ -88,9 +88,8 @@

 } transform_2d;

-void vp9_idct_add(int16_t *input, uint8_t *dest, int stride, int eob);

-void vp9_idct_add_lossless(int16_t *input, uint8_t *dest,

-                           int stride, int eob);

+void vp9_idct4x4_add(int16_t *input, uint8_t *dest, int stride, int eob);

+void vp9_iwht4x4_add(int16_t *input, uint8_t *dest, int stride, int eob);

 void vp9_idct_add_8x8(int16_t *input, uint8_t *dest, int stride, int eob);

 void vp9_idct_add_16x16(int16_t *input, uint8_t *dest, int stride, int eob);

 void vp9_idct_add_32x32(int16_t *input, uint8_t *dest, int stride, int eob);

--- a/vp9/common/vp9_rtcd_defs.sh

+++ b/vp9/common/vp9_rtcd_defs.sh

@@ -267,11 +267,11 @@

 # dct

-prototype void vp9_short_idct4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride"

-specialize vp9_short_idct4x4_1_add sse2 neon

+prototype void vp9_idct4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride"

+specialize vp9_idct4x4_1_add sse2 neon

-prototype void vp9_short_idct4x4_add "int16_t *input, uint8_t *dest, int dest_stride"

-specialize vp9_short_idct4x4_add sse2 neon

+prototype void vp9_idct4x4_16_add "int16_t *input, uint8_t *dest, int dest_stride"

+specialize vp9_idct4x4_16_add sse2 neon

 prototype void vp9_short_idct8x8_1_add "int16_t *input, uint8_t *dest, int dest_stride"

 specialize vp9_short_idct8x8_1_add sse2 neon

@@ -310,11 +310,11 @@

 specialize vp9_idct4_1d sse2

 # dct and add

-prototype void vp9_short_iwalsh4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride"

-specialize vp9_short_iwalsh4x4_1_add

+prototype void vp9_iwht4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride"

+specialize vp9_iwht4x4_1_add

-prototype void vp9_short_iwalsh4x4_add "int16_t *input, uint8_t *dest, int dest_stride"

-specialize vp9_short_iwalsh4x4_add

+prototype void vp9_iwht4x4_16_add "int16_t *input, uint8_t *dest, int dest_stride"

+specialize vp9_iwht4x4_16_add

 # Encoder functions below this point.

--- a/vp9/common/x86/vp9_idct_intrin_sse2.c

+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c

@@ -15,7 +15,7 @@

 #include "vp9/common/vp9_common.h"

 #include "vp9/common/vp9_idct.h"

-void vp9_short_idct4x4_add_sse2(int16_t *input, uint8_t *dest, int stride) {

+void vp9_idct4x4_16_add_sse2(int16_t *input, uint8_t *dest, int stride) {

   const __m128i zero = _mm_setzero_si128();

   const __m128i eight = _mm_set1_epi16(8);

   const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,

@@ -148,7 +148,7 @@

   RECON_AND_STORE4X4(dest, input3);

-void vp9_short_idct4x4_1_add_sse2(int16_t *input, uint8_t *dest, int stride) {

+void vp9_idct4x4_1_add_sse2(int16_t *input, uint8_t *dest, int stride) {

   __m128i dc_value;

   const __m128i zero = _mm_setzero_si128();

   int a;

--- a/vp9/decoder/vp9_decodframe.c

+++ b/vp9/decoder/vp9_decodframe.c

@@ -490,8 +490,7 @@

                  cm->uv_dc_delta_q == 0 &&

                  cm->uv_ac_delta_q == 0;

-  xd->itxm_add = xd->lossless ? vp9_idct_add_lossless

-                              : vp9_idct_add;

+  xd->itxm_add = xd->lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;

 static INTERPOLATIONFILTERTYPE read_interp_filter_type(

--- a/vp9/encoder/vp9_encodeframe.c

+++ b/vp9/encoder/vp9_encodeframe.c

@@ -1866,8 +1866,8 @@

     // printf("Switching to lossless\n");

     cpi->mb.fwd_txm8x4 = vp9_short_walsh8x4;

     cpi->mb.fwd_txm4x4 = vp9_short_walsh4x4;

-    cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_iwalsh4x4_1_add;

-    cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_iwalsh4x4_add;

+    cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_iwht4x4_1_add;

+    cpi->mb.e_mbd.inv_txm4x4_add = vp9_iwht4x4_16_add;

     cpi->mb.optimize = 0;

     cpi->common.lf.filter_level = 0;

     cpi->zbin_mode_boost_enabled = 0;

@@ -1876,8 +1876,8 @@

     // printf("Not lossless\n");

     cpi->mb.fwd_txm8x4 = vp9_short_fdct8x4;

     cpi->mb.fwd_txm4x4 = vp9_short_fdct4x4;

-    cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_idct4x4_1_add;

-    cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_idct4x4_add;

+    cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_idct4x4_1_add;

+    cpi->mb.e_mbd.inv_txm4x4_add = vp9_idct4x4_16_add;

--- a/vp9/encoder/vp9_onyx_if.c

+++ b/vp9/encoder/vp9_onyx_if.c

@@ -1261,11 +1261,11 @@

   cpi->oxcf.lossless = oxcf->lossless;

   if (cpi->oxcf.lossless) {

-    cpi->mb.e_mbd.inv_txm4x4_1_add    = vp9_short_iwalsh4x4_1_add;

-    cpi->mb.e_mbd.inv_txm4x4_add      = vp9_short_iwalsh4x4_add;

+    cpi->mb.e_mbd.inv_txm4x4_1_add    = vp9_iwht4x4_1_add;

+    cpi->mb.e_mbd.inv_txm4x4_add      = vp9_iwht4x4_16_add;

   } else {

-    cpi->mb.e_mbd.inv_txm4x4_1_add    = vp9_short_idct4x4_1_add;

-    cpi->mb.e_mbd.inv_txm4x4_add      = vp9_short_idct4x4_add;

+    cpi->mb.e_mbd.inv_txm4x4_1_add    = vp9_idct4x4_1_add;

+    cpi->mb.e_mbd.inv_txm4x4_add      = vp9_idct4x4_16_add;

   cpi->baseline_gf_interval = DEFAULT_GF_INTERVAL;