shithub: libvpx

--- a/vp9/common/vp9_idct.c

+++ b/vp9/common/vp9_idct.c

@@ -19,23 +19,33 @@

 #include "vp9/common/vp9_idct.h"

 void vp9_short_iwalsh4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) {

+/* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,

+   0.5 shifts per pixel. */

   int i;

   int16_t output[16];

-  int a1, b1, c1, d1;

+  int a1, b1, c1, d1, e1;

   int16_t *ip = input;

   int16_t *op = output;

   for (i = 0; i < 4; i++) {

-    a1 = (ip[0] + ip[3]) >> WHT_UPSCALE_FACTOR;

-    b1 = (ip[1] + ip[2]) >> WHT_UPSCALE_FACTOR;

-    c1 = (ip[1] - ip[2]) >> WHT_UPSCALE_FACTOR;

-    d1 = (ip[0] - ip[3]) >> WHT_UPSCALE_FACTOR;

+    a1 = ip[0] >> WHT_UPSCALE_FACTOR;

+    c1 = ip[1] >> WHT_UPSCALE_FACTOR;

+    d1 = ip[2] >> WHT_UPSCALE_FACTOR;

+    b1 = ip[3] >> WHT_UPSCALE_FACTOR;

-    op[0] = (a1 + b1 + 1) >> 1;

-    op[1] = (c1 + d1) >> 1;

-    op[2] = (a1 - b1) >> 1;

-    op[3] = (d1 - c1) >> 1;

+    c1 = a1 - c1;

+    b1 += d1;

+    e1 = (c1 - b1) >> 1;

+    a1 -= e1;

+    d1 += e1;

+    b1 = a1 - b1;

+    c1 -= d1;

+    op[0] = a1;

+    op[1] = b1;

+    op[2] = c1;

+    op[3] = d1;

     ip += 4;

     op += 4;

@@ -42,20 +52,23 @@

   ip = output;

   for (i = 0; i < 4; i++) {

-    a1 = ip[4 * 0] + ip[4 * 3];

-    b1 = ip[4 * 1] + ip[4 * 2];

-    c1 = ip[4 * 1] - ip[4 * 2];

-    d1 = ip[4 * 0] - ip[4 * 3];

+    a1 = ip[4 * 0];

+    c1 = ip[4 * 1];

+    d1 = ip[4 * 2];

+    b1 = ip[4 * 3];

+    c1 = a1 - c1;

+    b1 += d1;

+    e1 = (c1 - b1) >> 1;

+    a1 -= e1;

+    d1 += e1;

+    b1 = a1 - b1;

+    c1 -= d1;

-    dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] +

-                                       ((a1 + b1 + 1) >> 1));

-    dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] +

-                                       ((c1 + d1) >> 1));

-    dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] +

-                                       ((a1 - b1) >> 1));

-    dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] +

-                                       ((d1 - c1) >> 1));

+    dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] + a1);

+    dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] + b1);

+    dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] + c1);

+    dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] + d1);

     ip++;

     dest++;

@@ -64,23 +77,24 @@

 void vp9_short_iwalsh4x4_1_add_c(int16_t *in, uint8_t *dest, int dest_stride) {

   int i;

+  int a1, e1;

   int16_t tmp[4];

   int16_t *ip = in;

   int16_t *op = tmp;

-  op[0] = ((ip[0] >> WHT_UPSCALE_FACTOR) + 1) >> 1;

-  op[1] = op[2] = op[3] = (ip[0] >> WHT_UPSCALE_FACTOR) >> 1;

+  a1 = ip[0] >> WHT_UPSCALE_FACTOR;

+  e1 = a1 >> 1;

+  op[0] = op[1] = op[2] = a1 - e1;

+  op[3] = e1;

   ip = tmp;

   for (i = 0; i < 4; i++) {

-    dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] +

-                                       ((ip[0] + 1) >> 1));

-    dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] +

-                                       (ip[0] >> 1));

-    dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] +

-                                       (ip[0] >> 1));

-    dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] +

-                                       (ip[0] >> 1));

+    e1 = ip[0] >> 1;

+    a1 = ip[0] - e1;

+    dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] + a1);

+    dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] + a1);

+    dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] + a1);

+    dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] + e1);

     ip++;

     dest++;

--- a/vp9/encoder/vp9_dct.c

+++ b/vp9/encoder/vp9_dct.c

@@ -591,24 +591,34 @@

+/* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per

+   pixel. */

 void vp9_short_walsh4x4_c(short *input, short *output, int pitch) {

   int i;

-  int a1, b1, c1, d1;

+  int a1, b1, c1, d1, e1;

   short *ip = input;

   short *op = output;

   int pitch_short = pitch >> 1;

   for (i = 0; i < 4; i++) {

-    a1 = ip[0 * pitch_short] + ip[3 * pitch_short];

-    b1 = ip[1 * pitch_short] + ip[2 * pitch_short];

-    c1 = ip[1 * pitch_short] - ip[2 * pitch_short];

-    d1 = ip[0 * pitch_short] - ip[3 * pitch_short];

+    a1 = ip[0 * pitch_short];

+    b1 = ip[1 * pitch_short];

+    c1 = ip[2 * pitch_short];

+    d1 = ip[3 * pitch_short];

-    op[0] = (a1 + b1 + 1) >> 1;

-    op[4] = (c1 + d1) >> 1;

-    op[8] = (a1 - b1) >> 1;

-    op[12] = (d1 - c1) >> 1;

+    b1 = a1 - b1;

+    c1 += d1;

+    e1 = (c1 - b1) >> 1;

+    a1 += e1;

+    d1 -= e1;

+    c1 = a1 - c1;

+    b1 -= d1;

+    op[0] = a1;

+    op[4] = c1;

+    op[8] = d1;

+    op[12] = b1;

     ip++;

     op++;

@@ -616,15 +626,23 @@

   op = output;

   for (i = 0; i < 4; i++) {

-    a1 = ip[0] + ip[3];

-    b1 = ip[1] + ip[2];

-    c1 = ip[1] - ip[2];

-    d1 = ip[0] - ip[3];

+    a1 = ip[0];

+    b1 = ip[1];

+    c1 = ip[2];

+    d1 = ip[3];

-    op[0] = ((a1 + b1 + 1) >> 1) << WHT_UPSCALE_FACTOR;

-    op[1] = ((c1 + d1) >> 1) << WHT_UPSCALE_FACTOR;

-    op[2] = ((a1 - b1) >> 1) << WHT_UPSCALE_FACTOR;

-    op[3] = ((d1 - c1) >> 1) << WHT_UPSCALE_FACTOR;

+    b1 = a1 - b1;

+    c1 += d1;

+    e1 = (c1 - b1) >> 1;

+    a1 += e1;

+    d1 -= e1;

+    c1 = a1 - c1;

+    b1 -= d1;

+    op[0] = a1 << WHT_UPSCALE_FACTOR;

+    op[1] = c1 << WHT_UPSCALE_FACTOR;

+    op[2] = d1 << WHT_UPSCALE_FACTOR;

+    op[3] = b1 << WHT_UPSCALE_FACTOR;

     ip += 4;

     op += 4;