shithub: libvpx

--- a/test/quantize_test.cc

+++ b/test/quantize_test.cc

@@ -192,4 +192,12 @@

                         ::testing::Values(make_tuple(&vp8_fast_quantize_b_neon,

                                                      &vp8_fast_quantize_b_c)));

 #endif  // HAVE_NEON

+#if HAVE_MSA

+INSTANTIATE_TEST_CASE_P(

+    MSA, QuantizeTest,

+    ::testing::Values(  // each tuple: (MSA function, C function)

+        make_tuple(&vp8_fast_quantize_b_msa, &vp8_fast_quantize_b_c),

+        make_tuple(&vp8_regular_quantize_b_msa, &vp8_regular_quantize_b_c)));

+#endif  // HAVE_MSA

 }  // namespace

--- a/vp8/common/mips/msa/vp8_macros_msa.h

+++ b/vp8/common/mips/msa/vp8_macros_msa.h

@@ -553,6 +553,20 @@

 #define VSHF_B3_SB(...) VSHF_B3(v16i8, __VA_ARGS__)

+/* Description : Shuffle halfword vector elements as per mask vector

+   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1

+                 Outputs - out0, out1

+                 Return Type - as per RTYPE

+   Details     : Halfword elements from 'in0' & 'in1' are copied selectively to

+                 'out0' as per control vector 'mask0'.

+                 Likewise, halfword elements from 'in2' & 'in3' are copied

+                 selectively to 'out1' as per control vector 'mask1'.

+                 VSHF_H2_SH is this macro with RTYPE fixed to v8i16.

+*/

+#define VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)   \

+{                                                                      \

+    out0 = (RTYPE)__msa_vshf_h((v8i16)mask0, (v8i16)in1, (v8i16)in0);  \

+    out1 = (RTYPE)__msa_vshf_h((v8i16)mask1, (v8i16)in3, (v8i16)in2);  \

+}

+#define VSHF_H2_SH(...) VSHF_H2(v8i16, __VA_ARGS__)

 /* Description : Dot product of byte vector elements

    Arguments   : Inputs  - mult0, mult1, cnst0, cnst1

                  Outputs - out0, out1

@@ -604,6 +618,31 @@

 #define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)

+/* Description : Dot product of halfword vector elements

+   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1

+                 Outputs - out0, out1

+                 Return Type - as per RTYPE

+   Details     : Signed halfword elements from 'mult0' are multiplied with

+                 signed halfword elements from 'cnst0' producing a result

+                 twice the size of input i.e. signed word.

+                 The multiplication results of adjacent odd-even elements

+                 are added together and written to the 'out0' vector.

+                 'mult1' and 'cnst1' are combined the same way into 'out1'.

+                 DOTP_SH4 applies DOTP_SH2 to two such groups of operands;

+                 DOTP_SH4_SW is DOTP_SH4 with RTYPE fixed to v4i32.

+*/

+#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)  \

+{                                                                \

+    out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0);    \

+    out1 = (RTYPE)__msa_dotp_s_w((v8i16)mult1, (v8i16)cnst1);    \

+}

+#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3,           \

+                 cnst0, cnst1, cnst2, cnst3,                  \

+                 out0, out1, out2, out3)                      \

+{                                                             \

+    DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);  \

+    DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);  \

+}

+#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)

 /* Description : Dot product & addition of byte vector elements

    Arguments   : Inputs  - mult0, mult1, cnst0, cnst1

                  Outputs - out0, out1

@@ -1307,6 +1346,18 @@

 {                                                     \

     ADD2(in0, in1, in2, in3, out0, out1);             \

     ADD2(in4, in5, in6, in7, out2, out3);             \

+}

+/* Description : Subtraction of 2 pairs of vectors

+   Arguments   : Inputs  - in0, in1, in2, in3

+                 Outputs - out0, out1

+   Details     : 'in1' is subtracted element-wise from 'in0' into 'out0';

+                 likewise 'in3' is subtracted from 'in2' into 'out1'.

+*/

+#define SUB2(in0, in1, in2, in3, out0, out1)  \

+{                                             \

+    out0 = in0 - in1;                         \

+    out1 = in2 - in3;                         \

 }

 /* Description : Sign extend halfword elements from right half of the vector

--- a/vp8/common/rtcd_defs.pl

+++ b/vp8/common/rtcd_defs.pl

@@ -286,10 +286,10 @@

 # Quantizer

 add_proto qw/void vp8_regular_quantize_b/, "struct block *, struct blockd *";

-specialize qw/vp8_regular_quantize_b sse2 sse4_1/;

+specialize qw/vp8_regular_quantize_b sse2 sse4_1 msa/;

 add_proto qw/void vp8_fast_quantize_b/, "struct block *, struct blockd *";

-specialize qw/vp8_fast_quantize_b sse2 ssse3 neon/;

+specialize qw/vp8_fast_quantize_b sse2 ssse3 neon msa/;

 # Block subtraction

--- /dev/null

+++ b/vp8/encoder/mips/msa/quantize_msa.c

@@ -1,0 +1,246 @@

+/*

+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#include "./vp8_rtcd.h"

+#include "vp8/common/mips/msa/vp8_macros_msa.h"

+#include "vp8/encoder/block.h"

+static int8_t fast_quantize_b_msa(int16_t *coeff_ptr, int16_t *zbin,

+                                  int16_t *round, int16_t *quant,

+                                  int16_t *de_quant, int16_t *q_coeff,

+                                  int16_t *dq_coeff)

+{   /* Fast quantizer for one block of coefficients held in two v8i16

+       vectors (16 lanes in total).

+       Steps, as far as the code here shows:

+         1. coeff_ptr/round/quant are loaded and shuffled with

+            zigzag_mask0/1.

+         2. sign_z = z >> 15 and x = |z| (add_a_h against zero).

+         3. {round, x} pairs are dot-multiplied with duplicated quant and

+            shifted right by 16, which appears to yield

+            ((x + round) * quant) >> 16 per lane -- the ILV*/PCKEV macros

+            are defined elsewhere, TODO confirm.

+         4. The sign is restored via (x ^ sign_z) - sign_z.

+         5. The result is shuffled with inv_zig_zag0/1 and stored to

+            q_coeff; that value times de_quant is stored to dq_coeff.

+       'zbin' is never read by this function.

+       Returns 1 + index of the highest non-zero lane of x1:x0 (shuffled

+       order), or 0 when every lane is zero.

+    */

+    int32_t cnt, eob;

+    v16i8 inv_zig_zag = { 0, 1, 5, 6, 2, 4, 7, 12,

+                          3, 8, 11, 13, 9, 10, 14, 15 };

+    v8i16 round0, round1;

+    v8i16 sign_z0, sign_z1;

+    v8i16 q_coeff0, q_coeff1;

+    v8i16 x0, x1, de_quant0, de_quant1;

+    v8i16 coeff0, coeff1, z0, z1;

+    v8i16 quant0, quant1, quant2, quant3;

+    v8i16 zero = { 0 };

+    v8i16 inv_zig_zag0, inv_zig_zag1;

+    v8i16 zigzag_mask0 = { 0, 1, 4, 8, 5, 2, 3, 6 };

+    v8i16 zigzag_mask1 = { 9, 12, 13, 10, 7, 11, 14, 15 };

+    v8i16 temp0_h, temp1_h, temp2_h, temp3_h;

+    v4i32 temp0_w, temp1_w, temp2_w, temp3_w;

+    ILVRL_B2_SH(zero, inv_zig_zag, inv_zig_zag0, inv_zig_zag1);

+    eob = -1;  /* remains -1 when no lane is non-zero */

+    LD_SH2(coeff_ptr, 8, coeff0, coeff1);

+    VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1,

+               z0, z1);

+    LD_SH2(round, 8, coeff0, coeff1);

+    VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1,

+               round0, round1);

+    LD_SH2(quant, 8, coeff0, coeff1);

+    VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1,

+               quant0, quant2);

+    sign_z0 = z0 >> 15;  /* sign mask per lane (used below) */

+    sign_z1 = z1 >> 15;

+    x0 = __msa_add_a_h(z0, zero);  /* |z0| + |0| */

+    x1 = __msa_add_a_h(z1, zero);

+    ILVL_H2_SH(quant0, quant0, quant2, quant2, quant1, quant3);

+    ILVR_H2_SH(quant0, quant0, quant2, quant2, quant0, quant2);

+    ILVL_H2_SH(round0, x0, round1, x1, temp1_h, temp3_h);

+    ILVR_H2_SH(round0, x0, round1, x1, temp0_h, temp2_h);

+    DOTP_SH4_SW(temp0_h, temp1_h, temp2_h, temp3_h, quant0, quant1, quant2,

+                quant3, temp0_w, temp1_w, temp2_w, temp3_w);

+    SRA_4V(temp0_w, temp1_w, temp2_w, temp3_w, 16);

+    PCKEV_H2_SH(temp1_w, temp0_w, temp3_w, temp2_w, x0, x1);

+    x0 = x0 ^ sign_z0;  /* XOR, then SUB2 below, re-applies the sign */

+    x1 = x1 ^ sign_z1;

+    SUB2(x0, sign_z0, x1, sign_z1, x0, x1);

+    VSHF_H2_SH(x0, x1, x0, x1, inv_zig_zag0, inv_zig_zag1, q_coeff0, q_coeff1);

+    ST_SH2(q_coeff0, q_coeff1, q_coeff, 8);

+    LD_SH2(de_quant, 8, de_quant0, de_quant1);

+    q_coeff0 *= de_quant0;  /* dequantized value */

+    q_coeff1 *= de_quant1;

+    ST_SH2(q_coeff0, q_coeff1, dq_coeff, 8);

+    for (cnt = 0; cnt < 16; ++cnt)  /* lane 7 of x1 down to lane 0 of x0 */

+    {

+        if ((cnt <= 7) && (x1[7 - cnt] != 0))

+        {

+            eob = (15 - cnt);

+            break;

+        }

+        if ((cnt > 7) && (x0[7 - (cnt - 8)] != 0))

+        {

+            eob = (7 - (cnt - 8));

+            break;

+        }

+    }

+    return (int8_t)(eob + 1);  /* 0 when all lanes are zero */

+}

+static int8_t exact_regular_quantize_b_msa(int16_t *zbin_boost,

+                                           int16_t *coeff_ptr,

+                                           int16_t *zbin,

+                                           int16_t *round,

+                                           int16_t *quant,

+                                           int16_t *quant_shift,

+                                           int16_t *de_quant,

+                                           int16_t zbin_oq_in,

+                                           int16_t *q_coeff,

+                                           int16_t *dq_coeff)

+{   /* Regular (zero-bin aware) quantizer for one block of coefficients

+       held in two v8i16 vectors (16 lanes in total).

+       Steps, as far as the code here shows:

+         1. coeff_ptr/round/quant/zbin are loaded and shuffled with

+            zigzag_mask0/1; sign_z = z >> 15 and x = |z|.

+         2. z_bin becomes x - zbin - zbin_oq_in per lane.

+         3. A first dot product with duplicated quant, shifted right by

+            16, appears to give y = ((x + round) * quant) >> 16; a second

+            one with duplicated quant_shift appears to give

+            ((y + x + round) * quant_shift) >> 16 -- the ILV*, PCKEV and

+            ADD2 macros are defined elsewhere, TODO confirm.

+         4. sign_x = (x ^ sign_z) - sign_z restores the sign.

+         5. The scalar loop walks the 16 lanes in order, zeroing sign_x

+            lanes whose z_bin is below the current zbin_boost entry and

+            tracking eob.

+         6. sign_x is shuffled with inv_zig_zag0/1 and stored to q_coeff;

+            that value times de_quant is stored to dq_coeff.

+       Returns eob + 1, where eob is the last lane index that passed the

+       boost test with a non-zero x (eob stays -1 if none did).

+    */

+    int32_t cnt, eob;

+    int16_t *boost_temp = zbin_boost;

+    v16i8 inv_zig_zag = { 0, 1, 5, 6, 2, 4, 7, 12,

+                          3, 8, 11, 13, 9, 10, 14, 15 };

+    v8i16 round0, round1;

+    v8i16 sign_z0, sign_z1;

+    v8i16 q_coeff0, q_coeff1;

+    v8i16 z_bin0, z_bin1, zbin_o_q;

+    v8i16 x0, x1, sign_x0, sign_x1, de_quant0, de_quant1;

+    v8i16 coeff0, coeff1, z0, z1;

+    v8i16 quant0, quant1, quant2, quant3;

+    v8i16 zero = { 0 };

+    v8i16 inv_zig_zag0, inv_zig_zag1;

+    v8i16 zigzag_mask0 = { 0, 1, 4, 8, 5, 2, 3, 6 };

+    v8i16 zigzag_mask1 = { 9, 12, 13, 10, 7, 11, 14, 15 };

+    v8i16 temp0_h, temp1_h, temp2_h, temp3_h;

+    v4i32 temp0_w, temp1_w, temp2_w, temp3_w;

+    ILVRL_B2_SH(zero, inv_zig_zag, inv_zig_zag0, inv_zig_zag1);

+    zbin_o_q = __msa_fill_h(zbin_oq_in);  /* same value in every lane */

+    eob = -1;

+    LD_SH2(coeff_ptr, 8, coeff0, coeff1);

+    VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1,

+               z0, z1);

+    LD_SH2(round, 8, coeff0, coeff1);

+    VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1,

+               round0, round1);

+    LD_SH2(quant, 8, coeff0, coeff1);

+    VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1,

+               quant0, quant2);

+    LD_SH2(zbin, 8, coeff0, coeff1);

+    VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1,

+               z_bin0, z_bin1);

+    sign_z0 = z0 >> 15;  /* sign mask per lane (used below) */

+    sign_z1 = z1 >> 15;

+    x0 = __msa_add_a_h(z0, zero);  /* |z0| + |0| */

+    x1 = __msa_add_a_h(z1, zero);

+    SUB2(x0, z_bin0, x1, z_bin1, z_bin0, z_bin1);  /* x - zbin */

+    SUB2(z_bin0, zbin_o_q, z_bin1, zbin_o_q, z_bin0, z_bin1);

+    ILVL_H2_SH(quant0, quant0, quant2, quant2, quant1, quant3);

+    ILVR_H2_SH(quant0, quant0, quant2, quant2, quant0, quant2);

+    ILVL_H2_SH(round0, x0, round1, x1, temp1_h, temp3_h);

+    ILVR_H2_SH(round0, x0, round1, x1, temp0_h, temp2_h);

+    DOTP_SH4_SW(temp0_h, temp1_h, temp2_h, temp3_h, quant0, quant1, quant2,

+                quant3, temp0_w, temp1_w, temp2_w, temp3_w);

+    SRA_4V(temp0_w, temp1_w, temp2_w, temp3_w, 16);

+    PCKEV_H2_SH(temp1_w, temp0_w, temp3_w, temp2_w, temp0_h, temp2_h);

+    LD_SH2(quant_shift, 8, coeff0, coeff1);

+    VSHF_H2_SH(coeff0, coeff1, coeff0, coeff1, zigzag_mask0, zigzag_mask1,

+               quant0, quant2);

+    ILVL_H2_SH(quant0, quant0, quant2, quant2, quant1, quant3);

+    ILVR_H2_SH(quant0, quant0, quant2, quant2, quant0, quant2);

+    ADD2(x0, round0, x1, round1, x0, x1);

+    ILVL_H2_SH(temp0_h, x0, temp2_h, x1, temp1_h, temp3_h);

+    ILVR_H2_SH(temp0_h, x0, temp2_h, x1, temp0_h, temp2_h);

+    DOTP_SH4_SW(temp0_h, temp1_h, temp2_h, temp3_h, quant0, quant1, quant2,

+                quant3, temp0_w, temp1_w, temp2_w, temp3_w);

+    SRA_4V(temp0_w, temp1_w, temp2_w, temp3_w, 16);

+    PCKEV_H2_SH(temp1_w, temp0_w, temp3_w, temp2_w, x0, x1);

+    sign_x0 = x0 ^ sign_z0;  /* XOR, then SUB2 below, re-applies the sign */

+    sign_x1 = x1 ^ sign_z1;

+    SUB2(sign_x0, sign_z0, sign_x1, sign_z1, sign_x0, sign_x1);

+    for (cnt = 0; cnt < 16; ++cnt)

+    {

+        if (cnt <= 7)  /* lanes 0..7 live in the *0 vectors */

+        {

+            if (boost_temp[0] <= z_bin0[cnt])  /* x - zbin - oq >= boost */

+            {

+                if (x0[cnt])

+                {

+                    eob = cnt;  /* last non-zero lane so far */

+                    boost_temp = zbin_boost;  /* rewind boost table */

+                }

+                else

+                {

+                    boost_temp++;  /* zero output: next boost entry */

+                }

+            }

+            else

+            {

+                sign_x0[cnt] = 0;  /* failed the test: force to zero */

+                boost_temp++;

+            }

+        }

+        else  /* lanes 8..15 live in the *1 vectors */

+        {

+            if (boost_temp[0] <= z_bin1[cnt - 8])

+            {

+                if (x1[cnt - 8])

+                {

+                    eob = cnt;

+                    boost_temp = zbin_boost;

+                }

+                else

+                {

+                    boost_temp++;

+                }

+            }

+            else

+            {

+                sign_x1[cnt - 8] = 0;

+                boost_temp++;

+            }

+        }

+    }

+    VSHF_H2_SH(sign_x0, sign_x1, sign_x0, sign_x1, inv_zig_zag0, inv_zig_zag1,

+               q_coeff0, q_coeff1);

+    ST_SH2(q_coeff0, q_coeff1, q_coeff, 8);

+    LD_SH2(de_quant, 8, de_quant0, de_quant1);

+    MUL2(de_quant0, q_coeff0, de_quant1, q_coeff1, de_quant0, de_quant1);

+    ST_SH2(de_quant0, de_quant1, dq_coeff, 8);

+    return (int8_t)(eob + 1);  /* 0 when no lane was kept */

+}

+void vp8_fast_quantize_b_msa(BLOCK *b, BLOCKD *d)

+{   /* Entry point taking BLOCK/BLOCKD: reads coeff, zbin, round and

+       quant_fast from 'b' and qcoeff, dqcoeff and dequant from 'd', runs

+       fast_quantize_b_msa() on them and stores its return value in

+       *d->eob.

+    */

+    int16_t *coeff_ptr = b->coeff;

+    int16_t *zbin_ptr = b->zbin;

+    int16_t *round_ptr = b->round;

+    int16_t *quant_ptr = b->quant_fast;

+    int16_t *qcoeff_ptr = d->qcoeff;

+    int16_t *dqcoeff_ptr = d->dqcoeff;

+    int16_t *dequant_ptr = d->dequant;

+    *d->eob = fast_quantize_b_msa(coeff_ptr, zbin_ptr, round_ptr, quant_ptr,

+                                  dequant_ptr, qcoeff_ptr, dqcoeff_ptr);

+}

+void vp8_regular_quantize_b_msa(BLOCK *b, BLOCKD *d)

+{   /* Entry point taking BLOCK/BLOCKD: reads zrun_zbin_boost, coeff,

+       zbin, round, quant, quant_shift and zbin_extra from 'b' and qcoeff,

+       dqcoeff and dequant from 'd', runs exact_regular_quantize_b_msa()

+       on them and stores its return value in *d->eob.

+    */

+    int16_t *zbin_boost_ptr = b->zrun_zbin_boost;

+    int16_t *coeff_ptr = b->coeff;

+    int16_t *zbin_ptr = b->zbin;

+    int16_t *round_ptr = b->round;

+    int16_t *quant_ptr = b->quant;

+    int16_t *quant_shift_ptr = b->quant_shift;

+    int16_t *qcoeff_ptr = d->qcoeff;

+    int16_t *dqcoeff_ptr = d->dqcoeff;

+    int16_t *dequant_ptr = d->dequant;

+    int16_t zbin_oq_value = b->zbin_extra;  /* passed on as zbin_oq_in */

+    *d->eob = exact_regular_quantize_b_msa(zbin_boost_ptr, coeff_ptr,

+                                           zbin_ptr, round_ptr,

+                                           quant_ptr, quant_shift_ptr,

+                                           dequant_ptr, zbin_oq_value,

+                                           qcoeff_ptr, dqcoeff_ptr);

+}

--- a/vp8/vp8cx.mk

+++ b/vp8/vp8cx.mk

@@ -104,5 +104,6 @@

 endif

 VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/dct_msa.c

+VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/quantize_msa.c

 VP8_CX_SRCS-yes := $(filter-out $(VP8_CX_SRCS_REMOVE-yes),$(VP8_CX_SRCS-yes))