shithub: libvpx

--- a/vp8/common/mips/msa/vp8_macros_msa.h

+++ b/vp8/common/mips/msa/vp8_macros_msa.h

@@ -629,6 +629,31 @@

 #define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)

+/* Description : Dot product & addition of halfword vector elements

+   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1

+                 Outputs - out0, out1

+                 Return Type - as per RTYPE

+   Details     : Signed halfword elements from 'mult0' are multiplied with

+                 signed halfword elements from 'cnst0' producing a result

+                 twice the size of input i.e. signed word.

+                 The multiplication results of adjacent odd-even elements

+                 are added to 'out0'. Same for 'mult1', 'cnst1' and 'out1'

+*/

+#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)             \

+{                                                                            \

+    out0 = (RTYPE)__msa_dpadd_s_w((v4i32)out0, (v8i16)mult0, (v8i16)cnst0);  \

+    out1 = (RTYPE)__msa_dpadd_s_w((v4i32)out1, (v8i16)mult1, (v8i16)cnst1);  \

+}

+#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)

+#define DPADD_SH4(RTYPE, mult0, mult1, mult2, mult3,                   \

+                  cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)  \

+{                                                                      \

+    DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);          \

+    DPADD_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);          \

+}

+#define DPADD_SH4_SW(...) DPADD_SH4(v4i32, __VA_ARGS__)

 /* Description : Clips all signed halfword elements of input vector

                  between 0 & 255

    Arguments   : Input  - in

@@ -783,6 +808,7 @@

     out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2);  \

 #define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)

+#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)

 /* Description : Interleave even word elements from vectors

    Arguments   : Inputs  - in0, in1, in2, in3

@@ -1035,6 +1061,24 @@

 #define SPLATI_H3_SB(...) SPLATI_H3(v16i8, __VA_ARGS__)

 #define SPLATI_H3_SH(...) SPLATI_H3(v8i16, __VA_ARGS__)

+/* Description : Indexed word element values are replicated to all

+                 elements in output vector

+   Arguments   : Inputs  - in, stidx

+                 Outputs - out0, out1

+                 Return Type - as per RTYPE

+   Details     : 'stidx' element value from 'in' vector is replicated to all

+                 elements in 'out0' vector

+                 'stidx + 1' element value from 'in' vector is replicated to all

+                 elements in 'out1' vector

+                 Valid index range for word operation is 0-3

+*/

+#define SPLATI_W2(RTYPE, in, stidx, out0, out1)          \

+{                                                        \

+    out0 = (RTYPE)__msa_splati_w((v4i32)in, stidx);      \

+    out1 = (RTYPE)__msa_splati_w((v4i32)in, (stidx+1));  \

+}

+#define SPLATI_W2_SW(...) SPLATI_W2(v4i32, __VA_ARGS__)

 /* Description : Pack even byte elements of vector pairs

    Arguments   : Inputs  - in0, in1, in2, in3

                  Outputs - out0, out1

@@ -1160,6 +1204,21 @@

 #define XORI_B8_128_SB(...) XORI_B8_128(v16i8, __VA_ARGS__)

+/* Description : Shift left all elements of vector (generic for all data types)

+   Arguments   : Inputs  - in0, in1, in2, in3, shift

+                 Outputs - in place operation

+                 Return Type - as per input vector RTYPE

+   Details     : Each element of vectors 'in0', 'in1', 'in2' and 'in3' is left

+                 shifted by 'shift' and the result is written in-place.

+*/

+#define SLLI_4V(in0, in1, in2, in3, shift)  \

+{                                           \

+    in0 = in0 << shift;                     \

+    in1 = in1 << shift;                     \

+    in2 = in2 << shift;                     \

+    in3 = in3 << shift;                     \

+}

 /* Description : Arithmetic shift right all elements of vector

                  (generic for all data types)

    Arguments   : Inputs  - in0, in1, in2, in3, shift

@@ -1250,6 +1309,22 @@

     ADD2(in4, in5, in6, in7, out2, out3);             \

+/* Description : Sign extend halfword elements from right half of the vector

+   Arguments   : Input  - in    (halfword vector)

+                 Output - out   (sign extended word vector)

+                 Return Type - signed word

+   Details     : Sign bit of halfword elements from input vector 'in' is

+                 extracted and interleaved with the same vector 'in' to generate

+                 4 word elements keeping sign intact

+*/

+#define UNPCK_R_SH_SW(in, out)                     \

+{                                                  \

+    v8i16 sign_m;                                  \

+                                                   \

+    sign_m = __msa_clti_s_h((v8i16)in, 0);         \

+    out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in);  \

+}

 /* Description : Zero extend unsigned byte elements to halfword elements

    Arguments   : Input   - in          (unsigned byte vector)

                  Outputs - out0, out1  (unsigned  halfword vectors)

@@ -1399,6 +1474,21 @@

     tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m);             \

     out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);               \

     out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);               \

+}

+/* Description : Transpose 4x4 block with half word elements in vectors

+   Arguments   : Inputs  - in0, in1, in2, in3

+                 Outputs - out0, out1, out2, out3

+                 Return Type - signed halfword

+*/

+#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3)  \

+{                                                                       \

+    v8i16 s0_m, s1_m;                                                   \

+                                                                        \

+    ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m);                         \

+    ILVRL_W2_SH(s1_m, s0_m, out0, out2);                                \

+    out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0);               \

+    out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2);               \

 /* Description : Transpose 8x4 block with half word elements in vectors

--- a/vp8/common/rtcd_defs.pl

+++ b/vp8/common/rtcd_defs.pl

@@ -271,15 +271,15 @@

 # Forward DCT

 add_proto qw/void vp8_short_fdct4x4/, "short *input, short *output, int pitch";

-specialize qw/vp8_short_fdct4x4 mmx sse2 media neon/;

+specialize qw/vp8_short_fdct4x4 mmx sse2 media neon msa/;

 $vp8_short_fdct4x4_media=vp8_short_fdct4x4_armv6;

 add_proto qw/void vp8_short_fdct8x4/, "short *input, short *output, int pitch";

-specialize qw/vp8_short_fdct8x4 mmx sse2 media neon/;

+specialize qw/vp8_short_fdct8x4 mmx sse2 media neon msa/;

 $vp8_short_fdct8x4_media=vp8_short_fdct8x4_armv6;

 add_proto qw/void vp8_short_walsh4x4/, "short *input, short *output, int pitch";

-specialize qw/vp8_short_walsh4x4 sse2 media neon/;

+specialize qw/vp8_short_walsh4x4 sse2 media neon msa/;

 $vp8_short_walsh4x4_media=vp8_short_walsh4x4_armv6;

--- /dev/null

+++ b/vp8/encoder/mips/msa/dct_msa.c

@@ -1,0 +1,199 @@

+/*

+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#include "./vp8_rtcd.h"

+#include "vp8/common/mips/msa/vp8_macros_msa.h"

+#define TRANSPOSE4x4_H(in0, in1, in2, in3, out0, out1, out2, out3)  \

+{                                                                   \

+    v8i16 s0_m, s1_m, tp0_m, tp1_m, tp2_m, tp3_m;                   \

+                                                                    \

+    ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                     \

+    ILVRL_H2_SH(s1_m, s0_m, tp0_m, tp1_m);                          \

+    ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                     \

+    ILVRL_H2_SH(s1_m, s0_m, tp2_m, tp3_m);                          \

+    PCKEV_D2_SH(tp2_m, tp0_m, tp3_m, tp1_m, out0, out2);            \

+    PCKOD_D2_SH(tp2_m, tp0_m, tp3_m, tp1_m, out1, out3);            \

+}

+#define SET_DOTP_VALUES(coeff, val0, val1, val2, const1, const2)    \

+{                                                                   \

+    v8i16 tmp0_m;                                                   \

+                                                                    \

+    SPLATI_H3_SH(coeff, val0, val1, val2, tmp0_m, const1, const2);  \

+    ILVEV_H2_SH(tmp0_m, const1, const2, tmp0_m, const1, const2);    \

+}

+#define RET_1_IF_NZERO_H(in0)       \

+({                                  \

+    v8i16 tmp0_m;                   \

+    v8i16 one_m = __msa_ldi_h(1);   \

+                                    \

+    tmp0_m = __msa_ceqi_h(in0, 0);  \

+    tmp0_m = tmp0_m ^ 255;          \

+    tmp0_m = one_m & tmp0_m;        \

+                                    \

+    tmp0_m;                         \

+})

+#define RET_1_IF_NZERO_W(in0)       \

+({                                  \

+    v4i32 tmp0_m;                   \

+    v4i32 one_m = __msa_ldi_w(1);   \

+                                    \

+    tmp0_m = __msa_ceqi_w(in0, 0);  \

+    tmp0_m = tmp0_m ^ 255;          \

+    tmp0_m = one_m & tmp0_m;        \

+                                    \

+    tmp0_m;                         \

+})

+#define RET_1_IF_NEG_W(in0)           \

+({                                    \

+    v4i32 tmp0_m;                     \

+                                      \

+    v4i32 one_m = __msa_ldi_w(1);     \

+    tmp0_m = __msa_clti_s_w(in0, 0);  \

+    tmp0_m = one_m & tmp0_m;          \

+                                      \

+    tmp0_m;                           \

+})

+void vp8_short_fdct4x4_msa(int16_t *input, int16_t *output, int32_t pitch)

+{

+    v8i16 in0, in1, in2, in3;

+    v8i16 temp0, temp1;

+    v8i16 const0, const1;

+    v8i16 coeff = { 2217, 5352, -5352, 14500, 7500, 12000, 25000, 26000 };

+    v4i32 out0, out1, out2, out3;

+    v8i16 zero = { 0 };

+    LD_SH4(input, pitch / 2, in0, in1, in2, in3);

+    TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);

+    BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3);

+    SLLI_4V(temp0, temp1, in1, in3, 3);

+    in0 = temp0 + temp1;

+    in2 = temp0 - temp1;

+    SET_DOTP_VALUES(coeff, 0, 1, 2, const0, const1);

+    temp0 = __msa_ilvr_h(in3, in1);

+    in1 = __msa_splati_h(coeff, 3);

+    out0 = (v4i32)__msa_ilvev_h(zero, in1);

+    coeff = __msa_ilvl_h(zero, coeff);

+    out1 = __msa_splati_w((v4i32)coeff, 0);

+    DPADD_SH2_SW(temp0, temp0, const0, const1, out0, out1);

+    out0 >>= 12;

+    out1 >>= 12;

+    PCKEV_H2_SH(out0, out0, out1, out1, in1, in3);

+    TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, in0, in1, in2, in3);

+    BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3);

+    in0 = temp0 + temp1 + 7;

+    in2 = temp0 - temp1 + 7;

+    in0 >>= 4;

+    in2 >>= 4;

+    ILVR_H2_SW(zero, in0, zero, in2, out0, out2);

+    temp1 = RET_1_IF_NZERO_H(in3);

+    ILVR_H2_SH(zero, temp1, in3, in1, temp1, temp0);

+    SPLATI_W2_SW(coeff, 2, out3, out1);

+    out3 += out1;

+    out1 = __msa_splati_w((v4i32)coeff, 1);

+    DPADD_SH2_SW(temp0, temp0, const0, const1, out1, out3);

+    out1 >>= 16;

+    out3 >>= 16;

+    out1 += (v4i32)temp1;

+    PCKEV_H2_SH(out1, out0, out3, out2, in0, in2);

+    ST_SH2(in0, in2, output, 8);

+}

+void vp8_short_fdct8x4_msa(int16_t *input, int16_t *output, int32_t pitch)

+{

+    v8i16 in0, in1, in2, in3;

+    v8i16 temp0, temp1, tmp0, tmp1;

+    v8i16 const0, const1, const2;

+    v8i16 coeff = { 2217, 5352, -5352, 14500, 7500, 12000, 25000, 26000 };

+    v8i16 zero = { 0 };

+    v4i32 vec0_w, vec1_w, vec2_w, vec3_w;

+    LD_SH4(input, pitch / 2, in0, in1, in2, in3);

+    TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);

+    BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3);

+    SLLI_4V(temp0, temp1, in1, in3, 3);

+    in0 = temp0 + temp1;

+    in2 = temp0 - temp1;

+    SET_DOTP_VALUES(coeff, 0, 1, 2, const1, const2);

+    temp0 = __msa_splati_h(coeff, 3);

+    vec1_w = (v4i32)__msa_ilvev_h(zero, temp0);

+    coeff = __msa_ilvl_h(zero, coeff);

+    vec3_w = __msa_splati_w((v4i32)coeff, 0);

+    ILVRL_H2_SH(in3, in1, tmp1, tmp0);

+    vec0_w = vec1_w;

+    vec2_w = vec3_w;

+    DPADD_SH4_SW(tmp1, tmp0, tmp1, tmp0, const1, const1, const2, const2,

+                 vec0_w, vec1_w, vec2_w, vec3_w);

+    SRA_4V(vec1_w, vec0_w, vec3_w, vec2_w, 12);

+    PCKEV_H2_SH(vec1_w, vec0_w, vec3_w, vec2_w, in1, in3);

+    TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);

+    BUTTERFLY_4(in0, in1, in2, in3, temp0, temp1, in1, in3);

+    in0 = temp0 + temp1 + 7;

+    in2 = temp0 - temp1 + 7;

+    in0 >>= 4;

+    in2 >>= 4;

+    SPLATI_W2_SW(coeff, 2, vec3_w, vec1_w);

+    vec3_w += vec1_w;

+    vec1_w = __msa_splati_w((v4i32)coeff, 1);

+    const0 = RET_1_IF_NZERO_H(in3);

+    ILVRL_H2_SH(in3, in1, tmp1, tmp0);

+    vec0_w = vec1_w;

+    vec2_w = vec3_w;

+    DPADD_SH4_SW(tmp1, tmp0, tmp1, tmp0, const1, const1, const2, const2,

+                 vec0_w, vec1_w, vec2_w, vec3_w);

+    SRA_4V(vec1_w, vec0_w, vec3_w, vec2_w, 16);

+    PCKEV_H2_SH(vec1_w, vec0_w, vec3_w, vec2_w, in1, in3);

+    in1 += const0;

+    PCKEV_D2_SH(in1, in0, in3, in2, temp0, temp1);

+    ST_SH2(temp0, temp1, output, 8);

+    PCKOD_D2_SH(in1, in0, in3, in2, in0, in2);

+    ST_SH2(in0, in2, output + 16, 8);

+}

+void vp8_short_walsh4x4_msa(int16_t *input, int16_t *output, int32_t pitch)

+{

+    v8i16 in0_h, in1_h, in2_h, in3_h;

+    v4i32 in0_w, in1_w, in2_w, in3_w, temp0, temp1, temp2, temp3;

+    LD_SH4(input, pitch / 2, in0_h, in1_h, in2_h, in3_h);

+    TRANSPOSE4x4_SH_SH(in0_h, in1_h, in2_h, in3_h, in0_h, in1_h, in2_h, in3_h);

+    UNPCK_R_SH_SW(in0_h, in0_w);

+    UNPCK_R_SH_SW(in1_h, in1_w);

+    UNPCK_R_SH_SW(in2_h, in2_w);

+    UNPCK_R_SH_SW(in3_h, in3_w);

+    BUTTERFLY_4(in0_w, in1_w, in3_w, in2_w, temp0, temp3, temp2, temp1);

+    SLLI_4V(temp0, temp1, temp2, temp3, 2);

+    BUTTERFLY_4(temp0, temp1, temp2, temp3, in0_w, in1_w, in2_w, in3_w);

+    temp0 = RET_1_IF_NZERO_W(temp0);

+    in0_w += temp0;

+    TRANSPOSE4x4_SW_SW(in0_w, in1_w, in2_w, in3_w, in0_w, in1_w, in2_w, in3_w);

+    BUTTERFLY_4(in0_w, in1_w, in3_w, in2_w, temp0, temp3, temp2, temp1);

+    BUTTERFLY_4(temp0, temp1, temp2, temp3, in0_w, in1_w, in2_w, in3_w);

+    in0_w += RET_1_IF_NEG_W(in0_w);

+    in1_w += RET_1_IF_NEG_W(in1_w);

+    in2_w += RET_1_IF_NEG_W(in2_w);

+    in3_w += RET_1_IF_NEG_W(in3_w);

+    ADD4(in0_w, 3, in1_w, 3, in2_w, 3, in3_w, 3, in0_w, in1_w, in2_w, in3_w);

+    SRA_4V(in0_w, in1_w, in2_w, in3_w, 3);

+    PCKEV_H2_SH(in1_w, in0_w, in3_w, in2_w, in0_h, in1_h);

+    ST_SH2(in0_h, in1_h, output, 8);

+}

--- a/vp8/vp8cx.mk

+++ b/vp8/vp8cx.mk

@@ -103,4 +103,6 @@

 VP8_CX_SRCS_REMOVE-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm

 endif

+VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/dct_msa.c

 VP8_CX_SRCS-yes := $(filter-out $(VP8_CX_SRCS_REMOVE-yes),$(VP8_CX_SRCS-yes))