shithub: libvpx

--- /dev/null

+++ b/vp8/common/mips/msa/mfqe_msa.c

@@ -1,0 +1,146 @@

+/*

+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.

+ *

+ *  Use of this source code is governed by a BSD-style license

+ *  that can be found in the LICENSE file in the root of the source

+ *  tree. An additional intellectual property rights grant can be found

+ *  in the file PATENTS.  All contributing project authors may

+ *  be found in the AUTHORS file in the root of the source tree.

+ */

+#include "./vp8_rtcd.h"

+#include "vp8/common/postproc.h"

+#include "vp8/common/mips/msa/vp8_macros_msa.h"

+static void filter_by_weight8x8_msa(uint8_t *src_ptr, int32_t src_stride,

+                                    uint8_t *dst_ptr, int32_t dst_stride,

+                                    int32_t src_weight)

+{  /* Blends an 8x8 byte block of src into dst, in place.  Per byte the code

+    * computes src * src_weight + dst * dst_weight in 16-bit lanes, applies

+    * SRARI_H2_SH with MFQE_PRECISION (presumably the MSA rounding arithmetic

+    * right shift - confirm in vp8_macros_msa.h) and keeps the low byte of

+    * each lane.  Both strides are added to uint8_t pointers, i.e. bytes. */

+    int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight;  /* the two weights sum to 1 << MFQE_PRECISION */

+    int32_t row;

+    uint64_t src0_d, src1_d, dst0_d, dst1_d;

+    v16i8 src0 = { 0 };  /* zero-initialised: INSERT_D2_SB reads its output vector before writing it */

+    v16i8 src1 = { 0 };

+    v16i8 dst0 = { 0 };

+    v16i8 dst1 = { 0 };

+    v8i16 src_wt, dst_wt, res_h_r, res_h_l, src_r, src_l, dst_r, dst_l;

+    src_wt = __msa_fill_h(src_weight);  /* replicate the weight into every halfword lane */

+    dst_wt = __msa_fill_h(dst_weight);

+    for (row = 2; row--;)  /* 2 iterations x 4 rows each = 8 rows */

+    {

+        LD2(src_ptr, src_stride, src0_d, src1_d);  /* presumably one 64-bit load per row, rows 0-1 - confirm LD2 */

+        src_ptr += (2 * src_stride);

+        LD2(dst_ptr, dst_stride, dst0_d, dst1_d);

+        INSERT_D2_SB(src0_d, src1_d, src0);  /* pack two 8-byte rows into one 16-byte vector */

+        INSERT_D2_SB(dst0_d, dst1_d, dst0);

+        LD2(src_ptr, src_stride, src0_d, src1_d);  /* rows 2-3 of this group */

+        src_ptr += (2 * src_stride);

+        LD2((dst_ptr + 2 * dst_stride), dst_stride, dst0_d, dst1_d);  /* dst_ptr has not advanced yet, hence the explicit offset */

+        INSERT_D2_SB(src0_d, src1_d, src1);

+        INSERT_D2_SB(dst0_d, dst1_d, dst1);

+        UNPCK_UB_SH(src0, src_r, src_l);  /* zero-extend bytes to halfwords: right half, left half */

+        UNPCK_UB_SH(dst0, dst_r, dst_l);

+        res_h_r = (src_r * src_wt);

+        res_h_r += (dst_r * dst_wt);

+        res_h_l = (src_l * src_wt);

+        res_h_l += (dst_l * dst_wt);

+        SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);

+        dst0 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r);  /* keep the even (low) byte of each halfword */

+        ST8x2_UB(dst0, dst_ptr, dst_stride);  /* doubleword 0 -> dst_ptr, doubleword 1 -> dst_ptr + dst_stride */

+        dst_ptr += (2 * dst_stride);

+        UNPCK_UB_SH(src1, src_r, src_l);  /* same blend for rows 2-3 */

+        UNPCK_UB_SH(dst1, dst_r, dst_l);

+        res_h_r = (src_r * src_wt);

+        res_h_r += (dst_r * dst_wt);

+        res_h_l = (src_l * src_wt);

+        res_h_l += (dst_l * dst_wt);

+        SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);

+        dst1 = (v16i8)__msa_pckev_b((v16i8)res_h_l, (v16i8)res_h_r);

+        ST8x2_UB(dst1, dst_ptr, dst_stride);

+        dst_ptr += (2 * dst_stride);

+    }

+}

+static void filter_by_weight16x16_msa(uint8_t *src_ptr, int32_t src_stride,

+                                      uint8_t *dst_ptr, int32_t dst_stride,

+                                      int32_t src_weight)

+{  /* Blends a 16x16 byte block of src into dst, in place.  Per byte the code

+    * computes src * src_weight + dst * dst_weight in 16-bit lanes and applies

+    * SRARI_H2_SH with MFQE_PRECISION (presumably the MSA rounding arithmetic

+    * right shift - confirm in vp8_macros_msa.h) before PCKEV_ST_SB writes the

+    * row back.  Both strides are added to uint8_t pointers, i.e. bytes. */

+    int32_t dst_weight = (1 << MFQE_PRECISION) - src_weight;  /* the two weights sum to 1 << MFQE_PRECISION */

+    int32_t row;

+    v16i8 src0, src1, src2, src3;

+    v16i8 dst0, dst1, dst2, dst3;

+    v8i16 src_wt, dst_wt;

+    v8i16 res_h_r, res_h_l;

+    v8i16 src_r, src_l, dst_r, dst_l;

+    src_wt = __msa_fill_h(src_weight);  /* replicate the weight into every halfword lane */

+    dst_wt = __msa_fill_h(dst_weight);

+    for (row = 4; row--;)  /* 4 iterations x 4 rows each = 16 rows */

+    {

+        LD_SB4(src_ptr, src_stride, src0, src1, src2, src3);  /* presumably one 16-byte vector per row - confirm LD_SB4 */

+        src_ptr += (4 * src_stride);

+        LD_SB4(dst_ptr, dst_stride, dst0, dst1, dst2, dst3);  /* all four dst rows are read before any is overwritten */

+        UNPCK_UB_SH(src0, src_r, src_l);  /* row 0: zero-extend bytes to halfwords (right half, left half) */

+        UNPCK_UB_SH(dst0, dst_r, dst_l);

+        res_h_r = (src_r * src_wt);

+        res_h_r += (dst_r * dst_wt);

+        res_h_l = (src_l * src_wt);

+        res_h_l += (dst_l * dst_wt);

+        SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);

+        PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);  /* presumably packs the even bytes of both halves and stores 16 bytes - confirm */

+        dst_ptr += dst_stride;

+        UNPCK_UB_SH(src1, src_r, src_l);  /* row 1 */

+        UNPCK_UB_SH(dst1, dst_r, dst_l);

+        res_h_r = (src_r * src_wt);

+        res_h_r += (dst_r * dst_wt);

+        res_h_l = (src_l * src_wt);

+        res_h_l += (dst_l * dst_wt);

+        SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);

+        PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);

+        dst_ptr += dst_stride;

+        UNPCK_UB_SH(src2, src_r, src_l);  /* row 2 */

+        UNPCK_UB_SH(dst2, dst_r, dst_l);

+        res_h_r = (src_r * src_wt);

+        res_h_r += (dst_r * dst_wt);

+        res_h_l = (src_l * src_wt);

+        res_h_l += (dst_l * dst_wt);

+        SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);

+        PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);

+        dst_ptr += dst_stride;

+        UNPCK_UB_SH(src3, src_r, src_l);  /* row 3 */

+        UNPCK_UB_SH(dst3, dst_r, dst_l);

+        res_h_r = (src_r * src_wt);

+        res_h_r += (dst_r * dst_wt);

+        res_h_l = (src_l * src_wt);

+        res_h_l += (dst_l * dst_wt);

+        SRARI_H2_SH(res_h_r, res_h_l, MFQE_PRECISION);

+        PCKEV_ST_SB(res_h_r, res_h_l, dst_ptr);

+        dst_ptr += dst_stride;

+    }

+}

+void vp8_filter_by_weight16x16_msa(uint8_t *src_ptr, int32_t src_stride,

+                                   uint8_t *dst_ptr, int32_t dst_stride,

+                                   int32_t src_weight)

+{  /* Externally visible MSA variant of vp8_filter_by_weight16x16 (listed as

+    * the "msa" specialization in rtcd_defs.pl); forwards every argument

+    * unchanged to the file-local implementation. */

+    filter_by_weight16x16_msa(src_ptr, src_stride, dst_ptr, dst_stride,

+                              src_weight);

+}

+void vp8_filter_by_weight8x8_msa(uint8_t *src_ptr, int32_t src_stride,

+                                 uint8_t *dst_ptr, int32_t dst_stride,

+                                 int32_t src_weight)

+{  /* Externally visible MSA variant of vp8_filter_by_weight8x8 (listed as

+    * the "msa" specialization in rtcd_defs.pl); forwards every argument

+    * unchanged to the file-local implementation. */

+    filter_by_weight8x8_msa(src_ptr, src_stride, dst_ptr, dst_stride,

+                            src_weight);

+}

--- a/vp8/common/mips/msa/vp8_macros_msa.h

+++ b/vp8/common/mips/msa/vp8_macros_msa.h

@@ -435,6 +435,25 @@

     ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride);  \

+/* Description : Store 8x2 byte block to destination memory from input vector

+   Arguments   : Inputs - in, pdst, stride

+   Details     : Index 0 double word element from 'in' vector is copied to the

+                 GP register and stored to (pdst)

+                 Index 1 double word element from 'in' vector is copied to the

+                 GP register and stored to (pdst + stride)

+*/

+#define ST8x2_UB(in, pdst, stride)            \

+{                                             \

+    uint64_t out0_m, out1_m;                  \

+    uint8_t *pblk_8x2_m = (uint8_t *)(pdst);  /* byte pointer, so 'stride' is applied in bytes */ \

+                                              \

+    out0_m = __msa_copy_u_d((v2i64)in, 0);    /* doubleword element 0 -> first row */ \

+    out1_m = __msa_copy_u_d((v2i64)in, 1);    /* doubleword element 1 -> second row */ \

+                                              \

+    SD(out0_m, pblk_8x2_m);                   \

+    SD(out1_m, pblk_8x2_m + stride);          /* NOTE: 'in' and 'stride' expand unparenthesized; pass simple expressions */ \

+}

 /* Description : Store 8x4 byte block to destination memory from input

                  vectors

    Arguments   : Inputs - in0, in1, pdst, stride

@@ -623,6 +642,19 @@

     out_m;                                              \

})

+/* Description : Insert two GPR values into the doubleword elements of a vector

+   Arguments   : Inputs - in0, in1

+                 Output - out

+                 Return Type - as per RTYPE

+   Details     : Element 0 of vector 'out' is set to the value in 'in0'

+                 Element 1 of vector 'out' is set to the value in 'in1'

+*/

+#define INSERT_D2(RTYPE, in0, in1, out)               \

+{                                                     /* 'out' is read as well as written, so it must already hold a value */ \

+    out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0);  /* doubleword element 0 <- in0 */ \

+    out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1);  /* doubleword element 1 <- in1 */ \

+}

+#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)  /* INSERT_D2 with RTYPE fixed to v16i8 */

 /* Description : Interleave even byte elements from vectors

    Arguments   : Inputs  - in0, in1, in2, in3

                  Outputs - out0, out1

@@ -1114,6 +1146,20 @@

 {                                                     \

     ADD2(in0, in1, in2, in3, out0, out1);             \

     ADD2(in4, in5, in6, in7, out2, out3);             \

+}

+/* Description : Zero extend unsigned byte elements to halfword elements

+   Arguments   : Input   - in          (unsigned byte vector)

+                 Outputs - out0, out1  (unsigned halfword vectors)

+                 Return Type - signed halfword

+   Details     : Zero extended right half of vector is returned in 'out0'

+                 Zero extended left half of vector is returned in 'out1'

+*/

+#define UNPCK_UB_SH(in, out0, out1)       \

+{                                         \

+    v16i8 zero_m = { 0 };                 \

+                                          \

+    ILVRL_B2_SH(zero_m, in, out0, out1);  \

 /* Description : Sign extend halfword elements from input vector and return

--- a/vp8/common/rtcd_defs.pl

+++ b/vp8/common/rtcd_defs.pl

@@ -191,10 +191,10 @@

     # no asm yet

     add_proto qw/void vp8_filter_by_weight16x16/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight";

-    specialize qw/vp8_filter_by_weight16x16 sse2/;

+    specialize qw/vp8_filter_by_weight16x16 sse2 msa/;

     add_proto qw/void vp8_filter_by_weight8x8/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight";

-    specialize qw/vp8_filter_by_weight8x8 sse2/;

+    specialize qw/vp8_filter_by_weight8x8 sse2 msa/;

     add_proto qw/void vp8_filter_by_weight4x4/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight";

     # no asm yet

--- a/vp8/vp8_common.mk

+++ b/vp8/vp8_common.mk

@@ -122,6 +122,10 @@

 VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/sixtap_filter_msa.c

 VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp8_macros_msa.h

+ifeq ($(CONFIG_POSTPROC),yes)

+VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/mfqe_msa.c

+endif

 # common (c)

 VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/filter_arm.c

 VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/loopfilter_arm.c