shithub: libvpx

--- a/vp9/common/mips/msa/vp9_macros_msa.h

+++ b/vp9/common/mips/msa/vp9_macros_msa.h

@@ -229,13 +229,12 @@

 #endif  // (__mips_isa_rev >= 6)

 /* Description : Load 4 words with stride

-   Arguments   : Inputs  - psrc    (source pointer to load from)

-                         - stride

+   Arguments   : Inputs  - psrc, stride

                  Outputs - out0, out1, out2, out3

-   Details     : Loads word in 'out0' from (psrc)

-                 Loads word in 'out1' from (psrc + stride)

-                 Loads word in 'out2' from (psrc + 2 * stride)

-                 Loads word in 'out3' from (psrc + 3 * stride)

+   Details     : Load word in 'out0' from (psrc)

+                 Load word in 'out1' from (psrc + stride)

+                 Load word in 'out2' from (psrc + 2 * stride)

+                 Load word in 'out3' from (psrc + 3 * stride)

*/

 #define LW4(psrc, stride, out0, out1, out2, out3) {  \

   out0 = LW((psrc));                                 \

@@ -245,11 +244,10 @@

 /* Description : Load double words with stride

-   Arguments   : Inputs  - psrc    (source pointer to load from)

-                         - stride

+   Arguments   : Inputs  - psrc, stride

                  Outputs - out0, out1

-   Details     : Loads double word in 'out0' from (psrc)

-                 Loads double word in 'out1' from (psrc + stride)

+   Details     : Load double word in 'out0' from (psrc)

+                 Load double word in 'out1' from (psrc + stride)

*/

 #define LD2(psrc, stride, out0, out1) {  \

   out0 = LD((psrc));                     \

@@ -261,11 +259,11 @@

 /* Description : Store 4 words with stride

-   Arguments   : Inputs  - in0, in1, in2, in3, pdst, stride

-   Details     : Stores word from 'in0' to (pdst)

-                 Stores word from 'in1' to (pdst + stride)

-                 Stores word from 'in2' to (pdst + 2 * stride)

-                 Stores word from 'in3' to (pdst + 3 * stride)

+   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride

+   Details     : Store word from 'in0' to (pdst)

+                 Store word from 'in1' to (pdst + stride)

+                 Store word from 'in2' to (pdst + 2 * stride)

+                 Store word from 'in3' to (pdst + 3 * stride)

*/

 #define SW4(in0, in1, in2, in3, pdst, stride) {  \

   SW(in0, (pdst))                                \

@@ -275,11 +273,11 @@

 /* Description : Store 4 double words with stride

-   Arguments   : Inputs  - in0, in1, in2, in3, pdst, stride

-   Details     : Stores double word from 'in0' to (pdst)

-                 Stores double word from 'in1' to (pdst + stride)

-                 Stores double word from 'in2' to (pdst + 2 * stride)

-                 Stores double word from 'in3' to (pdst + 3 * stride)

+   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride

+   Details     : Store double word from 'in0' to (pdst)

+                 Store double word from 'in1' to (pdst + stride)

+                 Store double word from 'in2' to (pdst + 2 * stride)

+                 Store double word from 'in3' to (pdst + 3 * stride)

*/

 #define SD4(in0, in1, in2, in3, pdst, stride) {  \

   SD(in0, (pdst))                                \

@@ -289,12 +287,11 @@

 /* Description : Load vectors with 16 byte elements with stride

-   Arguments   : Inputs  - psrc    (source pointer to load from)

-                         - stride

+   Arguments   : Inputs  - psrc, stride

                  Outputs - out0, out1

                  Return Type - as per RTYPE

-   Details     : Loads 16 byte elements in 'out0' from (psrc)

-                 Loads 16 byte elements in 'out1' from (psrc + stride)

+   Details     : Load 16 byte elements in 'out0' from (psrc)

+                 Load 16 byte elements in 'out1' from (psrc + stride)

*/

 #define LD_B2(RTYPE, psrc, stride, out0, out1) {  \

   out0 = LD_B(RTYPE, (psrc));                     \

@@ -333,11 +330,10 @@

 #define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)

 /* Description : Load vectors with 8 halfword elements with stride

-   Arguments   : Inputs  - psrc    (source pointer to load from)

-                         - stride

+   Arguments   : Inputs  - psrc, stride

                  Outputs - out0, out1

-   Details     : Loads 8 halfword elements in 'out0' from (psrc)

-                 Loads 8 halfword elements in 'out1' from (psrc + stride)

+   Details     : Load 8 halfword elements in 'out0' from (psrc)

+                 Load 8 halfword elements in 'out1' from (psrc + stride)

*/

 #define LD_H2(RTYPE, psrc, stride, out0, out1) {  \

   out0 = LD_H(RTYPE, (psrc));                     \

@@ -368,9 +364,9 @@

 #define LD_SH16(...) LD_H16(v8i16, __VA_ARGS__)

-/* Description : Load as 4x4 block of signed halfword elements from 1D source

+/* Description : Load 4x4 block of signed halfword elements from 1D source

                  data into 4 vectors (Each vector with 4 signed halfwords)

-   Arguments   : Inputs  - psrc

+   Arguments   : Input   - psrc

                  Outputs - out0, out1, out2, out3

*/

 #define LD4x4_SH(psrc, out0, out1, out2, out3) {         \

@@ -381,8 +377,7 @@

 /* Description : Load 2 vectors of signed word elements with stride

-   Arguments   : Inputs  - psrc    (source pointer to load from)

-                         - stride

+   Arguments   : Inputs  - psrc, stride

                  Outputs - out0, out1

                  Return Type - signed word

*/

@@ -392,10 +387,9 @@

 /* Description : Store vectors of 16 byte elements with stride

-   Arguments   : Inputs  - in0, in1, stride

-                 Outputs - pdst    (destination pointer to store to)

-   Details     : Stores 16 byte elements from 'in0' to (pdst)

-                 Stores 16 byte elements from 'in1' to (pdst + stride)

+   Arguments   : Inputs - in0, in1, pdst, stride

+   Details     : Store 16 byte elements from 'in0' to (pdst)

+                 Store 16 byte elements from 'in1' to (pdst + stride)

*/

 #define ST_B2(RTYPE, in0, in1, pdst, stride) {  \

   ST_B(RTYPE, in0, (pdst));                     \

@@ -417,10 +411,9 @@

 #define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)

 /* Description : Store vectors of 8 halfword elements with stride

-   Arguments   : Inputs  - in0, in1, stride

-                 Outputs - pdst    (destination pointer to store to)

-   Details     : Stores 8 halfword elements from 'in0' to (pdst)

-                 Stores 8 halfword elements from 'in1' to (pdst + stride)

+   Arguments   : Inputs - in0, in1, pdst, stride

+   Details     : Store 8 halfword elements from 'in0' to (pdst)

+                 Store 8 halfword elements from 'in1' to (pdst + stride)

*/

 #define ST_H2(RTYPE, in0, in1, pdst, stride) {  \

   ST_H(RTYPE, in0, (pdst));                     \

@@ -441,8 +434,7 @@

 #define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__)

 /* Description : Store vectors of word elements with stride

-   Arguments   : Inputs  - in0, in1, stride

-                         - pdst    (destination pointer to store to)

+   Arguments   : Inputs - in0, in1, pdst, stride

    Details     : Store 4 word elements from 'in0' to (pdst)

                  Store 4 word elements from 'in1' to (pdst + stride)

*/

@@ -451,17 +443,16 @@

   ST_SW(in1, (pdst) + stride);            \

-/* Description : Store as 2x4 byte block to destination memory from input vector

-   Arguments   : Inputs  - in, stidx, pdst, stride

-                 Return Type - unsigned byte

-   Details     : Index stidx halfword element from 'in' vector is copied and

-                 stored on first line

-                 Index stidx+1 halfword element from 'in' vector is copied and

-                 stored on second line

-                 Index stidx+2 halfword element from 'in' vector is copied and

-                 stored on third line

-                 Index stidx+3 halfword element from 'in' vector is copied and

-                 stored on fourth line

+/* Description : Store 2x4 byte block to destination memory from input vector

+   Arguments   : Inputs - in, stidx, pdst, stride

+   Details     : Index 'stidx' halfword element from 'in' vector is copied to

+                 the GP register and stored to (pdst)

+                 Index 'stidx+1' halfword element from 'in' vector is copied to

+                 the GP register and stored to (pdst + stride)

+                 Index 'stidx+2' halfword element from 'in' vector is copied to

+                 the GP register and stored to (pdst + 2 * stride)

+                 Index 'stidx+3' halfword element from 'in' vector is copied to

+                 the GP register and stored to (pdst + 3 * stride)

*/

 #define ST2x4_UB(in, stidx, pdst, stride) {         \

   uint16_t out0_m, out1_m, out2_m, out3_m;          \

@@ -479,10 +470,10 @@

 /* Description : Store 4x2 byte block to destination memory from input vector

-   Arguments   : Inputs  - in, pdst, stride

-   Details     : Index 0 word element from 'in' vector is copied to a GP

+   Arguments   : Inputs - in, pdst, stride

+   Details     : Index 0 word element from 'in' vector is copied to the GP

                  register and stored to (pdst)

-                 Index 1 word element from 'in' vector is copied to a GP

+                 Index 1 word element from 'in' vector is copied to the GP

                  register and stored to (pdst + stride)

*/

 #define ST4x2_UB(in, pdst, stride) {        \

@@ -496,17 +487,16 @@

   SW(out1_m, pblk_4x2_m + stride);          \

-/* Description : Store as 4x4 byte block to destination memory from input vector

-   Arguments   : Inputs  - in0, in1, pdst, stride

-                 Return Type - unsigned byte

-   Details     : Idx0 word element from input vector 'in0' is copied and stored

-                 on first line

-                 Idx1 word element from input vector 'in0' is copied and stored

-                 on second line

-                 Idx2 word element from input vector 'in1' is copied and stored

-                 on third line

-                 Idx3 word element from input vector 'in1' is copied and stored

-                 on fourth line

+/* Description : Store 4x4 byte block to destination memory from input vector

+   Arguments   : Inputs - in0, in1, pdst, stride

+   Details     : 'Idx0' word element from input vector 'in0' is copied to the

+                 GP register and stored to (pdst)

+                 'Idx1' word element from input vector 'in0' is copied to the

+                 GP register and stored to (pdst + stride)

+                 'Idx2' word element from input vector 'in1' is copied to the

+                 GP register and stored to (pdst + 2 * stride)

+                 'Idx3' word element from input vector 'in1' is copied to the

+                 GP register and stored to (pdst + 3 * stride)

*/

 #define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) {  \

   uint32_t out0_m, out1_m, out2_m, out3_m;                          \

@@ -526,10 +516,10 @@

   ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride);  \

-/* Description : Store as 8x1 byte block to destination memory from input vector

-   Arguments   : Inputs  - in, pdst

-   Details     : Index 0 double word element from input vector 'in' is copied

-                 and stored to destination memory at (pdst)

+/* Description : Store 8x1 byte block to destination memory from input vector

+   Arguments   : Inputs - in, pdst

+   Details     : Index 0 double word element from 'in' vector is copied to the

+                 GP register and stored to (pdst)

*/

 #define ST8x1_UB(in, pdst) {              \

   uint64_t out0_m;                        \

@@ -538,12 +528,12 @@

   SD(out0_m, pdst);                       \

-/* Description : Store as 8x2 byte block to destination memory from input vector

-   Arguments   : Inputs  - in, pdst, stride

-   Details     : Index 0 double word element from input vector 'in' is copied

-                 and stored to destination memory at (pdst)

-                 Index 1 double word element from input vector 'in' is copied

-                 and stored to destination memory at (pdst + stride)

+/* Description : Store 8x2 byte block to destination memory from input vector

+   Arguments   : Inputs - in, pdst, stride

+   Details     : Index 0 double word element from 'in' vector is copied to the

+                 GP register and stored to (pdst)

+                 Index 1 double word element from 'in' vector is copied to the

+                 GP register and stored to (pdst + stride)

*/

 #define ST8x2_UB(in, pdst, stride) {        \

   uint64_t out0_m, out1_m;                  \

@@ -556,17 +546,17 @@

   SD(out1_m, pblk_8x2_m + stride);          \

-/* Description : Store as 8x4 byte block to destination memory from input

+/* Description : Store 8x4 byte block to destination memory from input

                  vectors

-   Arguments   : Inputs  - in0, in1, pdst, stride

-   Details     : Index 0 double word element from input vector 'in0' is copied

-                 and stored to destination memory at (pblk_8x4_m)

-                 Index 1 double word element from input vector 'in0' is copied

-                 and stored to destination memory at (pblk_8x4_m + stride)

-                 Index 0 double word element from input vector 'in1' is copied

-                 and stored to destination memory at (pblk_8x4_m + 2 * stride)

-                 Index 1 double word element from input vector 'in1' is copied

-                 and stored to destination memory at (pblk_8x4_m + 3 * stride)

+   Arguments   : Inputs - in0, in1, pdst, stride

+   Details     : Index 0 double word element from 'in0' vector is copied to the

+                 GP register and stored to (pdst)

+                 Index 1 double word element from 'in0' vector is copied to the

+                 GP register and stored to (pdst + stride)

+                 Index 0 double word element from 'in1' vector is copied to the

+                 GP register and stored to (pdst + 2 * stride)

+                 Index 1 double word element from 'in1' vector is copied to the

+                 GP register and stored to (pdst + 3 * stride)

*/

 #define ST8x4_UB(in0, in1, pdst, stride) {                  \

   uint64_t out0_m, out1_m, out2_m, out3_m;                  \

@@ -583,14 +573,10 @@

 /* Description : average with rounding (in0 + in1 + 1) / 2.

    Arguments   : Inputs  - in0, in1, in2, in3,

                  Outputs - out0, out1

-                 Return Type - signed byte

-   Details     : Each byte element from 'in0' vector is added with each byte

-                 element from 'in1' vector. The addition of the elements plus 1

-                (for rounding) is done unsigned with full precision,

-                i.e. the result has one extra bit. Unsigned division by 2

-                (or logical shift right by one bit) is performed before writing

-                the result to vector 'out0'

-                Similar for the pair of 'in2' and 'in3'

+                 Return Type - as per RTYPE

+   Details     : Each unsigned byte element from 'in0' vector is added with

+                 each unsigned byte element from 'in1' vector. Then average

+                 with rounding is calculated and written to 'out0'

*/

 #define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) {  \

   out0 = (RTYPE)__msa_aver_u_b((v16u8)in0, (v16u8)in1);    \

@@ -605,12 +591,12 @@

 #define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__)

-/* Description : Immediate number of columns to slide with zero

+/* Description : Immediate number of elements to slide with zero

    Arguments   : Inputs  - in0, in1, slide_val

                  Outputs - out0, out1

                  Return Type - as per RTYPE

    Details     : Byte elements from 'zero_m' vector are slide into 'in0' by

-                 number of elements specified by 'slide_val'

+                 the value specified in 'slide_val'

*/

 #define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) {          \

   v16i8 zero_m = { 0 };                                              \

@@ -626,12 +612,12 @@

 #define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__)

-/* Description : Immediate number of columns to slide

+/* Description : Immediate number of elements to slide

    Arguments   : Inputs  - in0_0, in0_1, in1_0, in1_1, slide_val

                  Outputs - out0, out1

                  Return Type - as per RTYPE

    Details     : Byte elements from 'in0_0' vector are slide into 'in1_0' by

-                 number of elements specified by 'slide_val'

+                 the value specified in 'slide_val'

*/

 #define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) {  \

   out0 = (RTYPE)__msa_sldi_b((v16i8)in0_0, (v16i8)in1_0, slide_val);         \

@@ -651,10 +637,8 @@

    Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1

                  Outputs - out0, out1

                  Return Type - as per RTYPE

-   Details     : Selective byte elements from in0 & in1 are copied to out0 as

-                 per control vector mask0

-                 Selective byte elements from in2 & in3 are copied to out1 as

-                 per control vector mask1

+   Details     : Byte elements from 'in0' & 'in1' are copied selectively to

+                 'out0' as per control vector 'mask0'

*/

 #define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) {  \

   out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0);     \

@@ -673,16 +657,14 @@

 #define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__)

 /* Description : Dot product of byte vector elements

-   Arguments   : Inputs  - mult0, mult1

-                           cnst0, cnst1

+   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1

                  Outputs - out0, out1

-                 Return Type - unsigned halfword

-   Details     : Unsigned byte elements from mult0 are multiplied with

-                 unsigned byte elements from cnst0 producing a result

+                 Return Type - as per RTYPE

+   Details     : Unsigned byte elements from 'mult0' are multiplied with

+                 unsigned byte elements from 'cnst0' producing a result

                  twice the size of input i.e. unsigned halfword.

-                 Then this multiplication results of adjacent odd-even elements

-                 are added together and stored to the out vector

-                 (2 unsigned halfword results)

+                 The multiplication results of adjacent odd-even elements

+                 are added together and written to the 'out0' vector

*/

 #define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) {  \

   out0 = (RTYPE)__msa_dotp_u_h((v16u8)mult0, (v16u8)cnst0);        \

@@ -699,16 +681,14 @@

 #define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)

 /* Description : Dot product of byte vector elements

-   Arguments   : Inputs  - mult0, mult1

-                           cnst0, cnst1

+   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1

                  Outputs - out0, out1

-                 Return Type - signed halfword

-   Details     : Signed byte elements from mult0 are multiplied with

-                 signed byte elements from cnst0 producing a result

+                 Return Type - as per RTYPE

+   Details     : Signed byte elements from 'mult0' are multiplied with

+                 signed byte elements from 'cnst0' producing a result

                  twice the size of input i.e. signed halfword.

-                 Then this multiplication results of adjacent odd-even elements

-                 are added together and stored to the out vector

-                 (2 signed halfword results)

+                 The multiplication results of adjacent odd-even elements

+                 are added together and written to the 'out0' vector

*/

 #define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) {  \

   out0 = (RTYPE)__msa_dotp_s_h((v16i8)mult0, (v16i8)cnst0);        \

@@ -724,16 +704,14 @@

 #define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)

 /* Description : Dot product of halfword vector elements

-   Arguments   : Inputs  - mult0, mult1

-                           cnst0, cnst1

+   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1

                  Outputs - out0, out1

-                 Return Type - signed word

-   Details     : Signed halfword elements from mult0 are multiplied with

-                 signed halfword elements from cnst0 producing a result

+                 Return Type - as per RTYPE

+   Details     : Signed halfword elements from 'mult0' are multiplied with

+                 signed halfword elements from 'cnst0' producing a result

                  twice the size of input i.e. signed word.

-                 Then this multiplication results of adjacent odd-even elements

-                 are added together and stored to the out vector

-                 (2 signed word results)

+                 The multiplication results of adjacent odd-even elements

+                 are added together and written to the 'out0' vector

*/

 #define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) {  \

   out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0);        \

@@ -750,16 +728,14 @@

 #define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)

 /* Description : Dot product of word vector elements

-   Arguments   : Inputs  - mult0, mult1

-                           cnst0, cnst1

+   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1

                  Outputs - out0, out1

-                 Return Type - signed word

-   Details     : Signed word elements from mult0 are multiplied with

-                 signed word elements from cnst0 producing a result

+                 Return Type - as per RTYPE

+   Details     : Signed word elements from 'mult0' are multiplied with

+                 signed word elements from 'cnst0' producing a result

                  twice the size of input i.e. signed double word.

-                 Then this multiplication results of adjacent odd-even elements

-                 are added together and stored to the out vector

-                 (2 signed double word results)

+                 The multiplication results of adjacent odd-even elements

+                 are added together and written to the 'out0' vector

*/

 #define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) {  \

   out0 = (RTYPE)__msa_dotp_s_d((v4i32)mult0, (v4i32)cnst0);        \

@@ -768,16 +744,14 @@

 #define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__)

 /* Description : Dot product & addition of byte vector elements

-   Arguments   : Inputs  - mult0, mult1

-                           cnst0, cnst1

+   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1

                  Outputs - out0, out1

-                 Return Type - signed halfword

-   Details     : Signed byte elements from mult0 are multiplied with

-                 signed byte elements from cnst0 producing a result

+                 Return Type - as per RTYPE

+   Details     : Signed byte elements from 'mult0' are multiplied with

+                 signed byte elements from 'cnst0' producing a result

                  twice the size of input i.e. signed halfword.

-                 Then this multiplication results of adjacent odd-even elements

-                 are added to the out vector

-                 (2 signed halfword results)

+                 The multiplication results of adjacent odd-even elements

+                 are added to the 'out0' vector

*/

 #define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) {         \

   out0 = (RTYPE)__msa_dpadd_s_h((v8i16)out0, (v16i8)mult0, (v16i8)cnst0);  \

@@ -793,8 +767,7 @@

 #define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)

 /* Description : Dot product & addition of halfword vector elements

-   Arguments   : Inputs  - mult0, mult1

-                           cnst0, cnst1

+   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1

                  Outputs - out0, out1

                  Return Type - as per RTYPE

    Details     : Signed halfword elements from 'mult0' are multiplied with

@@ -828,10 +801,10 @@

 /* Description : Minimum values between unsigned elements of

                  either vector are copied to the output vector

    Arguments   : Inputs  - in0, in1, min_vec

-                 Outputs - in0, in1, (in place)

-                 Return Type - unsigned halfword

+                 Outputs - in place operation

+                 Return Type - as per RTYPE

    Details     : Minimum of unsigned halfword element values from 'in0' and

-                 'min_value' are written to output vector 'in0'

+                 'min_vec' are written to output vector 'in0'

*/

 #define MIN_UH2(RTYPE, in0, in1, min_vec) {         \

   in0 = (RTYPE)__msa_min_u_h((v8u16)in0, min_vec);  \

@@ -847,8 +820,8 @@

 /* Description : Clips all signed halfword elements of input vector

                  between 0 & 255

-   Arguments   : Inputs  - in       (input vector)

-                 Outputs - out_m    (output vector with clipped elements)

+   Arguments   : Input  - in

+                 Output - out_m

                  Return Type - signed halfword

*/

 #define CLIP_SH_0_255(in) ({                          \

@@ -868,12 +841,12 @@

   CLIP_SH2_0_255(in2, in3);                   \

-/* Description : Addition of 4 signed word elements

-                 4 signed word elements of input vector are added together and

+/* Description : Horizontal addition of 4 signed word elements of input vector

+   Arguments   : Input  - in       (signed word vector)

+                 Output - sum_m    (i32 sum)

+                 Return Type - signed word (GP)

+   Details     : 4 signed word elements of 'in' vector are added together and

                  the resulting integer sum is returned

-   Arguments   : Inputs  - in       (signed word vector)

-                 Outputs - sum_m    (i32 sum)

-                 Return Type - signed word

*/

 #define HADD_SW_S32(in) ({                        \

   v2i64 res0_m, res1_m;                           \

@@ -892,7 +865,7 @@

                  Return Type - as per RTYPE

    Details     : Each unsigned odd byte element from 'in0' is added to

                  even unsigned byte element from 'in0' (pairwise) and the

-                 halfword result is stored in 'out0'

+                 halfword result is written to 'out0'

*/

 #define HADD_UB2(RTYPE, in0, in1, out0, out1) {          \

   out0 = (RTYPE)__msa_hadd_u_h((v16u8)in0, (v16u8)in0);  \

@@ -934,11 +907,11 @@

 #define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__)

-/* Description : Insert specified word elements from input vectors to 1

-                 destination vector

-   Arguments   : Inputs  - in0, in1, in2, in3 (4 input vectors)

-                 Outputs - out                (output vector)

+/* Description : Set element n of input vector to GPR value

+   Arguments   : Inputs - in0, in1, in2, in3

+                 Output - out

                  Return Type - as per RTYPE

+   Details     : Set element 0 in vector 'out' to value specified in 'in0'

*/

 #define INSERT_W2(RTYPE, in0, in1, out) {           \

   out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0);  \

@@ -955,12 +928,6 @@

 #define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)

 #define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)

-/* Description : Insert specified double word elements from input vectors to 1

-                 destination vector

-   Arguments   : Inputs  - in0, in1      (2 input vectors)

-                 Outputs - out           (output vector)

-                 Return Type - as per RTYPE

-*/

 #define INSERT_D2(RTYPE, in0, in1, out) {           \

   out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0);  \

   out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1);  \

@@ -972,10 +939,8 @@

    Arguments   : Inputs  - in0, in1, in2, in3

                  Outputs - out0, out1

                  Return Type - as per RTYPE

-   Details     : Even byte elements of 'in0' and even byte

-                 elements of 'in1' are interleaved and copied to 'out0'

-                 Even byte elements of 'in2' and even byte

-                 elements of 'in3' are interleaved and copied to 'out1'

+   Details     : Even byte elements of 'in0' and 'in1' are interleaved

+                 and written to 'out0'

*/

 #define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) {  \

   out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0);     \

@@ -988,10 +953,8 @@

    Arguments   : Inputs  - in0, in1, in2, in3

                  Outputs - out0, out1

                  Return Type - as per RTYPE

-   Details     : Even halfword elements of 'in0' and even halfword

-                 elements of 'in1' are interleaved and copied to 'out0'

-                 Even halfword elements of 'in2' and even halfword

-                 elements of 'in3' are interleaved and copied to 'out1'

+   Details     : Even halfword elements of 'in0' and 'in1' are interleaved

+                 and written to 'out0'

*/

 #define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) {  \

   out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0);     \

@@ -1018,10 +981,8 @@

    Arguments   : Inputs  - in0, in1, in2, in3

                  Outputs - out0, out1

                  Return Type - as per RTYPE

-   Details     : Even double word elements of 'in0' and even double word

-                 elements of 'in1' are interleaved and copied to 'out0'

-                 Even double word elements of 'in2' and even double word

-                 elements of 'in3' are interleaved and copied to 'out1'

+   Details     : Even double word elements of 'in0' and 'in1' are interleaved

+                 and written to 'out0'

*/

 #define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) {  \

   out0 = (RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0);     \

@@ -1033,10 +994,8 @@

    Arguments   : Inputs  - in0, in1, in2, in3

                  Outputs - out0, out1

                  Return Type - as per RTYPE

-   Details     : Left half of byte elements of in0 and left half of byte

-                 elements of in1 are interleaved and copied to out0.

-                 Left half of byte elements of in2 and left half of byte

-                 elements of in3 are interleaved and copied to out1.

+   Details     : Left half of byte elements of 'in0' and 'in1' are interleaved

+                 and written to 'out0'.

*/

 #define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) {  \

   out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1);     \

@@ -1059,10 +1018,8 @@

    Arguments   : Inputs  - in0, in1, in2, in3

                  Outputs - out0, out1

                  Return Type - as per RTYPE

-   Details     : Left half of halfword elements of in0 and left half of halfword

-                 elements of in1 are interleaved and copied to out0.

-                 Left half of halfword elements of in2 and left half of halfword

-                 elements of in3 are interleaved and copied to out1.

+   Details     : Left half of halfword elements of 'in0' and 'in1' are

+                 interleaved and written to 'out0'.

*/

 #define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) {  \

   out0 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1);     \

@@ -1074,10 +1031,8 @@

    Arguments   : Inputs  - in0, in1, in2, in3

                  Outputs - out0, out1

                  Return Type - as per RTYPE

-   Details     : Left half of word elements of in0 and left half of word

-                 elements of in1 are interleaved and copied to out0.

-                 Left half of word elements of in2 and left half of word

-                 elements of in3 are interleaved and copied to out1.

+   Details     : Left half of word elements of 'in0' and 'in1' are interleaved

+                 and written to 'out0'.

*/

 #define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) {  \

   out0 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1);     \

@@ -1087,14 +1042,11 @@

 #define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)

 /* Description : Interleave right half of byte elements from vectors

-   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7

-                 Outputs - out0, out1, out2, out3

+   Arguments   : Inputs  - in0, in1, in2, in3

+                 Outputs - out0, out1

                  Return Type - as per RTYPE

-   Details     : Right half of byte elements of in0 and right half of byte

-                 elements of in1 are interleaved and copied to out0.

-                 Right half of byte elements of in2 and right half of byte

-                 elements of in3 are interleaved and copied to out1.

-                 Similar for other pairs

+   Details     : Right half of byte elements of 'in0' and 'in1' are interleaved

+                 and written to 'out0'.

*/

 #define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) {  \

   out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1);     \

@@ -1126,14 +1078,11 @@

 #define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__)

 /* Description : Interleave right half of halfword elements from vectors

-   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7

-                 Outputs - out0, out1, out2, out3

-                 Return Type - signed halfword

-   Details     : Right half of halfword elements of in0 and right half of

-                 halfword elements of in1 are interleaved and copied to out0.

-                 Right half of halfword elements of in2 and right half of

-                 halfword elements of in3 are interleaved and copied to out1.

-                 Similar for other pairs

+   Arguments   : Inputs  - in0, in1, in2, in3

+                 Outputs - out0, out1

+                 Return Type - as per RTYPE

+   Details     : Right half of halfword elements of 'in0' and 'in1' are

+                 interleaved and written to 'out0'.

*/

 #define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) {  \

   out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1);     \

@@ -1163,13 +1112,11 @@

 #define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__)

 /* Description : Interleave right half of double word elements from vectors

-   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7

-                 Outputs - out0, out1, out2, out3

-                 Return Type - unsigned double word

-   Details     : Right half of double word elements of in0 and right half of

-                 double word elements of in1 are interleaved and copied to out0.

-                 Right half of double word elements of in2 and right half of

-                 double word elements of in3 are interleaved and copied to out1.

+   Arguments   : Inputs  - in0, in1, in2, in3

+                 Outputs - out0, out1

+                 Return Type - as per RTYPE

+   Details     : Right half of double word elements of 'in0' and 'in1' are

+                 interleaved and written to 'out0'.

*/

 #define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) {   \

   out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1));  \

@@ -1198,9 +1145,7 @@

                  Outputs - out0, out1

                  Return Type - as per RTYPE

    Details     : Right half of byte elements from 'in0' and 'in1' are

-                 interleaved and stored to 'out0'

-                 Left half of byte elements from 'in0' and 'in1' are

-                 interleaved and stored to 'out1'

+                 interleaved and written to 'out0'

*/

 #define ILVRL_B2(RTYPE, in0, in1, out0, out1) {        \

   out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1);  \

@@ -1226,14 +1171,14 @@

 #define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)

 /* Description : Saturate the halfword element values to the max

-                 unsigned value of (sat_val+1 bits)

+                 unsigned value of (sat_val + 1) bits

                  The element data width remains unchanged

-   Arguments   : Inputs  - in0, in1, in2, in3, sat_val

-                 Outputs - in0, in1, in2, in3 (in place)

-                 Return Type - unsigned halfword

+   Arguments   : Inputs  - in0, in1, sat_val

+                 Outputs - in place operation

+                 Return Type - as per RTYPE

    Details     : Each unsigned halfword element from 'in0' is saturated to the

-                 value generated with (sat_val+1) bit range.

-                 The results are stored in place

+                 value generated with (sat_val + 1) bit range.

+                 The results are written in place

*/

 #define SAT_UH2(RTYPE, in0, in1, sat_val) {         \

   in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val);  \

@@ -1248,14 +1193,14 @@

 #define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)

 /* Description : Saturate the halfword element values to the max

-                 unsigned value of (sat_val+1 bits)

+                 signed value of (sat_val + 1) bits

                  The element data width remains unchanged

-   Arguments   : Inputs  - in0, in1, in2, in3, sat_val

-                 Outputs - in0, in1, in2, in3 (in place)

-                 Return Type - unsigned halfword

+   Arguments   : Inputs  - in0, in1, sat_val

+                 Outputs - in place operation

+                 Return Type - as per RTYPE

    Details     : Each unsigned halfword element from 'in0' is saturated to the

-                 value generated with (sat_val+1) bit range

-                 The results are stored in place

+                 value generated with (sat_val + 1) bit range

+                 The results are written in place

*/

 #define SAT_SH2(RTYPE, in0, in1, sat_val) {         \

   in0 = (RTYPE)__msa_sat_s_h((v8i16)in0, sat_val);  \

@@ -1296,12 +1241,9 @@

    Arguments   : Inputs  - in0, in1, in2, in3

                  Outputs - out0, out1

                  Return Type - as per RTYPE

-   Details     : Even byte elements of in0 are copied to the left half of

-                 out0 & even byte elements of in1 are copied to the right

-                 half of out0.

-                 Even byte elements of in2 are copied to the left half of

-                 out1 & even byte elements of in3 are copied to the right

-                 half of out1.

+   Details     : Even byte elements of 'in0' are copied to the left half of

+                 'out0' & even byte elements of 'in1' are copied to the right

+                 half of 'out0'.

*/

 #define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) {  \

   out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1);     \

@@ -1324,12 +1266,9 @@

    Arguments   : Inputs  - in0, in1, in2, in3

                  Outputs - out0, out1

                  Return Type - as per RTYPE

-   Details     : Even halfword elements of in0 are copied to the left half of

-                 out0 & even halfword elements of in1 are copied to the right

-                 half of out0.

-                 Even halfword elements of in2 are copied to the left half of

-                 out1 & even halfword elements of in3 are copied to the right

-                 half of out1.

+   Details     : Even halfword elements of 'in0' are copied to the left half of

+                 'out0' & even halfword elements of 'in1' are copied to the

+                 right half of 'out0'.

*/

 #define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) {  \

   out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1);     \

@@ -1348,13 +1287,10 @@

 /* Description : Pack even double word elements of vector pairs

    Arguments   : Inputs  - in0, in1, in2, in3

                  Outputs - out0, out1

-                 Return Type - unsigned byte

-   Details     : Even double elements of in0 are copied to the left half of

-                 out0 & even double elements of in1 are copied to the right

-                 half of out0.

-                 Even double elements of in2 are copied to the left half of

-                 out1 & even double elements of in3 are copied to the right

-                 half of out1.

+                 Return Type - as per RTYPE

+   Details     : Even double elements of 'in0' are copied to the left half of

+                 'out0' & even double elements of 'in1' are copied to the right

+                 half of 'out0'.

*/

 #define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) {  \

   out0 = (RTYPE)__msa_pckev_d((v2i64)in0, (v2i64)in1);     \

@@ -1372,15 +1308,10 @@

 /* Description : Each byte element is logically xor'ed with immediate 128

    Arguments   : Inputs  - in0, in1

-                 Outputs - in0, in1 (in-place)

+                 Outputs - in place operation

                  Return Type - as per RTYPE

    Details     : Each unsigned byte element from input vector 'in0' is

-                 logically xor'ed with 128 and the result is in-place stored in

-                 'in0' vector

-                 Each unsigned byte element from input vector 'in1' is

-                 logically xor'ed with 128 and the result is in-place stored in

-                 'in1' vector

-                 Similar for other pairs

+                 logically xor'ed with 128 and the result is stored in-place.

*/

 #define XORI_B2_128(RTYPE, in0, in1) {         \

   in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128);  \

@@ -1432,8 +1363,7 @@

                  Return Type - as per RTYPE

    Details     : Signed halfword elements from 'in0' are added to signed

                  halfword elements of 'in1'. The result is then signed saturated

-                 between -32768 to +32767 (as per halfword data type)

-                 Similar for other pairs

+                 to the halfword data type range

*/

 #define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1) {  \

   out0 = (RTYPE)__msa_adds_s_h((v8i16)in0, (v8i16)in1);    \

@@ -1450,11 +1380,10 @@

 /* Description : Shift left all elements of vector (generic for all data types)

    Arguments   : Inputs  - in0, in1, in2, in3, shift

-                 Outputs - in0, in1, in2, in3 (in place)

+                 Outputs - in place operation

                  Return Type - as per input vector RTYPE

    Details     : Each element of vector 'in0' is left shifted by 'shift' and

-                 the result is in place written to 'in0'

-                 Similar for other pairs

+                 the result is written in-place.

*/

 #define SLLI_4V(in0, in1, in2, in3, shift) {  \

   in0 = in0 << shift;                         \

@@ -1466,12 +1395,10 @@

 /* Description : Arithmetic shift right all elements of vector

                  (generic for all data types)

    Arguments   : Inputs  - in0, in1, in2, in3, shift

-                 Outputs - in0, in1, in2, in3 (in place)

+                 Outputs - in place operation

                  Return Type - as per input vector RTYPE

    Details     : Each element of vector 'in0' is right shifted by 'shift' and

-                 the result is in place written to 'in0'

-                 Here, 'shift' is GP variable passed in

-                 Similar for other pairs

+                 the result is written in-place. 'shift' is a GP variable.

*/

 #define SRA_4V(in0, in1, in2, in3, shift) {  \

   in0 = in0 >> shift;                        \

@@ -1502,14 +1429,13 @@

 #define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)

 /* Description : Shift right arithmetic rounded (immediate)

-   Arguments   : Inputs  - in0, in1, in2, in3, shift

-                 Outputs - in0, in1, in2, in3 (in place)

+   Arguments   : Inputs  - in0, in1, shift

+                 Outputs - in place operation

                  Return Type - as per RTYPE

-   Details     : Each element of vector 'in0' is shifted right arithmetic by

-                 value in 'shift'.

-                 The last discarded bit is added to shifted value for rounding

-                 and the result is in place written to 'in0'

-                 Similar for other pairs

+   Details     : Each element of vector 'in0' is shifted right arithmetically by

+                 the value in 'shift'. The last discarded bit is added to the

+                 shifted value for rounding and the result is written in-place.

+                 'shift' is an immediate value.

*/

 #define SRARI_H2(RTYPE, in0, in1, shift) {        \

   in0 = (RTYPE)__msa_srari_h((v8i16)in0, shift);  \

@@ -1525,16 +1451,6 @@

 #define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)

 #define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)

-/* Description : Shift right arithmetic rounded (immediate)

-   Arguments   : Inputs  - in0, in1, shift

-                 Outputs - in0, in1     (in place)

-                 Return Type - as per RTYPE

-   Details     : Each element of vector 'in0' is shifted right arithmetic by

-                 value in 'shift'.

-                 The last discarded bit is added to shifted value for rounding

-                 and the result is in place written to 'in0'

-                 Similar for other pairs

-*/

 #define SRARI_W2(RTYPE, in0, in1, shift) {        \

   in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift);  \

   in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift);  \

@@ -1581,8 +1497,8 @@

 /* Description : Addition of 2 pairs of vectors

    Arguments   : Inputs  - in0, in1, in2, in3

                  Outputs - out0, out1

-   Details     : Each element from 2 pairs vectors is added and 2 results are

-                 produced

+   Details     : Each element in 'in0' is added to 'in1' and result is written

+                 to 'out0'.

*/

 #define ADD2(in0, in1, in2, in3, out0, out1) {  \

   out0 = in0 + in1;                             \

@@ -1597,8 +1513,8 @@

 /* Description : Subtraction of 2 pairs of vectors

    Arguments   : Inputs  - in0, in1, in2, in3

                  Outputs - out0, out1

-   Details     : Each element from 2 pairs vectors is subtracted and 2 results

-                 are produced

+   Details     : Each element in 'in1' is subtracted from 'in0' and result is

+                 written to 'out0'.

*/

 #define SUB2(in0, in1, in2, in3, out0, out1) {  \

   out0 = in0 - in1;                             \

@@ -1613,8 +1529,8 @@

 /* Description : Sign extend halfword elements from right half of the vector

-   Arguments   : Inputs  - in    (input halfword vector)

-                 Outputs - out   (sign extended word vectors)

+   Arguments   : Input  - in    (halfword vector)

+                 Output - out   (sign extended word vector)

                  Return Type - signed word

    Details     : Sign bit of halfword elements from input vector 'in' is

                  extracted and interleaved with same vector 'in0' to generate

@@ -1628,8 +1544,8 @@

 /* Description : Zero extend unsigned byte elements to halfword elements

-   Arguments   : Inputs  - in           (1 input unsigned byte vector)

-                 Outputs - out0, out1   (unsigned 2 halfword vectors)

+   Arguments   : Input   - in          (unsigned byte vector)

+                 Outputs - out0, out1  (unsigned halfword vectors)

                  Return Type - signed halfword

    Details     : Zero extended right half of vector is returned in 'out0'

                  Zero extended left half of vector is returned in 'out1'

@@ -1641,9 +1557,9 @@

 /* Description : Sign extend halfword elements from input vector and return

-                 result in pair of vectors

-   Arguments   : Inputs  - in           (1 input halfword vector)

-                 Outputs - out0, out1   (sign extended 2 word vectors)

+                 the result in a pair of vectors

+   Arguments   : Input   - in           (halfword vector)

+                 Outputs - out0, out1   (sign extended word vectors)

                  Return Type - signed word

    Details     : Sign bit of halfword elements from input vector 'in' is

                  extracted and interleaved right with same vector 'in0' to

@@ -1717,13 +1633,10 @@

   out15 = in0 - in15;                                                         \

-/* Description : Transposes input 8x8 byte block

+/* Description : Transpose input 8x8 byte block

    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7

-                           (input 8x8 byte block)

                  Outputs - out0, out1, out2, out3, out4, out5, out6, out7

-                           (output 8x8 byte block)

-                 Return Type - unsigned byte

-   Details     :

+                 Return Type - as per RTYPE

*/

 #define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,     \

                         out0, out1, out2, out3, out4, out5, out6, out7) {  \

@@ -1741,12 +1654,11 @@

 #define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)

-/* Description : Transposes 16x8 block into 8x16 with byte elements in vectors

+/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors

    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,

                            in8, in9, in10, in11, in12, in13, in14, in15

                  Outputs - out0, out1, out2, out3, out4, out5, out6, out7

                  Return Type - unsigned byte

-   Details     :

*/

 #define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,            \

                             in8, in9, in10, in11, in12, in13, in14, in15,      \

@@ -1789,11 +1701,10 @@

   out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                   \

-/* Description : Transposes 4x4 block with half word elements in vectors

+/* Description : Transpose 4x4 block with half word elements in vectors

    Arguments   : Inputs  - in0, in1, in2, in3

                  Outputs - out0, out1, out2, out3

                  Return Type - signed halfword

-   Details     :

*/

 #define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) {  \

   v8i16 s0_m, s1_m;                                                       \

@@ -1804,11 +1715,10 @@

   out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2);                   \

-/* Description : Transposes 4x8 block with half word elements in vectors

+/* Description : Transpose 4x8 block with half word elements in vectors

    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7

                  Outputs - out0, out1, out2, out3, out4, out5, out6, out7

                  Return Type - signed halfword

-   Details     :

*/

 #define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,            \

                            out0, out1, out2, out3, out4, out5, out6, out7) {  \

@@ -1832,11 +1742,10 @@

   out7 = zero_m;                                                              \

-/* Description : Transposes 8x4 block with half word elements in vectors

+/* Description : Transpose 8x4 block with half word elements in vectors

    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7

                  Outputs - out0, out1, out2, out3, out4, out5, out6, out7

                  Return Type - signed halfword

-   Details     :

*/

 #define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) {  \

   v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \

@@ -1847,11 +1756,10 @@

   ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3);                 \

-/* Description : Transposes 8x8 block with half word elements in vectors

+/* Description : Transpose 8x8 block with half word elements in vectors

    Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7

                  Outputs - out0, out1, out2, out3, out4, out5, out6, out7

-                 Return Type - signed halfword

-   Details     :

+                 Return Type - as per RTYPE

*/

 #define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,     \

                        out0, out1, out2, out3, out4, out5, out6, out7) {  \

@@ -1876,11 +1784,10 @@

 #define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__)

-/* Description : Transposes 4x4 block with word elements in vectors

+/* Description : Transpose 4x4 block with word elements in vectors

    Arguments   : Inputs  - in0, in1, in2, in3

                  Outputs - out0, out1, out2, out3

                  Return Type - signed word

-   Details     :

*/

 #define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) {  \

   v4i32 s0_m, s1_m, s2_m, s3_m;                                           \

@@ -1895,15 +1802,12 @@

 /* Description : Add block 4x4

-   Arguments   : Inputs  - in0, in1, in2, in3, pdst, stride

-                 Outputs -

-                 Return Type - unsigned bytes

+   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride

    Details     : Least significant 4 bytes from each input vector are added to

-                 the destination bytes, clipped between 0-255 and then stored.

+                 the destination bytes, clipped between 0-255 and stored.

*/

 #define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) {     \

   uint32_t src0_m, src1_m, src2_m, src3_m;                      \

-  uint32_t out0_m, out1_m, out2_m, out3_m;                      \

   v8i16 inp0_m, inp1_m, res0_m, res1_m;                         \

   v16i8 dst0_m = { 0 };                                         \

   v16i8 dst1_m = { 0 };                                         \

@@ -1917,17 +1821,12 @@

   ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m);         \

   CLIP_SH2_0_255(res0_m, res1_m);                               \

   PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m);  \

-                                                                \

-  out0_m = __msa_copy_u_w((v4i32)dst0_m, 0);                    \

-  out1_m = __msa_copy_u_w((v4i32)dst0_m, 1);                    \

-  out2_m = __msa_copy_u_w((v4i32)dst1_m, 0);                    \

-  out3_m = __msa_copy_u_w((v4i32)dst1_m, 1);                    \

-  SW4(out0_m, out1_m, out2_m, out3_m, pdst, stride);            \

+  ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride);           \

 /* Description : Pack even elements of input vectors & xor with 128

-   Arguments   : Inputs  - in0, in1

-                 Outputs - out_m

+   Arguments   : Inputs - in0, in1

+                 Output - out_m

                  Return Type - unsigned byte

    Details     : Signed byte even elements from 'in0' and 'in1' are packed

                  together in one vector and the resulting vector is xor'ed with

@@ -1943,8 +1842,8 @@

 /* Description : Converts inputs to unsigned bytes, interleave, average & store

                  as 8x4 unsigned byte block

-   Arguments   : Inputs  - in0, in1, in2, in3, dst0, dst1, dst2, dst3,

-                           pdst, stride

+   Arguments   : Inputs - in0, in1, in2, in3, dst0, dst1, dst2, dst3,

+                          pdst, stride

*/

 #define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3,                      \

                                 dst0, dst1, dst2, dst3, pdst, stride) {  \

@@ -1960,7 +1859,7 @@

 /* Description : Pack even byte elements and store byte vector in destination

                  memory

-   Arguments   : Inputs  - in0, in1, pdst

+   Arguments   : Inputs - in0, in1, pdst

*/

 #define PCKEV_ST_SB(in0, in1, pdst) {             \

   v16i8 tmp_m;                                    \

@@ -1970,7 +1869,7 @@

 /* Description : Horizontal 2 tap filter kernel code

-   Arguments   : Inputs  - in0, in1, mask, coeff, shift

+   Arguments   : Inputs - in0, in1, mask, coeff, shift

*/

 #define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) ({    \

   v16i8 tmp0_m;                                                \